From 322359ca8121ca8e52b1c85ecdda945047999c42 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> Date: Mon, 8 Mar 2021 12:18:37 -0600 Subject: [PATCH] kernel v5.12-rc2 rebase with: device-tree/etc BBDTBS: https://github.com/beagleboard/BeagleBoard-DeviceTrees/commit/eb25f2800649020dc1537f2e2f5c0f6e977344d6 TI_AMX3_CM3: http://git.ti.com/gitweb/?p=processor-firmware/ti-amx3-cm3-pm-firmware.git;a=commit;h=6a849767df85ce9399494f53fb5c753665396653 WPANUSB: https://github.com/statropy/wpanusb/commit/7ba5f3d24d95f804e80b2d8d28e35b34c15219c2 Signed-off-by: Robert Nelson <robertcnelson@gmail.com> --- patches/defconfig | 2 +- patches/ref_omap2plus_defconfig | 2 +- ...01-merge-CONFIG_PREEMPT_RT-Patch-Set.patch | 17876 ++++++---------- ...001-Add-BeagleBoard.org-DTBS-v5.12.x.patch | 4 +- ...sb-https-github.com-statropy-wpanusb.patch | 4 +- version.sh | 4 +- 6 files changed, 6826 insertions(+), 11066 deletions(-) diff --git a/patches/defconfig b/patches/defconfig index f736755a7..4c9d1003c 100644 --- a/patches/defconfig +++ b/patches/defconfig @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/arm 5.12.0-rc1 Kernel Configuration +# Linux/arm 5.12.0-rc2 Kernel Configuration # CONFIG_CC_VERSION_TEXT="arm-none-linux-gnueabihf-gcc (GNU Toolchain for the A-profile Architecture 10.2-2020.11 (arm-10.16)) 10.2.1 20201103" CONFIG_CC_IS_GCC=y diff --git a/patches/ref_omap2plus_defconfig b/patches/ref_omap2plus_defconfig index 3842da1ec..104770f4b 100644 --- a/patches/ref_omap2plus_defconfig +++ b/patches/ref_omap2plus_defconfig @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/arm 5.12.0-rc1 Kernel Configuration +# Linux/arm 5.12.0-rc2 Kernel Configuration # CONFIG_CC_VERSION_TEXT="arm-none-linux-gnueabihf-gcc (GNU Toolchain for the A-profile Architecture 10.2-2020.11 (arm-10.16)) 10.2.1 20201103" CONFIG_CC_IS_GCC=y diff --git a/patches/rt/0001-merge-CONFIG_PREEMPT_RT-Patch-Set.patch b/patches/rt/0001-merge-CONFIG_PREEMPT_RT-Patch-Set.patch index 1639484e6..12f6b9e69 100644 --- a/patches/rt/0001-merge-CONFIG_PREEMPT_RT-Patch-Set.patch +++ b/patches/rt/0001-merge-CONFIG_PREEMPT_RT-Patch-Set.patch @@ -1,35 +1,33 @@ -From fd74741cc96dd0ea170a8a18a2c8801da9e704b2 Mon Sep 17 00:00:00 2001 +From cff641c4fc9d0b3cd3d1a7041277527581de4807 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Fri, 23 Oct 2020 10:38:09 -0500 +Date: Thu, 4 Mar 2021 11:48:09 -0600 Subject: [PATCH] merge: CONFIG_PREEMPT_RT Patch Set -patch-5.9.1-rt18.patch.xz +patch-5.11.2-rt9.patch.xz Signed-off-by: Robert Nelson <robertcnelson@gmail.com> --- - Documentation/RCU/checklist.rst | 4 +- + .../Expedited-Grace-Periods.rst | 4 +- + .../RCU/Design/Requirements/Requirements.rst | 26 +- + Documentation/RCU/checklist.rst | 2 +- + Documentation/RCU/rcubarrier.rst | 6 +- Documentation/RCU/stallwarn.rst | 4 +- - Documentation/admin-guide/kdump/gdbmacros.txt | 159 +- - .../admin-guide/kdump/vmcoreinfo.rst | 131 +- - Documentation/locking/seqlock.rst | 18 + - MAINTAINERS | 1 + - arch/Kconfig | 8 + + Documentation/RCU/whatisRCU.rst | 10 +- + .../admin-guide/kernel-parameters.txt | 11 + + arch/Kconfig | 1 + arch/alpha/include/asm/spinlock_types.h | 4 - arch/arm/Kconfig | 5 +- arch/arm/include/asm/spinlock_types.h | 4 - - arch/arm/include/asm/switch_to.h | 8 + - arch/arm/include/asm/thread_info.h | 8 +- + arch/arm/include/asm/thread_info.h | 6 +- arch/arm/kernel/asm-offsets.c | 1 + arch/arm/kernel/entry-armv.S | 19 +- - arch/arm/kernel/entry-common.S | 9 +- 
arch/arm/kernel/signal.c | 3 +- arch/arm/kernel/smp.c | 2 - arch/arm/mm/fault.c | 6 + - arch/arm/mm/highmem.c | 55 +- arch/arm64/Kconfig | 3 + arch/arm64/include/asm/preempt.h | 28 +- arch/arm64/include/asm/spinlock_types.h | 4 - - arch/arm64/include/asm/thread_info.h | 6 +- + arch/arm64/include/asm/thread_info.h | 8 +- arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 13 +- arch/arm64/kernel/fpsimd.c | 14 +- @@ -37,54 +35,57 @@ Signed-off-by: Robert Nelson <robertcnelson@gmail.com> arch/arm64/kvm/arm.c | 6 +- arch/hexagon/include/asm/spinlock_types.h | 4 - arch/ia64/include/asm/spinlock_types.h | 4 - - arch/mips/Kconfig | 2 +- - arch/powerpc/Kconfig | 5 +- + arch/powerpc/Kconfig | 3 + + arch/powerpc/include/asm/cmpxchg.h | 2 +- + .../include/asm/simple_spinlock_types.h | 2 +- arch/powerpc/include/asm/spinlock_types.h | 4 - arch/powerpc/include/asm/stackprotector.h | 4 + - arch/powerpc/include/asm/thread_info.h | 16 +- + arch/powerpc/include/asm/thread_info.h | 15 +- arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/entry_32.S | 23 +- arch/powerpc/kernel/exceptions-64e.S | 16 +- arch/powerpc/kernel/irq.c | 2 + arch/powerpc/kernel/misc_32.S | 2 + arch/powerpc/kernel/misc_64.S | 2 + - arch/powerpc/kernel/syscall_64.c | 9 +- + arch/powerpc/kernel/nvram_64.c | 12 +- + arch/powerpc/kernel/syscall_64.c | 10 +- arch/powerpc/kernel/traps.c | 8 +- arch/powerpc/kernel/watchdog.c | 5 - arch/powerpc/kexec/crash.c | 3 - arch/powerpc/kvm/Kconfig | 1 + - arch/powerpc/platforms/pseries/iommu.c | 16 +- + arch/powerpc/mm/mem.c | 2 +- + arch/powerpc/platforms/powernv/opal-kmsg.c | 3 +- + arch/powerpc/platforms/pseries/iommu.c | 31 +- + arch/powerpc/xmon/xmon.c | 6 +- arch/s390/include/asm/spinlock_types.h | 4 - arch/sh/include/asm/spinlock_types.h | 4 - arch/sh/kernel/irq.c | 2 + arch/sparc/kernel/irq_64.c | 2 + + arch/um/kernel/kmsg_dump.c | 13 +- arch/x86/Kconfig | 2 + arch/x86/crypto/aesni-intel_glue.c | 22 +- arch/x86/crypto/cast5_avx_glue.c | 21 +- arch/x86/crypto/glue_helper.c | 26 +- - arch/x86/include/asm/fpu/api.h | 11 +- + arch/x86/include/asm/fpu/api.h | 1 + arch/x86/include/asm/preempt.h | 36 +- arch/x86/include/asm/signal.h | 13 + arch/x86/include/asm/stackprotector.h | 8 +- - arch/x86/include/asm/thread_info.h | 11 + + arch/x86/include/asm/thread_info.h | 7 + arch/x86/kernel/cpu/mshyperv.c | 3 +- arch/x86/kernel/fpu/core.c | 12 + arch/x86/kernel/irq_32.c | 2 + arch/x86/kernel/irq_64.c | 2 + - arch/x86/kernel/process_32.c | 32 + - arch/x86/kernel/tsc.c | 10 +- arch/x86/kvm/x86.c | 8 + - arch/x86/mm/highmem_32.c | 10 +- - arch/x86/mm/iomap_32.c | 15 +- arch/xtensa/include/asm/spinlock_types.h | 4 - - block/blk-mq.c | 8 +- + block/blk-mq.c | 115 +- crypto/cryptd.c | 19 +- - drivers/base/core.c | 46 +- - drivers/block/zram/zram_drv.c | 41 +- + drivers/atm/eni.c | 2 +- + drivers/block/zram/zram_drv.c | 36 + drivers/block/zram/zram_drv.h | 1 + drivers/char/random.c | 11 +- drivers/char/tpm/tpm-dev-common.c | 1 - drivers/char/tpm/tpm_tis.c | 29 +- + drivers/firewire/ohci.c | 4 +- drivers/firmware/efi/efi.c | 5 +- drivers/gpu/drm/i915/display/intel_sprite.c | 15 +- drivers/gpu/drm/i915/gt/intel_engine_pm.c | 8 +- @@ -92,10 +93,21 @@ Signed-off-by: Robert Nelson <robertcnelson@gmail.com> drivers/gpu/drm/i915/i915_trace.h | 6 +- drivers/gpu/drm/radeon/radeon_display.c | 2 + drivers/hv/hyperv_vmbus.h | 1 + - drivers/hv/vmbus_drv.c | 5 +- + drivers/hv/vmbus_drv.c | 10 +- drivers/leds/trigger/Kconfig | 1 + drivers/md/raid5.c | 7 +- drivers/md/raid5.h | 1 + + drivers/mtd/mtdoops.c | 
5 +- + drivers/net/ethernet/chelsio/cxgb/common.h | 6 +- + drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 54 +- + drivers/net/ethernet/chelsio/cxgb/sge.c | 53 +- + drivers/net/ethernet/chelsio/cxgb/sge.h | 3 +- + drivers/net/ethernet/chelsio/cxgb/subr.c | 64 +- + drivers/net/ethernet/dlink/sundance.c | 2 +- + drivers/net/ethernet/jme.c | 10 +- + drivers/net/ethernet/jme.h | 2 +- + drivers/net/wireless/ath/ath9k/beacon.c | 2 +- + drivers/pci/controller/pci-hyperv.c | 2 +- drivers/scsi/fcoe/fcoe.c | 16 +- drivers/scsi/fcoe/fcoe_ctlr.c | 4 +- drivers/scsi/libfc/fc_exch.c | 4 +- @@ -110,62 +122,56 @@ Signed-off-by: Robert Nelson <robertcnelson@gmail.com> fs/afs/dir_silly.c | 2 +- fs/cifs/readdir.c | 2 +- fs/dcache.c | 39 +- - fs/exec.c | 17 +- fs/fuse/readdir.c | 2 +- fs/inode.c | 2 +- - fs/io-wq.c | 52 +- fs/namei.c | 4 +- fs/namespace.c | 8 +- fs/nfs/dir.c | 4 +- fs/nfs/unlink.c | 4 +- - fs/proc/array.c | 4 +- fs/proc/base.c | 3 +- fs/proc/proc_sysctl.c | 2 +- + fs/pstore/platform.c | 5 +- include/asm-generic/preempt.h | 3 + + include/linux/blkdev.h | 2 +- include/linux/bottom_half.h | 8 +- - include/linux/console.h | 7 + - include/linux/cpuhotplug.h | 1 + - include/linux/cpumask.h | 6 + - include/linux/crash_core.h | 3 + + include/linux/console.h | 11 + include/linux/dcache.h | 4 +- include/linux/debug_locks.h | 3 +- - include/linux/delay.h | 6 + - include/linux/dev_printk.h | 8 + include/linux/entry-common.h | 2 +- include/linux/fs.h | 2 +- include/linux/hardirq.h | 3 +- - include/linux/highmem.h | 34 +- - include/linux/interrupt.h | 16 +- + include/linux/highmem-internal.h | 27 +- + include/linux/hrtimer.h | 6 + + include/linux/interrupt.h | 35 +- include/linux/irq_work.h | 6 + include/linux/irqdesc.h | 1 + include/linux/irqflags.h | 23 +- + include/linux/kcov.h | 1 + include/linux/kernel.h | 5 + - include/linux/local_lock_internal.h | 118 +- - include/linux/mhi.h | 3 +- + include/linux/kmsg_dump.h | 52 +- + include/linux/local_lock_internal.h | 126 +- include/linux/mm_types.h | 4 + include/linux/mutex.h | 34 +- - include/linux/mutex_rt.h | 131 + + include/linux/mutex_rt.h | 130 ++ include/linux/nfs_xdr.h | 2 +- + include/linux/notifier.h | 6 +- include/linux/pid.h | 1 + - include/linux/preempt.h | 160 +- - include/linux/printk.h | 26 +- + include/linux/preempt.h | 85 +- + include/linux/printk.h | 30 +- include/linux/random.h | 2 +- include/linux/rbtree.h | 27 +- - include/linux/rbtree_latch.h | 6 +- include/linux/rbtree_type.h | 31 + include/linux/rcupdate.h | 10 +- - include/linux/rtmutex.h | 47 +- - include/linux/rwlock_rt.h | 109 + + include/linux/rtmutex.h | 46 +- + include/linux/rwlock_rt.h | 109 ++ include/linux/rwlock_types.h | 4 + include/linux/rwlock_types_rt.h | 56 + - include/linux/rwsem-rt.h | 69 + + include/linux/rwsem-rt.h | 70 + include/linux/rwsem.h | 12 + - include/linux/sched.h | 117 +- - include/linux/sched/hotplug.h | 2 + + include/linux/sched.h | 106 +- include/linux/sched/mm.h | 11 + include/linux/sched/rt.h | 8 - include/linux/sched/wake_q.h | 13 +- - include/linux/seqlock.h | 368 ++- include/linux/serial_8250.h | 5 + include/linux/shmem_fs.h | 2 +- include/linux/signal.h | 1 + @@ -178,119 +184,125 @@ Signed-off-by: Robert Nelson <robertcnelson@gmail.com> include/linux/spinlock_types_nort.h | 39 + include/linux/spinlock_types_raw.h | 65 + include/linux/spinlock_types_rt.h | 38 + - include/linux/spinlock_types_up.h | 4 - - include/linux/stop_machine.h | 5 + + include/linux/spinlock_types_up.h | 2 +- include/linux/thread_info.h | 12 +- - include/linux/trace_events.h | 2 
+ + include/linux/trace_events.h | 76 +- include/linux/u64_stats_sync.h | 42 +- include/linux/vmstat.h | 4 + include/linux/wait.h | 1 + include/linux/ww_mutex.h | 8 + + include/linux/zpool.h | 3 + include/net/gen_stats.h | 11 +- include/net/net_seq_lock.h | 15 + include/net/sch_generic.h | 27 +- - include/trace/events/sched.h | 12 + - init/Kconfig | 8 +- + init/Kconfig | 5 +- kernel/Kconfig.locks | 2 +- - kernel/Kconfig.preempt | 6 + + kernel/Kconfig.preempt | 7 + kernel/cgroup/cpuset.c | 70 +- kernel/cgroup/rstat.c | 5 +- - kernel/cpu.c | 9 +- + kernel/debug/kdb/kdb_main.c | 10 +- kernel/entry/common.c | 12 +- kernel/exit.c | 2 +- kernel/fork.c | 27 +- kernel/futex.c | 88 +- kernel/irq/handle.c | 8 +- - kernel/irq/manage.c | 8 +- + kernel/irq/manage.c | 12 +- kernel/irq/spurious.c | 8 + - kernel/irq_work.c | 58 +- + kernel/irq_work.c | 69 +- kernel/kexec_core.c | 1 - kernel/ksysfs.c | 12 + + kernel/kthread.c | 16 +- kernel/locking/Makefile | 10 +- kernel/locking/lockdep.c | 2 + - kernel/locking/mutex-rt.c | 222 ++ - kernel/locking/rtmutex-debug.c | 102 - + kernel/locking/mutex-rt.c | 224 +++ + kernel/locking/rtmutex-debug.c | 102 -- kernel/locking/rtmutex-debug.h | 11 - - kernel/locking/rtmutex.c | 936 ++++++- + kernel/locking/rtmutex.c | 939 +++++++++-- kernel/locking/rtmutex.h | 7 - kernel/locking/rtmutex_common.h | 36 +- - kernel/locking/rwlock-rt.c | 334 +++ - kernel/locking/rwsem-rt.c | 292 +++ + kernel/locking/rwlock-rt.c | 334 ++++ + kernel/locking/rwsem-rt.c | 318 ++++ kernel/locking/rwsem.c | 6 + kernel/locking/spinlock.c | 7 + kernel/locking/spinlock_debug.c | 5 + - kernel/panic.c | 5 +- - kernel/printk/Makefile | 2 +- + kernel/notifier.c | 12 +- + kernel/panic.c | 33 +- + kernel/printk/Makefile | 1 - kernel/printk/internal.h | 74 - - kernel/printk/printk.c | 2266 +++++++++-------- - kernel/printk/printk_ringbuffer.c | 2086 +++++++++++++++ - kernel/printk/printk_ringbuffer.h | 382 +++ - kernel/printk/printk_safe.c | 414 --- - kernel/ptrace.c | 9 +- + kernel/printk/printk.c | 1459 +++++++++-------- + kernel/printk/printk_safe.c | 414 ----- + kernel/ptrace.c | 32 +- kernel/rcu/Kconfig | 4 +- kernel/rcu/rcutorture.c | 97 +- kernel/rcu/tree.c | 4 +- kernel/rcu/update.c | 4 +- - kernel/sched/core.c | 1083 ++++++-- - kernel/sched/cpudeadline.c | 4 +- - kernel/sched/cpupri.c | 4 +- - kernel/sched/deadline.c | 47 +- + kernel/sched/core.c | 226 ++- + kernel/sched/cputime.c | 4 +- kernel/sched/fair.c | 16 +- kernel/sched/features.h | 8 + - kernel/sched/rt.c | 81 +- - kernel/sched/sched.h | 69 +- + kernel/sched/sched.h | 10 + kernel/sched/swait.c | 1 + - kernel/sched/topology.c | 1 + - kernel/signal.c | 105 +- - kernel/softirq.c | 372 ++- - kernel/stop_machine.c | 23 +- - kernel/time/hrtimer.c | 32 +- - kernel/time/sched_clock.c | 6 +- + kernel/sched/topology.c | 3 +- + kernel/signal.c | 103 +- + kernel/smp.c | 14 + + kernel/softirq.c | 309 +++- + kernel/time/hrtimer.c | 30 + kernel/time/tick-sched.c | 2 +- - kernel/time/timekeeping.c | 10 +- - kernel/time/timer.c | 2 + - kernel/trace/trace.c | 53 +- - kernel/trace/trace.h | 2 + - kernel/trace/trace_events.c | 2 + + kernel/time/timer.c | 8 +- + kernel/trace/blktrace.c | 17 +- + kernel/trace/trace.c | 231 +-- + kernel/trace/trace.h | 57 +- + kernel/trace/trace_branch.c | 6 +- + kernel/trace/trace_event_perf.c | 5 +- + kernel/trace/trace_events.c | 20 +- + kernel/trace/trace_events_inject.c | 6 +- + kernel/trace/trace_functions.c | 28 +- + kernel/trace/trace_functions_graph.c | 32 +- + kernel/trace/trace_hwlat.c | 7 +- + 
kernel/trace/trace_irqsoff.c | 86 +- + kernel/trace/trace_kprobe.c | 10 +- + kernel/trace/trace_mmiotrace.c | 14 +- kernel/trace/trace_output.c | 19 +- - kernel/workqueue.c | 4 + + kernel/trace/trace_sched_wakeup.c | 71 +- + kernel/trace/trace_syscalls.c | 20 +- + kernel/trace/trace_uprobe.c | 4 +- lib/Kconfig.debug | 2 +- - lib/cpumask.c | 18 + + lib/bug.c | 1 + lib/debugobjects.c | 5 +- - lib/dump_stack.c | 2 + lib/irq_poll.c | 5 + - lib/locking-selftest.c | 50 + + lib/locking-selftest.c | 51 + lib/nmi_backtrace.c | 6 - lib/scatterlist.c | 2 +- - lib/smp_processor_id.c | 5 + mm/Kconfig | 2 +- - mm/highmem.c | 5 +- - mm/memcontrol.c | 64 +- - mm/page_alloc.c | 180 +- + mm/memcontrol.c | 66 +- + mm/page_alloc.c | 53 +- mm/shmem.c | 31 +- mm/slab.c | 90 +- mm/slab.h | 2 +- - mm/slub.c | 145 +- - mm/swap.c | 65 +- + mm/slub.c | 249 ++- mm/vmalloc.c | 13 +- mm/vmstat.c | 12 + mm/workingset.c | 5 +- + mm/z3fold.c | 1 + + mm/zbud.c | 1 + + mm/zpool.c | 13 + mm/zsmalloc.c | 85 +- - mm/zswap.c | 41 +- + mm/zswap.c | 51 +- net/Kconfig | 2 +- net/core/dev.c | 33 +- net/core/gen_estimator.c | 6 +- net/core/gen_stats.c | 12 +- + net/core/skbuff.c | 1 + net/core/sock.c | 6 +- net/ipv4/inet_hashtables.c | 19 +- net/ipv6/inet6_hashtables.c | 5 +- + net/mac80211/iface.c | 1 + + net/mac80211/rx.c | 1 + net/sched/sch_api.c | 2 +- net/sched/sch_generic.c | 10 + net/sunrpc/svc_xprt.c | 4 +- net/xfrm/xfrm_state.c | 9 +- - scripts/gdb/linux/dmesg.py | 147 +- - scripts/gdb/linux/utils.py | 7 + - 283 files changed, 10833 insertions(+), 3450 deletions(-) + 295 files changed, 6753 insertions(+), 3061 deletions(-) create mode 100644 include/linux/mutex_rt.h create mode 100644 include/linux/rbtree_type.h create mode 100644 include/linux/rwlock_rt.h @@ -305,25 +317,168 @@ Signed-off-by: Robert Nelson <robertcnelson@gmail.com> create mode 100644 kernel/locking/rwlock-rt.c create mode 100644 kernel/locking/rwsem-rt.c delete mode 100644 kernel/printk/internal.h - create mode 100644 kernel/printk/printk_ringbuffer.c - create mode 100644 kernel/printk/printk_ringbuffer.h delete mode 100644 kernel/printk/printk_safe.c +diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +index 72f0f6fbd53c..6f89cf1e567d 100644 +--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst ++++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +@@ -38,7 +38,7 @@ sections. + RCU-preempt Expedited Grace Periods + =================================== + +-``CONFIG_PREEMPT=y`` kernels implement RCU-preempt. ++``CONFIG_PREEMPTION=y`` kernels implement RCU-preempt. + The overall flow of the handling of a given CPU by an RCU-preempt + expedited grace period is shown in the following diagram: + +@@ -112,7 +112,7 @@ things. + RCU-sched Expedited Grace Periods + --------------------------------- + +-``CONFIG_PREEMPT=n`` kernels implement RCU-sched. The overall flow of ++``CONFIG_PREEMPTION=n`` kernels implement RCU-sched. 
The overall flow of + the handling of a given CPU by an RCU-sched expedited grace period is + shown in the following diagram: + +diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst +index d4c9a016074b..4bb0aa36d4de 100644 +--- a/Documentation/RCU/Design/Requirements/Requirements.rst ++++ b/Documentation/RCU/Design/Requirements/Requirements.rst +@@ -78,7 +78,7 @@ RCU treats a nested set as one big RCU read-side critical section. + Production-quality implementations of ``rcu_read_lock()`` and + ``rcu_read_unlock()`` are extremely lightweight, and in fact have + exactly zero overhead in Linux kernels built for production use with +-``CONFIG_PREEMPT=n``. ++``CONFIG_PREEMPTION=n``. + + This guarantee allows ordering to be enforced with extremely low + overhead to readers, for example: +@@ -1182,7 +1182,7 @@ and has become decreasingly so as memory sizes have expanded and memory + costs have plummeted. However, as I learned from Matt Mackall's + `bloatwatch <http://elinux.org/Linux_Tiny-FAQ>`__ efforts, memory + footprint is critically important on single-CPU systems with +-non-preemptible (``CONFIG_PREEMPT=n``) kernels, and thus `tiny ++non-preemptible (``CONFIG_PREEMPTION=n``) kernels, and thus `tiny + RCU <https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com>`__ + was born. Josh Triplett has since taken over the small-memory banner + with his `Linux kernel tinification <https://tiny.wiki.kernel.org/>`__ +@@ -1498,7 +1498,7 @@ limitations. + + Implementations of RCU for which ``rcu_read_lock()`` and + ``rcu_read_unlock()`` generate no code, such as Linux-kernel RCU when +-``CONFIG_PREEMPT=n``, can be nested arbitrarily deeply. After all, there ++``CONFIG_PREEMPTION=n``, can be nested arbitrarily deeply. After all, there + is no overhead. Except that if all these instances of + ``rcu_read_lock()`` and ``rcu_read_unlock()`` are visible to the + compiler, compilation will eventually fail due to exhausting memory, +@@ -1771,7 +1771,7 @@ implementation can be a no-op. + + However, once the scheduler has spawned its first kthread, this early + boot trick fails for ``synchronize_rcu()`` (as well as for +-``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPT=y`` kernels. The ++``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPTION=y`` kernels. The + reason is that an RCU read-side critical section might be preempted, + which means that a subsequent ``synchronize_rcu()`` really does have to + wait for something, as opposed to simply returning immediately. +@@ -2040,7 +2040,7 @@ the following: + 5 rcu_read_unlock(); + 6 do_something_with(v, user_v); + +-If the compiler did make this transformation in a ``CONFIG_PREEMPT=n`` kernel ++If the compiler did make this transformation in a ``CONFIG_PREEMPTION=n`` kernel + build, and if ``get_user()`` did page fault, the result would be a quiescent + state in the middle of an RCU read-side critical section. This misplaced + quiescent state could result in line 4 being a use-after-free access, +@@ -2319,10 +2319,10 @@ decides to throw at it. + + The Linux kernel is used for real-time workloads, especially in + conjunction with the `-rt +-patchset <https://rt.wiki.kernel.org/index.php/Main_Page>`__. The ++patchset <https://wiki.linuxfoundation.org/realtime/>`__. The + real-time-latency response requirements are such that the traditional + approach of disabling preemption across RCU read-side critical sections +-is inappropriate. 
Kernels built with ``CONFIG_PREEMPT=y`` therefore use ++is inappropriate. Kernels built with ``CONFIG_PREEMPTION=y`` therefore use + an RCU implementation that allows RCU read-side critical sections to be + preempted. This requirement made its presence known after users made it + clear that an earlier `real-time +@@ -2444,7 +2444,7 @@ includes ``rcu_read_lock_bh()``, ``rcu_read_unlock_bh()``, + ``call_rcu_bh()``, ``rcu_barrier_bh()``, and + ``rcu_read_lock_bh_held()``. However, the update-side APIs are now + simple wrappers for other RCU flavors, namely RCU-sched in +-CONFIG_PREEMPT=n kernels and RCU-preempt otherwise. ++CONFIG_PREEMPTION=n kernels and RCU-preempt otherwise. + + Sched Flavor (Historical) + ~~~~~~~~~~~~~~~~~~~~~~~~~ +@@ -2462,11 +2462,11 @@ not have this property, given that any point in the code outside of an + RCU read-side critical section can be a quiescent state. Therefore, + *RCU-sched* was created, which follows “classic†RCU in that an + RCU-sched grace period waits for pre-existing interrupt and NMI +-handlers. In kernels built with ``CONFIG_PREEMPT=n``, the RCU and ++handlers. In kernels built with ``CONFIG_PREEMPTION=n``, the RCU and + RCU-sched APIs have identical implementations, while kernels built with +-``CONFIG_PREEMPT=y`` provide a separate implementation for each. ++``CONFIG_PREEMPTION=y`` provide a separate implementation for each. + +-Note well that in ``CONFIG_PREEMPT=y`` kernels, ++Note well that in ``CONFIG_PREEMPTION=y`` kernels, + ``rcu_read_lock_sched()`` and ``rcu_read_unlock_sched()`` disable and + re-enable preemption, respectively. This means that if there was a + preemption attempt during the RCU-sched read-side critical section, +@@ -2629,10 +2629,10 @@ userspace execution also delimit tasks-RCU read-side critical sections. + + The tasks-RCU API is quite compact, consisting only of + ``call_rcu_tasks()``, ``synchronize_rcu_tasks()``, and +-``rcu_barrier_tasks()``. In ``CONFIG_PREEMPT=n`` kernels, trampolines ++``rcu_barrier_tasks()``. In ``CONFIG_PREEMPTION=n`` kernels, trampolines + cannot be preempted, so these APIs map to ``call_rcu()``, + ``synchronize_rcu()``, and ``rcu_barrier()``, respectively. In +-``CONFIG_PREEMPT=y`` kernels, trampolines can be preempted, and these ++``CONFIG_PREEMPTION=y`` kernels, trampolines can be preempted, and these + three APIs are therefore implemented by separate functions that check + for voluntary context switches. + diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst -index 2efed9926c3f..54a79d03438a 100644 +index bb7128eb322e..a56566549114 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst -@@ -214,8 +214,8 @@ over a rather long period of time, but improvements are always welcome! +@@ -214,7 +214,7 @@ over a rather long period of time, but improvements are always welcome! the rest of the system. 7. As of v4.20, a given kernel implements only one RCU flavor, - which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y. -- If the updater uses call_rcu() or synchronize_rcu(), -+ which is RCU-sched for PREEMPTION=n and RCU-preempt for -+ PREEMPTION=y. If the updater uses call_rcu() or synchronize_rcu(), ++ which is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y. 
+ If the updater uses call_rcu() or synchronize_rcu(), then the corresponding readers my use rcu_read_lock() and rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(), - or any pair of primitives that disables and re-enables preemption, +diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst +index f64f4413a47c..3b4a24877496 100644 +--- a/Documentation/RCU/rcubarrier.rst ++++ b/Documentation/RCU/rcubarrier.rst +@@ -9,7 +9,7 @@ RCU (read-copy update) is a synchronization mechanism that can be thought + of as a replacement for read-writer locking (among other things), but with + very low-overhead readers that are immune to deadlock, priority inversion, + and unbounded latency. RCU read-side critical sections are delimited +-by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT ++by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPTION + kernels, generate no code whatsoever. + + This means that RCU writers are unaware of the presence of concurrent +@@ -329,10 +329,10 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last + to smp_call_function() and further to smp_call_function_on_cpu(), + causing this latter to spin until the cross-CPU invocation of + rcu_barrier_func() has completed. This by itself would prevent +- a grace period from completing on non-CONFIG_PREEMPT kernels, ++ a grace period from completing on non-CONFIG_PREEMPTION kernels, + since each CPU must undergo a context switch (or other quiescent + state) before the grace period can complete. However, this is +- of no use in CONFIG_PREEMPT kernels. ++ of no use in CONFIG_PREEMPTION kernels. + + Therefore, on_each_cpu() disables preemption across its call + to smp_call_function() and also across the local call to diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index c9ab6af4d3be..e97d1b4876ef 100644 --- a/Documentation/RCU/stallwarn.rst @@ -346,414 +501,84 @@ index c9ab6af4d3be..e97d1b4876ef 100644 happen to preempt a low-priority task in the middle of an RCU read-side critical section. This is especially damaging if that low-priority task is not permitted to run on any other CPU, -diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt -index 220d0a80ca2c..82aecdcae8a6 100644 ---- a/Documentation/admin-guide/kdump/gdbmacros.txt -+++ b/Documentation/admin-guide/kdump/gdbmacros.txt -@@ -170,57 +170,103 @@ document trapinfo - address the kernel panicked. 
- end - --define dump_log_idx -- set $idx = $arg0 -- if ($argc > 1) -- set $prev_flags = $arg1 -+define dump_record -+ set var $desc = $arg0 -+ set var $info = $arg1 -+ if ($argc > 2) -+ set var $prev_flags = $arg2 - else -- set $prev_flags = 0 -+ set var $prev_flags = 0 - end -- set $msg = ((struct printk_log *) (log_buf + $idx)) -- set $prefix = 1 -- set $newline = 1 -- set $log = log_buf + $idx + sizeof(*$msg) -- -- # prev & LOG_CONT && !(msg->flags & LOG_PREIX) -- if (($prev_flags & 8) && !($msg->flags & 4)) -- set $prefix = 0 -+ -+ set var $prefix = 1 -+ set var $newline = 1 -+ -+ set var $begin = $desc->text_blk_lpos.begin % (1U << prb->text_data_ring.size_bits) -+ set var $next = $desc->text_blk_lpos.next % (1U << prb->text_data_ring.size_bits) -+ -+ # handle data-less record -+ if ($begin & 1) -+ set var $text_len = 0 -+ set var $log = "" -+ else -+ # handle wrapping data block -+ if ($begin > $next) -+ set var $begin = 0 -+ end -+ -+ # skip over descriptor id -+ set var $begin = $begin + sizeof(long) -+ -+ # handle truncated message -+ if ($next - $begin < $info->text_len) -+ set var $text_len = $next - $begin -+ else -+ set var $text_len = $info->text_len -+ end -+ -+ set var $log = &prb->text_data_ring.data[$begin] -+ end -+ -+ # prev & LOG_CONT && !(info->flags & LOG_PREIX) -+ if (($prev_flags & 8) && !($info->flags & 4)) -+ set var $prefix = 0 - end - -- # msg->flags & LOG_CONT -- if ($msg->flags & 8) -+ # info->flags & LOG_CONT -+ if ($info->flags & 8) - # (prev & LOG_CONT && !(prev & LOG_NEWLINE)) - if (($prev_flags & 8) && !($prev_flags & 2)) -- set $prefix = 0 -+ set var $prefix = 0 - end -- # (!(msg->flags & LOG_NEWLINE)) -- if (!($msg->flags & 2)) -- set $newline = 0 -+ # (!(info->flags & LOG_NEWLINE)) -+ if (!($info->flags & 2)) -+ set var $newline = 0 - end - end - - if ($prefix) -- printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000 -+ printf "[%5lu.%06lu] ", $info->ts_nsec / 1000000000, $info->ts_nsec % 1000000000 - end -- if ($msg->text_len != 0) -- eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len -+ if ($text_len) -+ eval "printf \"%%%d.%ds\", $log", $text_len, $text_len - end - if ($newline) - printf "\n" - end -- if ($msg->dict_len > 0) -- set $dict = $log + $msg->text_len -- set $idx = 0 -- set $line = 1 -- while ($idx < $msg->dict_len) -- if ($line) -- printf " " -- set $line = 0 -+ -+ # handle dictionary data -+ -+ set var $dict = &$info->dev_info.subsystem[0] -+ set var $dict_len = sizeof($info->dev_info.subsystem) -+ if ($dict[0] != '\0') -+ printf " SUBSYSTEM=" -+ set var $idx = 0 -+ while ($idx < $dict_len) -+ set var $c = $dict[$idx] -+ if ($c == '\0') -+ loop_break -+ else -+ if ($c < ' ' || $c >= 127 || $c == '\\') -+ printf "\\x%02x", $c -+ else -+ printf "%c", $c -+ end - end -- set $c = $dict[$idx] -+ set var $idx = $idx + 1 -+ end -+ printf "\n" -+ end -+ -+ set var $dict = &$info->dev_info.device[0] -+ set var $dict_len = sizeof($info->dev_info.device) -+ if ($dict[0] != '\0') -+ printf " DEVICE=" -+ set var $idx = 0 -+ while ($idx < $dict_len) -+ set var $c = $dict[$idx] - if ($c == '\0') -- printf "\n" -- set $line = 1 -+ loop_break - else - if ($c < ' ' || $c >= 127 || $c == '\\') - printf "\\x%02x", $c -@@ -228,33 +274,46 @@ define dump_log_idx - printf "%c", $c - end - end -- set $idx = $idx + 1 -+ set var $idx = $idx + 1 - end - printf "\n" - end - end --document dump_log_idx -- Dump a single log given its index in the log buffer. 
The first -- parameter is the index into log_buf, the second is optional and -- specified the previous log buffer's flags, used for properly -- formatting continued lines. -+document dump_record -+ Dump a single record. The first parameter is the descriptor, -+ the second parameter is the info, the third parameter is -+ optional and specifies the previous record's flags, used for -+ properly formatting continued lines. - end - - define dmesg -- set $i = log_first_idx -- set $end_idx = log_first_idx -- set $prev_flags = 0 -+ # definitions from kernel/printk/printk_ringbuffer.h -+ set var $desc_committed = 1 -+ set var $desc_finalized = 2 -+ set var $desc_sv_bits = sizeof(long) * 8 -+ set var $desc_flags_shift = $desc_sv_bits - 2 -+ set var $desc_flags_mask = 3 << $desc_flags_shift -+ set var $id_mask = ~$desc_flags_mask -+ -+ set var $desc_count = 1U << prb->desc_ring.count_bits -+ set var $prev_flags = 0 -+ -+ set var $id = prb->desc_ring.tail_id.counter -+ set var $end_id = prb->desc_ring.head_id.counter - - while (1) -- set $msg = ((struct printk_log *) (log_buf + $i)) -- if ($msg->len == 0) -- set $i = 0 -- else -- dump_log_idx $i $prev_flags -- set $i = $i + $msg->len -- set $prev_flags = $msg->flags -+ set var $desc = &prb->desc_ring.descs[$id % $desc_count] -+ set var $info = &prb->desc_ring.infos[$id % $desc_count] -+ -+ # skip non-committed record -+ set var $state = 3 & ($desc->state_var.counter >> $desc_flags_shift) -+ if ($state == $desc_committed || $state == $desc_finalized) -+ dump_record $desc $info $prev_flags -+ set var $prev_flags = $info->flags - end -- if ($i == $end_idx) -+ -+ set var $id = ($id + 1) & $id_mask -+ if ($id == $end_id) - loop_break - end - end -diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst -index 2baad0bfb09d..e44a6c01f336 100644 ---- a/Documentation/admin-guide/kdump/vmcoreinfo.rst -+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst -@@ -189,50 +189,123 @@ from this. - Free areas descriptor. User-space tools use this value to iterate the - free_area ranges. MAX_ORDER is used by the zone buddy allocator. - --log_first_idx -+prb -+--- -+ -+A pointer to the printk ringbuffer (struct printk_ringbuffer). This -+may be pointing to the static boot ringbuffer or the dynamically -+allocated ringbuffer, depending on when the the core dump occurred. -+Used by user-space tools to read the active kernel log buffer. -+ -+printk_rb_static -+---------------- -+ -+A pointer to the static boot printk ringbuffer. If @prb has a -+different value, this is useful for viewing the initial boot messages, -+which may have been overwritten in the dynamically allocated -+ringbuffer. -+ -+clear_seq -+--------- -+ -+The sequence number of the printk() record after the last clear -+command. It indicates the first record after the last -+SYSLOG_ACTION_CLEAR, like issued by 'dmesg -c'. Used by user-space -+tools to dump a subset of the dmesg log. -+ -+printk_ringbuffer -+----------------- -+ -+The size of a printk_ringbuffer structure. This structure contains all -+information required for accessing the various components of the -+kernel log buffer. -+ -+(printk_ringbuffer, desc_ring|text_data_ring|dict_data_ring|fail) -+----------------------------------------------------------------- -+ -+Offsets for the various components of the printk ringbuffer. Used by -+user-space tools to view the kernel log buffer without requiring the -+declaration of the structure. 
-+ -+prb_desc_ring - ------------- - --Index of the first record stored in the buffer log_buf. Used by --user-space tools to read the strings in the log_buf. -+The size of the prb_desc_ring structure. This structure contains -+information about the set of record descriptors. - --log_buf --------- -+(prb_desc_ring, count_bits|descs|head_id|tail_id) -+------------------------------------------------- -+ -+Offsets for the fields describing the set of record descriptors. Used -+by user-space tools to be able to traverse the descriptors without -+requiring the declaration of the structure. -+ -+prb_desc -+-------- -+ -+The size of the prb_desc structure. This structure contains -+information about a single record descriptor. -+ -+(prb_desc, info|state_var|text_blk_lpos|dict_blk_lpos) -+------------------------------------------------------ -+ -+Offsets for the fields describing a record descriptors. Used by -+user-space tools to be able to read descriptors without requiring -+the declaration of the structure. -+ -+prb_data_blk_lpos -+----------------- -+ -+The size of the prb_data_blk_lpos structure. This structure contains -+information about where the text or dictionary data (data block) is -+located within the respective data ring. -+ -+(prb_data_blk_lpos, begin|next) -+------------------------------- - --Console output is written to the ring buffer log_buf at index --log_first_idx. Used to get the kernel log. -+Offsets for the fields describing the location of a data block. Used -+by user-space tools to be able to locate data blocks without -+requiring the declaration of the structure. - --log_buf_len -+printk_info - ----------- - --log_buf's length. -+The size of the printk_info structure. This structure contains all -+the meta-data for a record. - --clear_idx ----------- -+(printk_info, seq|ts_nsec|text_len|dict_len|caller_id) -+------------------------------------------------------ - --The index that the next printk() record to read after the last clear --command. It indicates the first record after the last SYSLOG_ACTION --_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump --the dmesg log. -+Offsets for the fields providing the meta-data for a record. Used by -+user-space tools to be able to read the information without requiring -+the declaration of the structure. - --log_next_idx -------------- -+prb_data_ring -+------------- - --The index of the next record to store in the buffer log_buf. Used to --compute the index of the current buffer position. -+The size of the prb_data_ring structure. This structure contains -+information about a set of data blocks. - --printk_log ------------ -+(prb_data_ring, size_bits|data|head_lpos|tail_lpos) -+--------------------------------------------------- - --The size of a structure printk_log. Used to compute the size of --messages, and extract dmesg log. It encapsulates header information for --log_buf, such as timestamp, syslog level, etc. -+Offsets for the fields describing a set of data blocks. Used by -+user-space tools to be able to access the data blocks without -+requiring the declaration of the structure. - --(printk_log, ts_nsec|len|text_len|dict_len) --------------------------------------------- -+atomic_long_t -+------------- -+ -+The size of the atomic_long_t structure. Used by user-space tools to -+be able to copy the full structure, regardless of its -+architecture-specific implementation. -+ -+(atomic_long_t, counter) -+------------------------ - --It represents field offsets in struct printk_log. 
User space tools --parse it and check whether the values of printk_log's members have been --changed. -+Offset for the long value of an atomic_long_t variable. Used by -+user-space tools to access the long value without requiring the -+architecture-specific declaration. - - (free_area.free_list, MIGRATE_TYPES) - ------------------------------------ -diff --git a/Documentation/locking/seqlock.rst b/Documentation/locking/seqlock.rst -index 62c5ad98c11c..a334b584f2b3 100644 ---- a/Documentation/locking/seqlock.rst -+++ b/Documentation/locking/seqlock.rst -@@ -139,6 +139,24 @@ with the associated LOCKTYPE lock acquired. - - Read path: same as in :ref:`seqcount_t`. - -+ -+.. _seqcount_latch_t: -+ -+Latch sequence counters (``seqcount_latch_t``) -+---------------------------------------------- -+ -+Latch sequence counters are a multiversion concurrency control mechanism -+where the embedded seqcount_t counter even/odd value is used to switch -+between two copies of protected data. This allows the sequence counter -+read path to safely interrupt its own write side critical section. -+ -+Use seqcount_latch_t when the write side sections cannot be protected -+from interruption by readers. This is typically the case when the read -+side can be invoked from NMI handlers. -+ -+Check `raw_write_seqcount_latch()` for more information. -+ -+ - .. _seqlock_t: - - Sequential locks (``seqlock_t``) -diff --git a/MAINTAINERS b/MAINTAINERS -index 867157311dc8..7ae63272d994 100644 ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -13960,6 +13960,7 @@ PRINTK - M: Petr Mladek <pmladek@suse.com> - M: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> - R: Steven Rostedt <rostedt@goodmis.org> -+R: John Ogness <john.ogness@linutronix.de> - S: Maintained - F: include/linux/printk.h - F: kernel/printk/ +diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst +index 1a4723f48bd9..17e95ab2a201 100644 +--- a/Documentation/RCU/whatisRCU.rst ++++ b/Documentation/RCU/whatisRCU.rst +@@ -683,7 +683,7 @@ Quick Quiz #1: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + This section presents a "toy" RCU implementation that is based on + "classic RCU". It is also short on performance (but only for updates) and +-on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT ++on features such as hotplug CPU and the ability to run in CONFIG_PREEMPTION + kernels. The definitions of rcu_dereference() and rcu_assign_pointer() + are the same as those shown in the preceding section, so they are omitted. + :: +@@ -739,7 +739,7 @@ Quick Quiz #2: + Quick Quiz #3: + If it is illegal to block in an RCU read-side + critical section, what the heck do you do in +- PREEMPT_RT, where normal spinlocks can block??? ++ CONFIG_PREEMPT_RT, where normal spinlocks can block??? + + :ref:`Answers to Quick Quiz <8_whatisRCU>` + +@@ -1093,7 +1093,7 @@ Quick Quiz #2: + overhead is **negative**. + + Answer: +- Imagine a single-CPU system with a non-CONFIG_PREEMPT ++ Imagine a single-CPU system with a non-CONFIG_PREEMPTION + kernel where a routing table is used by process-context + code, but can be updated by irq-context code (for example, + by an "ICMP REDIRECT" packet). The usual way of handling +@@ -1120,10 +1120,10 @@ Answer: + Quick Quiz #3: + If it is illegal to block in an RCU read-side + critical section, what the heck do you do in +- PREEMPT_RT, where normal spinlocks can block??? ++ CONFIG_PREEMPT_RT, where normal spinlocks can block??? 
+ + Answer: +- Just as PREEMPT_RT permits preemption of spinlock ++ Just as CONFIG_PREEMPT_RT permits preemption of spinlock + critical sections, it permits preemption of RCU + read-side critical sections. It also permits + spinlocks blocking while in RCU read-side critical +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index a10b545c2070..9503320c6652 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4092,6 +4092,10 @@ + value, meaning that RCU_SOFTIRQ is used by default. + Specify rcutree.use_softirq=0 to use rcuc kthreads. + ++ But note that CONFIG_PREEMPT_RT=y kernels disable ++ this kernel boot parameter, forcibly setting it ++ to zero. ++ + rcutree.rcu_fanout_exact= [KNL] + Disable autobalancing of the rcu_node combining + tree. This is used by rcutorture, and might +@@ -4470,6 +4474,13 @@ + only normal grace-period primitives. No effect + on CONFIG_TINY_RCU kernels. + ++ But note that CONFIG_PREEMPT_RT=y kernels enables ++ this kernel boot parameter, forcibly setting ++ it to the value one, that is, converting any ++ post-boot attempt at an expedited RCU grace ++ period to instead use normal non-expedited ++ grace-period processing. ++ + rcupdate.rcu_task_ipi_delay= [KNL] + Set time in jiffies during which RCU tasks will + avoid sending IPIs, starting with the beginning diff --git a/arch/Kconfig b/arch/Kconfig -index af14a567b493..5c8e173dc7c2 100644 +index 24862d15f3a3..90c8f8518bb4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig -@@ -34,6 +34,7 @@ config OPROFILE +@@ -37,6 +37,7 @@ config OPROFILE tristate "OProfile system profiling" depends on PROFILING depends on HAVE_OPROFILE @@ -761,20 +586,6 @@ index af14a567b493..5c8e173dc7c2 100644 select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help -@@ -414,6 +415,13 @@ config MMU_GATHER_NO_GATHER - bool - depends on MMU_GATHER_TABLE_FREE - -+config ARCH_WANT_IRQS_OFF_ACTIVATE_MM -+ bool -+ help -+ Temporary select until all architectures can be converted to have -+ irqs disabled over activate_mm. Architectures that do IPI based TLB -+ shootdowns should enable this. 
-+ - config ARCH_HAVE_NMI_SAFE_CMPXCHG - bool - diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h index 1d5716bc060b..6883bc952d22 100644 --- a/arch/alpha/include/asm/spinlock_types.h @@ -791,10 +602,10 @@ index 1d5716bc060b..6883bc952d22 100644 volatile unsigned int lock; } arch_spinlock_t; diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig -index e00d94b16658..b7c20565754d 100644 +index 138248999df7..321b8979222d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig -@@ -31,6 +31,7 @@ config ARM +@@ -30,6 +30,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW @@ -802,16 +613,16 @@ index e00d94b16658..b7c20565754d 100644 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU -@@ -64,7 +65,7 @@ config ARM +@@ -66,7 +67,7 @@ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL select HAVE_ARCH_MMAP_RND_BITS if MMU - select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT -@@ -102,6 +103,7 @@ config ARM +@@ -107,6 +108,7 @@ config ARM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP @@ -819,14 +630,14 @@ index e00d94b16658..b7c20565754d 100644 select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ -@@ -117,6 +119,7 @@ config ARM +@@ -122,6 +124,7 @@ config ARM select OLD_SIGSUSPEND3 select PCI_SYSCALL if PCI select PERF_USE_VMALLOC + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select RTC_LIB + select SET_FS select SYS_SUPPORTS_APM_EMULATION - # Above selects are sorted alphabetically; please add new ones diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h index 5976958647fe..a37c0803954b 100644 --- a/arch/arm/include/asm/spinlock_types.h @@ -842,37 +653,11 @@ index 5976958647fe..a37c0803954b 100644 #define TICKET_SHIFT 16 typedef struct { -diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h -index 007d8fea7157..285e6248454f 100644 ---- a/arch/arm/include/asm/switch_to.h -+++ b/arch/arm/include/asm/switch_to.h -@@ -4,6 +4,13 @@ - - #include <linux/thread_info.h> - -+#if defined CONFIG_PREEMPT_RT && defined CONFIG_HIGHMEM -+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p); -+#else -+static inline void -+switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } -+#endif -+ - /* - * For v7 SMP cores running a preemptible kernel we may be pre-empted - * during a TLB maintenance operation, so execute an inner-shareable dsb -@@ -26,6 +33,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info - #define switch_to(prev,next,last) \ - do { \ - __complete_pending_tlbi(); \ -+ switch_kmaps(prev, next); \ - last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \ - } while (0) - diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h -index 536b6b979f63..875aaf9af946 100644 +index 70d4cbc49ae1..b86418b4dfef 100644 --- a/arch/arm/include/asm/thread_info.h +++ 
b/arch/arm/include/asm/thread_info.h -@@ -46,6 +46,7 @@ struct cpu_context_save { +@@ -54,6 +54,7 @@ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ @@ -880,39 +665,37 @@ index 536b6b979f63..875aaf9af946 100644 mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ __u32 cpu; /* cpu */ -@@ -134,7 +135,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - #define TIF_SYSCALL_TRACE 4 /* syscall trace active */ - #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ +@@ -146,6 +147,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ --#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ -+#define TIF_NEED_RESCHED_LAZY 7 -+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ + #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ + #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 9 #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ -@@ -143,6 +145,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) - #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) - #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +@@ -160,6 +162,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) + #define _TIF_SECCOMP (1 << TIF_SECCOMP) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) - #define _TIF_UPROBE (1 << TIF_UPROBE) - #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) - #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -@@ -158,7 +161,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) + + /* Checks for any syscall work in entry-common.S */ +@@ -169,7 +172,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + /* * Change these and you break ASM code in entry-common.S */ - #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ -- _TIF_NOTIFY_RESUME | _TIF_UPROBE) -+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ -+ _TIF_NEED_RESCHED_LAZY) +-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ ++#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NOTIFY_SIGNAL) - #endif /* __KERNEL__ */ - #endif /* __ASM_ARM_THREAD_INFO_H */ diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c -index a1570c8bab25..88a0eb048ca1 100644 +index be8050b0c3df..884e40a525ce 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c -@@ -41,6 +41,7 @@ int main(void) +@@ -42,6 +42,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); @@ -921,7 +704,7 @@ index a1570c8bab25..88a0eb048ca1 100644 DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S -index 55a47df04773..1e689a727cb9 100644 +index 0ea8529a4872..fa0d155d21b3 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -206,11 +206,18 @@ __irq_svc: @@ -961,36 +744,8 @@ index 55a47df04773..1e689a727cb9 100644 #endif 
__und_fault: -diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S -index 271cb8a1eba1..fd039b1b3731 100644 ---- a/arch/arm/kernel/entry-common.S -+++ b/arch/arm/kernel/entry-common.S -@@ -53,7 +53,9 @@ __ret_fast_syscall: - cmp r2, #TASK_SIZE - blne addr_limit_check_failed - ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing -- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK -+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) -+ bne fast_work_pending -+ tst r1, #_TIF_SECCOMP - bne fast_work_pending - - -@@ -90,8 +92,11 @@ __ret_fast_syscall: - cmp r2, #TASK_SIZE - blne addr_limit_check_failed - ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing -- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK -+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) -+ bne do_slower_path -+ tst r1, #_TIF_SECCOMP - beq no_work_pending -+do_slower_path: - UNWIND(.fnend ) - ENDPROC(ret_fast_syscall) - diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c -index c1892f733f20..fb8c02954901 100644 +index a3a38d0a4c85..f04ccf19ab1f 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -649,7 +649,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) @@ -1004,17 +759,15 @@ index c1892f733f20..fb8c02954901 100644 } else { if (unlikely(!user_mode(regs))) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c -index 5d9da61eff62..7f83a013dccc 100644 +index 5c48eb4fd0e5..77a720c1f402 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c -@@ -680,11 +680,9 @@ void handle_IPI(int ipinr, struct pt_regs *regs) +@@ -671,9 +671,7 @@ static void do_handle_IPI(int ipinr) break; case IPI_CPU_BACKTRACE: - printk_nmi_enter(); - irq_enter(); - nmi_cpu_backtrace(regs); - irq_exit(); + nmi_cpu_backtrace(get_irq_regs()); - printk_nmi_exit(); break; @@ -1043,134 +796,11 @@ index efa402025031..59487ee9fd61 100644 do_bad_area(addr, fsr, regs); return 0; } -diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c -index 187fab227b50..954a115f6205 100644 ---- a/arch/arm/mm/highmem.c -+++ b/arch/arm/mm/highmem.c -@@ -31,8 +31,14 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) - return *ptep; - } - -+static unsigned int fixmap_idx(int type) -+{ -+ return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); -+} -+ - void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) - { -+ pte_t pte = mk_pte(page, kmap_prot); - unsigned int idx; - unsigned long vaddr; - void *kmap; -@@ -53,7 +59,7 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) - - type = kmap_atomic_idx_push(); - -- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); -+ idx = fixmap_idx(type); - vaddr = __fix_to_virt(idx); - #ifdef CONFIG_DEBUG_HIGHMEM - /* -@@ -61,13 +67,16 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) - * Make sure it was indeed properly unmapped. - */ - BUG_ON(!pte_none(get_fixmap_pte(vaddr))); -+#endif -+#ifdef CONFIG_PREEMPT_RT -+ current->kmap_pte[type] = pte; - #endif - /* - * When debugging is off, kunmap_atomic leaves the previous mapping - * in place, so the contained TLB flush ensures the TLB is updated - * with the new mapping. 
- */ -- set_fixmap_pte(idx, mk_pte(page, prot)); -+ set_fixmap_pte(idx, pte); - - return (void *)vaddr; - } -@@ -80,16 +89,19 @@ void kunmap_atomic_high(void *kvaddr) - - if (kvaddr >= (void *)FIXADDR_START) { - type = kmap_atomic_idx(); -- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); -+ idx = fixmap_idx(type); - - if (cache_is_vivt()) - __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); -+#ifdef CONFIG_PREEMPT_RT -+ current->kmap_pte[type] = __pte(0); -+#endif - #ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(vaddr != __fix_to_virt(idx)); -- set_fixmap_pte(idx, __pte(0)); - #else - (void) idx; /* to kill a warning */ - #endif -+ set_fixmap_pte(idx, __pte(0)); - kmap_atomic_idx_pop(); - } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { - /* this address was obtained through kmap_high_get() */ -@@ -100,22 +112,51 @@ EXPORT_SYMBOL(kunmap_atomic_high); - - void *kmap_atomic_pfn(unsigned long pfn) - { -+ pte_t pte = pfn_pte(pfn, kmap_prot); - unsigned long vaddr; - int idx, type; - struct page *page = pfn_to_page(pfn); - -- preempt_disable(); -+ migrate_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - type = kmap_atomic_idx_push(); -- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); -+ idx = fixmap_idx(type); - vaddr = __fix_to_virt(idx); - #ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(get_fixmap_pte(vaddr))); - #endif -- set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot)); -+#ifdef CONFIG_PREEMPT_RT -+ current->kmap_pte[type] = pte; -+#endif -+ set_fixmap_pte(idx, pte); - - return (void *)vaddr; - } -+ -+#if defined CONFIG_PREEMPT_RT -+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) -+{ -+ int i; -+ -+ /* -+ * Clear @prev's kmap_atomic mappings -+ */ -+ for (i = 0; i < prev_p->kmap_idx; i++) { -+ int idx = fixmap_idx(i); -+ -+ set_fixmap_pte(idx, __pte(0)); -+ } -+ /* -+ * Restore @next_p's kmap_atomic mappings -+ */ -+ for (i = 0; i < next_p->kmap_idx; i++) { -+ int idx = fixmap_idx(i); -+ -+ if (!pte_none(next_p->kmap_pte[i])) -+ set_fixmap_pte(idx, next_p->kmap_pte[i]); -+ } -+} -+#endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 6d232837cbee..4da217d5b84c 100644 +index f39568b28ec1..39bcde5ff5ec 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig -@@ -75,6 +75,7 @@ config ARM64 +@@ -76,6 +76,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG) select ARCH_SUPPORTS_NUMA_BALANCING @@ -1178,15 +808,15 @@ index 6d232837cbee..4da217d5b84c 100644 select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT -@@ -169,6 +170,7 @@ config ARM64 - select HAVE_PERF_EVENTS +@@ -177,6 +178,7 @@ config ARM64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP -+ select HAVE_PREEMPT_LAZY select HAVE_REGS_AND_STACK_ACCESS_API ++ select HAVE_PREEMPT_LAZY select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUTEX_CMPXCHG if FUTEX -@@ -190,6 +192,7 @@ config ARM64 + select MMU_GATHER_RCU_TABLE_FREE +@@ -197,6 +199,7 @@ config ARM64 select PCI_DOMAINS_GENERIC if PCI select PCI_ECAM if (ACPI && PCI) select PCI_SYSCALL if PCI @@ -1259,10 +889,10 @@ index 18782f0c4721..6672b05350b4 100644 #include <asm-generic/qrwlock_types.h> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h -index 5e784e16ee89..5d9f16f92c11 100644 +index 9f4e3b266f21..d3fa570c7235 100644 --- a/arch/arm64/include/asm/thread_info.h 
+++ b/arch/arm64/include/asm/thread_info.h -@@ -29,6 +29,7 @@ struct thread_info { +@@ -26,6 +26,7 @@ struct thread_info { #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ #endif @@ -1270,34 +900,37 @@ index 5e784e16ee89..5d9f16f92c11 100644 union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { -@@ -67,6 +68,7 @@ void arch_release_task_struct(struct task_struct *tsk); - #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ +@@ -65,6 +66,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ - #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ -+#define TIF_NEED_RESCHED_LAZY 6 + #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ + #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 7 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ -@@ -93,14 +95,16 @@ void arch_release_task_struct(struct task_struct *tsk); - #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) - #define _TIF_UPROBE (1 << TIF_UPROBE) - #define _TIF_FSCHECK (1 << TIF_FSCHECK) -+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) - #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) - #define _TIF_32BIT (1 << TIF_32BIT) +@@ -95,8 +97,10 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SVE (1 << TIF_SVE) + #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) - #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ ++#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ -- _TIF_UPROBE | _TIF_FSCHECK) -+ _TIF_UPROBE | _TIF_FSCHECK | _TIF_NEED_RESCHED_LAZY) - -+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) - #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ + _TIF_NOTIFY_SIGNAL) +@@ -105,6 +109,8 @@ void arch_release_task_struct(struct task_struct *tsk); _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) + ++#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) ++ + #ifdef CONFIG_SHADOW_CALL_STACK + #define INIT_SCS \ + .scs_base = init_shadow_call_stack, \ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c -index 7d32fc959b1a..b2f29bd2ae87 100644 +index 301784463587..993589687994 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -30,6 +30,7 @@ int main(void) @@ -1305,14 +938,14 @@ index 7d32fc959b1a..b2f29bd2ae87 100644 DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); - DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); + #endif diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S -index 55af8b504b65..de7cd345c633 100644 +index c9bae73f2621..1d3b9ceefb15 100644 --- a/arch/arm64/kernel/entry.S +++ 
b/arch/arm64/kernel/entry.S -@@ -624,9 +624,18 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING +@@ -678,9 +678,18 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING mrs x0, daif orr x24, x24, x0 alternative_else_nop_endif @@ -1332,12 +965,12 @@ index 55af8b504b65..de7cd345c633 100644 +2: #endif - #ifdef CONFIG_ARM64_PSEUDO_NMI + mov x0, sp diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c -index 55c8f3ec6705..638a41b27feb 100644 +index 062b21f30f94..0ea2df6554e5 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c -@@ -224,6 +224,16 @@ static void sve_free(struct task_struct *task) +@@ -226,6 +226,16 @@ static void sve_free(struct task_struct *task) __sve_free(task); } @@ -1354,7 +987,7 @@ index 55c8f3ec6705..638a41b27feb 100644 /* * TIF_SVE controls whether a task can use SVE without trapping while * in userspace, and also the way a task's FPSIMD/SVE state is stored -@@ -1020,6 +1030,7 @@ void fpsimd_thread_switch(struct task_struct *next) +@@ -1022,6 +1032,7 @@ void fpsimd_thread_switch(struct task_struct *next) void fpsimd_flush_thread(void) { int vl, supported_vl; @@ -1362,7 +995,7 @@ index 55c8f3ec6705..638a41b27feb 100644 if (!system_supports_fpsimd()) return; -@@ -1032,7 +1043,7 @@ void fpsimd_flush_thread(void) +@@ -1034,7 +1045,7 @@ void fpsimd_flush_thread(void) if (system_supports_sve()) { clear_thread_flag(TIF_SVE); @@ -1371,7 +1004,7 @@ index 55c8f3ec6705..638a41b27feb 100644 /* * Reset the task vector length as required. -@@ -1066,6 +1077,7 @@ void fpsimd_flush_thread(void) +@@ -1068,6 +1079,7 @@ void fpsimd_flush_thread(void) } put_cpu_fpsimd_context(); @@ -1380,23 +1013,23 @@ index 55c8f3ec6705..638a41b27feb 100644 /* diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c -index 3b4f31f35e45..441a970fc7ce 100644 +index 6237486ff6bb..ab411f336c39 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c -@@ -921,7 +921,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, - /* Check valid user FS if needed */ - addr_limit_user_check(); - +@@ -915,7 +915,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, + unsigned long thread_flags) + { + do { - if (thread_flags & _TIF_NEED_RESCHED) { + if (thread_flags & _TIF_NEED_RESCHED_MASK) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c -index b588c3b5c2f0..19cb5b101226 100644 +index fe60d25c000e..c8ad6b98fbab 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c -@@ -681,7 +681,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +@@ -732,7 +732,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * involves poking the GIC, which must be done in a * non-preemptible context. 
*/ @@ -1405,7 +1038,7 @@ index b588c3b5c2f0..19cb5b101226 100644 kvm_pmu_flush_hwstate(vcpu); -@@ -730,7 +730,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +@@ -781,7 +781,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); @@ -1414,15 +1047,15 @@ index b588c3b5c2f0..19cb5b101226 100644 continue; } -@@ -802,7 +802,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +@@ -853,7 +853,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, ret); - preempt_enable(); + migrate_enable(); - ret = handle_exit(vcpu, ret); - } + /* + * The ARMv8 architecture doesn't give the hypervisor diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h index 19d233497ba5..de72fb23016d 100644 --- a/arch/hexagon/include/asm/spinlock_types.h @@ -1453,32 +1086,19 @@ index 6e345fefcdca..681408d6816f 100644 typedef struct { volatile unsigned int lock; } arch_spinlock_t; -diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig -index 8f328298f8cc..7624e089b736 100644 ---- a/arch/mips/Kconfig -+++ b/arch/mips/Kconfig -@@ -2653,7 +2653,7 @@ config MIPS_CRC_SUPPORT - # - config HIGHMEM - bool "High Memory Support" -- depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA -+ depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT - - config CPU_SUPPORTS_HIGHMEM - bool diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig -index 787e829b6f25..7c4e90562f90 100644 +index 107bb4319e0e..876c0f683e2b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig -@@ -143,6 +143,7 @@ config PPC - select ARCH_MIGHT_HAVE_PC_SERIO +@@ -147,6 +147,7 @@ config PPC select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_SUPPORTS_ATOMIC_RMW -+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC32 || PPC_BOOK3S_64 ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS -@@ -225,6 +226,7 @@ config PPC +@@ -233,6 +234,7 @@ config PPC select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP @@ -1486,22 +1106,39 @@ index 787e829b6f25..7c4e90562f90 100644 select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API -@@ -246,6 +248,7 @@ config PPC - select OLD_SIGSUSPEND - select PCI_DOMAINS if PCI - select PCI_SYSCALL if PCI -+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM - select PPC_DAWR if PPC64 - select RTC_LIB - select SPARSE_IRQ -@@ -403,7 +406,7 @@ menu "Kernel options" - - config HIGHMEM - bool "High memory support" -- depends on PPC32 -+ depends on PPC32 && !PREEMPT_RT +@@ -240,6 +242,7 @@ config PPC + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING + select HAVE_IRQ_TIME_ACCOUNTING ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select HAVE_RSEQ + select IOMMU_HELPER if PPC64 + select IRQ_DOMAIN +diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h +index cf091c4c22e5..7371f7e23c35 100644 +--- a/arch/powerpc/include/asm/cmpxchg.h ++++ b/arch/powerpc/include/asm/cmpxchg.h +@@ -5,7 +5,7 @@ + #ifdef __KERNEL__ + #include <linux/compiler.h> + 
#include <asm/synch.h> +-#include <linux/bug.h> ++#include <linux/bits.h> + + #ifdef __BIG_ENDIAN + #define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE) +diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h +index 0f3cdd8faa95..d45561e9e6ba 100644 +--- a/arch/powerpc/include/asm/simple_spinlock_types.h ++++ b/arch/powerpc/include/asm/simple_spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H + #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H - source "kernel/Kconfig.hz" +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__LINUX_RT_MUTEX_H) + # error "please don't include this file directly" + #endif diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index c5d742f18021..cc6922a011ba 100644 @@ -1535,7 +1172,7 @@ index 1c8460e23583..b1653c160bab 100644 canary ^= LINUX_VERSION_CODE; canary &= CANARY_MASK; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h -index ca6c97025704..3eda5d38c418 100644 +index 3d8a47af7a25..03442b9afcfb 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -48,6 +48,8 @@ @@ -1547,7 +1184,7 @@ index ca6c97025704..3eda5d38c418 100644 unsigned long local_flags; /* private flags for thread */ #ifdef CONFIG_LIVEPATCH unsigned long *livepatch_sp; -@@ -98,11 +100,12 @@ void arch_setup_new_exec(void); +@@ -96,11 +98,12 @@ void arch_setup_new_exec(void); #define TIF_SINGLESTEP 8 /* singlestepping active */ #define TIF_NOHZ 9 /* in adaptive nohz mode */ #define TIF_SECCOMP 10 /* secure computing */ @@ -1563,7 +1200,7 @@ index ca6c97025704..3eda5d38c418 100644 #define TIF_EMULATE_STACK_STORE 16 /* Is an instruction emulation for stack store? 
*/ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ -@@ -111,6 +114,9 @@ void arch_setup_new_exec(void); +@@ -109,6 +112,9 @@ void arch_setup_new_exec(void); #endif #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_32BIT 20 /* 32 bit binary */ @@ -1573,30 +1210,31 @@ index ca6c97025704..3eda5d38c418 100644 /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) -@@ -130,6 +136,7 @@ void arch_setup_new_exec(void); +@@ -129,16 +135,19 @@ void arch_setup_new_exec(void); #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE) #define _TIF_NOHZ (1<<TIF_NOHZ) +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) - #define _TIF_FSCHECK (1<<TIF_FSCHECK) #define _TIF_SYSCALL_EMU (1<<TIF_SYSCALL_EMU) #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ -@@ -139,8 +146,9 @@ void arch_setup_new_exec(void); + _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_NOHZ | _TIF_SYSCALL_EMU) + #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ ++ _TIF_NEED_RESCHED_LAZY | \ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \ -- _TIF_FSCHECK) -+ _TIF_FSCHECK | _TIF_NEED_RESCHED_LAZY) + _TIF_NOTIFY_SIGNAL) #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) /* Bits in local_flags */ /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c -index 8711c2164b45..3ded638add45 100644 +index b12d7c049bfe..c098f496bbee 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c -@@ -188,6 +188,7 @@ int main(void) +@@ -191,6 +191,7 @@ int main(void) OFFSET(TI_FLAGS, thread_info, flags); OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags); OFFSET(TI_PREEMPT, thread_info, preempt_count); @@ -1605,10 +1243,10 @@ index 8711c2164b45..3ded638add45 100644 #ifdef CONFIG_PPC64 OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S -index f4d0af8e1136..22907e641938 100644 +index 1c9b0ccc2172..2650aea9d3c6 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S -@@ -415,7 +415,9 @@ ret_from_syscall: +@@ -420,7 +420,9 @@ ret_from_syscall: mtmsr r10 lwz r9,TI_FLAGS(r2) li r8,-MAX_ERRNO @@ -1619,7 +1257,7 @@ index f4d0af8e1136..22907e641938 100644 bne- syscall_exit_work cmplw 0,r3,r8 blt+ syscall_exit_cont -@@ -532,13 +534,13 @@ syscall_dotrace: +@@ -537,13 +539,13 @@ syscall_dotrace: b syscall_dotrace_cont syscall_exit_work: @@ -1635,7 +1273,7 @@ index f4d0af8e1136..22907e641938 100644 bne- 1f lwz r11,_CCR(r1) /* Load CR */ neg r3,r3 -@@ -547,12 +549,12 @@ syscall_exit_work: +@@ -552,12 +554,12 @@ syscall_exit_work: 1: stw r6,RESULT(r1) /* Save result */ stw r3,GPR3(r1) /* Update return value */ @@ -1650,7 +1288,7 @@ index f4d0af8e1136..22907e641938 100644 addi r12,r2,TI_FLAGS 3: lwarx r8,0,r12 andc r8,r8,r11 -@@ -942,7 +944,14 @@ resume_kernel: +@@ -940,7 +942,14 @@ resume_kernel: cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ bne restore_kuap andi. r8,r8,_TIF_NEED_RESCHED @@ -1665,7 +1303,7 @@ index f4d0af8e1136..22907e641938 100644 lwz r3,_MSR(r1) andi. r0,r3,MSR_EE /* interrupts off? 
*/ beq restore_kuap /* don't schedule if so */ -@@ -1265,7 +1274,7 @@ global_dbcr0: +@@ -1258,7 +1267,7 @@ global_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ @@ -1674,8 +1312,8 @@ index f4d0af8e1136..22907e641938 100644 beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ -@@ -1286,7 +1295,7 @@ recheck: - SYNC +@@ -1277,7 +1286,7 @@ recheck: + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) mtmsr r10 /* disable interrupts */ lwz r9,TI_FLAGS(r2) - andi. r0,r9,_TIF_NEED_RESCHED @@ -1684,10 +1322,10 @@ index f4d0af8e1136..22907e641938 100644 andi. r0,r9,_TIF_USER_WORK_MASK beq restore_user diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S -index d9ed79415100..50f7a1b8a6c8 100644 +index 74d07dc0bb48..a241bb078aa9 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S -@@ -1081,7 +1081,7 @@ _GLOBAL(ret_from_except_lite) +@@ -1080,7 +1080,7 @@ _GLOBAL(ret_from_except_lite) li r10, -1 mtspr SPRN_DBSR,r10 b restore @@ -1696,7 +1334,7 @@ index d9ed79415100..50f7a1b8a6c8 100644 beq 2f bl restore_interrupts SCHEDULE_USER -@@ -1133,12 +1133,20 @@ resume_kernel: +@@ -1132,12 +1132,20 @@ resume_kernel: bne- 0b 1: @@ -1719,7 +1357,7 @@ index d9ed79415100..50f7a1b8a6c8 100644 cmpwi cr0,r8,0 bne restore ld r0,SOFTE(r1) -@@ -1159,7 +1167,7 @@ resume_kernel: +@@ -1158,7 +1166,7 @@ resume_kernel: * interrupted after loading SRR0/1. */ wrteei 0 @@ -1729,10 +1367,10 @@ index d9ed79415100..50f7a1b8a6c8 100644 restore: /* diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c -index bf21ebd36190..171e41ae389b 100644 +index cc7a6271b6b4..ef164a39cfa8 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c -@@ -784,10 +784,12 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly; +@@ -728,10 +728,12 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly; void *softirq_ctx[NR_CPUS] __read_mostly; void *hardirq_ctx[NR_CPUS] __read_mostly; @@ -1746,7 +1384,7 @@ index bf21ebd36190..171e41ae389b 100644 irq_hw_number_t virq_to_hw(unsigned int virq) { diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S -index b24f866fef81..a4b903822d50 100644 +index 717e658b90fd..08ee95ad6593 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -31,6 +31,7 @@ @@ -1766,7 +1404,7 @@ index b24f866fef81..a4b903822d50 100644 /* * void call_do_irq(struct pt_regs *regs, void *sp); diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S -index 7bb46ad98207..442832bf599f 100644 +index 070465825c21..a6b33f7b3264 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -27,6 +27,7 @@ @@ -1785,11 +1423,52 @@ index 7bb46ad98207..442832bf599f 100644 _GLOBAL(call_do_irq) mflr r0 +diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c +index 532f22637783..1ef55f4b389a 100644 +--- a/arch/powerpc/kernel/nvram_64.c ++++ b/arch/powerpc/kernel/nvram_64.c +@@ -73,7 +73,8 @@ static const char *nvram_os_partitions[] = { + }; + + static void oops_to_nvram(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason); ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter); + + static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +@@ -643,7 +644,8 @@ void __init nvram_init_oops_partition(int rtas_partition_exists) + * partition. If that's too much, go back and capture uncompressed text. 
+ */ + static void oops_to_nvram(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; + static unsigned int oops_count = 0; +@@ -681,13 +683,13 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, + return; + + if (big_oops_buf) { +- kmsg_dump_get_buffer(dumper, false, ++ kmsg_dump_get_buffer(iter, false, + big_oops_buf, big_oops_buf_sz, &text_len); + rc = zip_oops(text_len); + } + if (rc != 0) { +- kmsg_dump_rewind(dumper); +- kmsg_dump_get_buffer(dumper, false, ++ kmsg_dump_rewind(iter); ++ kmsg_dump_get_buffer(iter, false, + oops_data, oops_data_sz, &text_len); + err_type = ERR_TYPE_KERNEL_PANIC; + oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c -index 8e50818aa50b..4be309aedbb4 100644 +index 7c85ed04a164..092c014b0653 100644 --- a/arch/powerpc/kernel/syscall_64.c +++ b/arch/powerpc/kernel/syscall_64.c -@@ -193,7 +193,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3, +@@ -217,7 +217,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3, ti_flags = READ_ONCE(*ti_flagsp); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); @@ -1798,7 +1477,7 @@ index 8e50818aa50b..4be309aedbb4 100644 schedule(); } else { /* -@@ -277,7 +277,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned +@@ -307,7 +307,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned ti_flags = READ_ONCE(*ti_flagsp); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); /* returning to user: may enable */ @@ -1807,7 +1486,7 @@ index 8e50818aa50b..4be309aedbb4 100644 schedule(); } else { if (ti_flags & _TIF_SIGPENDING) -@@ -361,11 +361,14 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign +@@ -395,11 +395,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign /* Returning to a kernel context with local irqs enabled. 
*/ WARN_ON_ONCE(!(regs->msr & MSR_EE)); again: @@ -1818,13 +1497,14 @@ index 8e50818aa50b..4be309aedbb4 100644 if (preempt_count() == 0) preempt_schedule_irq(); + } else if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED_LAZY)) { -+ if (current_thread_info()->preempt_lazy_count == 0) ++ if ((preempt_count() == 0) && ++ (current_thread_info()->preempt_lazy_count == 0)) + preempt_schedule_irq(); } } diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c -index d1ebe152f210..f5a1468b8d5b 100644 +index 3ec7b443fe6b..e7bb99775ffe 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -170,7 +170,6 @@ extern void panic_flush_kmsg_start(void) @@ -1896,8 +1576,42 @@ index 549591d9aaa2..efb5bfe93f70 100644 select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING +diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c +index afab328d0887..d6c3f0b79f1d 100644 +--- a/arch/powerpc/mm/mem.c ++++ b/arch/powerpc/mm/mem.c +@@ -54,7 +54,6 @@ + + #include <mm/mmu_decl.h> + +-static DEFINE_MUTEX(linear_mapping_mutex); + unsigned long long memory_limit; + bool init_mem_is_free; + +@@ -72,6 +71,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + EXPORT_SYMBOL(phys_mem_access_prot); + + #ifdef CONFIG_MEMORY_HOTPLUG ++static DEFINE_MUTEX(linear_mapping_mutex); + + #ifdef CONFIG_NUMA + int memory_add_physaddr_to_nid(u64 start) +diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c +index 6c3bc4b4da98..ec862846bc82 100644 +--- a/arch/powerpc/platforms/powernv/opal-kmsg.c ++++ b/arch/powerpc/platforms/powernv/opal-kmsg.c +@@ -20,7 +20,8 @@ + * message, it just ensures that OPAL completely flushes the console buffer. + */ + static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + /* + * Outside of a panic context the pollers will continue to run, diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c -index 6d47b4a3ce39..15eef4d607ed 100644 +index 9fc5217f0c8e..4fdb9370b913 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -24,6 +24,7 @@ @@ -1908,68 +1622,114 @@ index 6d47b4a3ce39..15eef4d607ed 100644 #include <asm/io.h> #include <asm/prom.h> #include <asm/rtas.h> -@@ -177,6 +178,7 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, +@@ -190,7 +191,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, + return ret; } - static DEFINE_PER_CPU(__be64 *, tce_page); -+static DEFINE_LOCAL_IRQ_LOCK(tcp_page_lock); +-static DEFINE_PER_CPU(__be64 *, tce_page); ++struct tce_page { ++ __be64 * page; ++ local_lock_t lock; ++}; ++static DEFINE_PER_CPU(struct tce_page, tce_page) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, -@@ -198,7 +200,8 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -212,9 +219,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, direction, attrs); } - local_irq_save(flags); /* to protect tcep and the page behind it */ + /* to protect tcep and the page behind it */ -+ local_lock_irqsave(tcp_page_lock, flags); ++ local_lock_irqsave(&tce_page.lock, flags); - tcep = __this_cpu_read(tce_page); +- tcep = __this_cpu_read(tce_page); ++ tcep = 
__this_cpu_read(tce_page.page); -@@ -209,7 +212,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + /* This is safe to do since interrupts are off when we're called + * from iommu_alloc{,_sg}() +@@ -223,12 +231,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { - local_irq_restore(flags); -+ local_unlock_irqrestore(tcp_page_lock, flags); ++ local_unlock_irqrestore(&tce_page.lock, flags); return tce_build_pSeriesLP(tbl->it_index, tcenum, tbl->it_page_shift, npages, uaddr, direction, attrs); -@@ -244,7 +247,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + } +- __this_cpu_write(tce_page, tcep); ++ __this_cpu_write(tce_page.page, tcep); + } + + rpn = __pa(uaddr) >> TCE_SHIFT; +@@ -258,7 +266,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcenum += limit; } while (npages > 0 && !rc); - local_irq_restore(flags); -+ local_unlock_irqrestore(tcp_page_lock, flags); ++ local_unlock_irqrestore(&tce_page.lock, flags); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; -@@ -415,13 +418,14 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, +@@ -429,16 +437,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, DMA_BIDIRECTIONAL, 0); } - local_irq_disable(); /* to protect tcep and the page behind it */ +- tcep = __this_cpu_read(tce_page); + /* to protect tcep and the page behind it */ -+ local_lock_irq(tcp_page_lock); - tcep = __this_cpu_read(tce_page); ++ local_lock_irq(&tce_page.lock); ++ tcep = __this_cpu_read(tce_page.page); if (!tcep) { tcep = (__be64 *)__get_free_page(GFP_ATOMIC); if (!tcep) { - local_irq_enable(); -+ local_unlock_irq(tcp_page_lock); ++ local_unlock_irq(&tce_page.lock); return -ENOMEM; } - __this_cpu_write(tce_page, tcep); -@@ -467,7 +471,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, +- __this_cpu_write(tce_page, tcep); ++ __this_cpu_write(tce_page.page, tcep); + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; +@@ -481,7 +490,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, /* error cleanup: caller will clear whole range */ - local_irq_enable(); -+ local_unlock_irq(tcp_page_lock); ++ local_unlock_irq(&tce_page.lock); return rc; } +diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c +index dcd817ca2edf..f51367a3b231 100644 +--- a/arch/powerpc/xmon/xmon.c ++++ b/arch/powerpc/xmon/xmon.c +@@ -3005,7 +3005,7 @@ print_address(unsigned long addr) + static void + dump_log_buf(void) + { +- struct kmsg_dumper dumper = { .active = 1 }; ++ struct kmsg_dumper_iter iter = { .active = 1 }; + unsigned char buf[128]; + size_t len; + +@@ -3017,9 +3017,9 @@ dump_log_buf(void) + catch_memory_errors = 1; + sync(); + +- kmsg_dump_rewind_nolock(&dumper); ++ kmsg_dump_rewind(&iter); + xmon_start_pagination(); +- while (kmsg_dump_get_line_nolock(&dumper, false, buf, sizeof(buf), &len)) { ++ while (kmsg_dump_get_line(&iter, false, buf, sizeof(buf), &len)) { + buf[len] = '\0'; + printf("%s", buf); + } diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h index cfed272e4fd5..8e28e8176ec8 100644 --- a/arch/s390/include/asm/spinlock_types.h @@ -2001,7 +1761,7 @@ index e82369f286a2..22ca9a98bbb8 100644 volatile unsigned int lock; } arch_spinlock_t; diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c -index 
5717c7cbdd97..c4e46252377e 100644 +index ab5f790b0cd2..5db7af565dec 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -148,6 +148,7 @@ void irq_ctx_exit(int cpu) @@ -2040,19 +1800,63 @@ index 3ec9f1402aad..eb21682abfcb 100644 #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) +diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c +index 6516ef1f8274..deab9b56b51f 100644 +--- a/arch/um/kernel/kmsg_dump.c ++++ b/arch/um/kernel/kmsg_dump.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + #include <linux/kmsg_dump.h> ++#include <linux/spinlock.h> + #include <linux/console.h> + #include <linux/string.h> + #include <shared/init.h> +@@ -7,10 +8,13 @@ + #include <os.h> + + static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { ++ static DEFINE_SPINLOCK(lock); + static char line[1024]; + struct console *con; ++ unsigned long flags; + size_t len = 0; + + /* only dump kmsg when no console is available */ +@@ -29,11 +33,16 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, + if (con) + return; + ++ if (!spin_trylock_irqsave(&lock, flags)) ++ return; ++ + printf("kmsg_dump:\n"); +- while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) { ++ while (kmsg_dump_get_line(iter, true, line, sizeof(line), &len)) { + line[len] = '\0'; + printf("%s", line); + } ++ ++ spin_unlock_irqrestore(&lock, flags); + } + + static struct kmsg_dumper kmsg_dumper = { diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 7101ac64bb20..13a1f43f6034 100644 +index 21f851179ff0..f4b80446b40a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig -@@ -92,6 +92,7 @@ config X86 - select ARCH_SUPPORTS_ACPI - select ARCH_SUPPORTS_ATOMIC_RMW +@@ -96,6 +96,7 @@ config X86 + select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 + select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS -@@ -208,6 +209,7 @@ config X86 +@@ -215,6 +216,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP @@ -2349,38 +2153,17 @@ index d3d91a0abf88..6d0774721514 100644 nbytes = walk.nbytes; } diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h -index b774c52e5411..7959a9c10e5a 100644 +index 67a4f1cb2aac..41d3be7da969 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h -@@ -23,6 +23,7 @@ extern void kernel_fpu_begin(void); +@@ -28,6 +28,7 @@ extern void kernel_fpu_begin_mask(unsigned int kfpu_mask); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); extern void fpregs_mark_activate(void); +extern void kernel_fpu_resched(void); - /* - * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. -@@ -33,12 +34,18 @@ extern void fpregs_mark_activate(void); - static inline void fpregs_lock(void) - { - preempt_disable(); -- local_bh_disable(); -+ /* -+ * On RT disabling preemption is good enough because bottom halfs -+ * are always running in thread context. 
-+ */ -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) -+ local_bh_disable(); - } - - static inline void fpregs_unlock(void) - { -- local_bh_enable(); -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) -+ local_bh_enable(); - preempt_enable(); - } - + /* Code that is unaware of kernel_fpu_begin_mask() can use this */ + static inline void kernel_fpu_begin(void) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 69485ca13665..471dec2d78e1 100644 --- a/arch/x86/include/asm/preempt.h @@ -2494,51 +2277,41 @@ index 7fb482f0f25b..3df0a95c9e13 100644 canary += tsc + (tsc << 32UL); canary &= CANARY_MASK; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h -index 267701ae3d86..350e342d9f25 100644 +index 0d751d5da702..2e62434951fa 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h -@@ -56,17 +56,24 @@ struct task_struct; - struct thread_info { +@@ -57,11 +57,14 @@ struct thread_info { unsigned long flags; /* low level flags */ + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ + int preempt_lazy_count; /* 0 => lazy preemptable -+ <0 => BUG */ ++ <0 => BUG */ }; #define INIT_THREAD_INFO(tsk) \ { \ .flags = 0, \ -+ .preempt_lazy_count = 0, \ ++ .preempt_lazy_count = 0, \ } #else /* !__ASSEMBLY__ */ - - #include <asm/asm-offsets.h> - -+#define GET_THREAD_INFO(reg) \ -+ _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ -+ _ASM_SUB $(THREAD_SIZE),reg ; -+ - #endif - - /* -@@ -93,6 +100,7 @@ struct thread_info { +@@ -90,6 +93,7 @@ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ - #define TIF_IA32 17 /* IA32 compatibility process */ + #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ #define TIF_SLD 18 /* Restore split lock detection on context switch */ +#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ -@@ -123,6 +131,7 @@ struct thread_info { +@@ -113,6 +117,7 @@ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) - #define _TIF_IA32 (1 << TIF_IA32) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SLD (1 << TIF_SLD) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -@@ -156,6 +165,8 @@ struct thread_info { +@@ -143,6 +148,8 @@ struct thread_info { #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) @@ -2548,10 +2321,10 @@ index 267701ae3d86..350e342d9f25 100644 /* diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c -index 31125448b174..0a3270aadfce 100644 +index 43b54bef5448..2471e53b293f 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c -@@ -75,11 +75,12 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); +@@ -80,11 +80,12 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -2566,10 +2339,10 @@ index 31125448b174..0a3270aadfce 100644 set_irq_regs(old_regs); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c -index eb86a2b831b1..23aaf9e132e9 100644 +index 571220ac8bea..d315d45b64fa 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c -@@ -158,6 +158,18 @@ void kernel_fpu_end(void) +@@ -159,6 
+159,18 @@ void kernel_fpu_end(void) } EXPORT_SYMBOL_GPL(kernel_fpu_end); @@ -2622,116 +2395,11 @@ index 440eed558558..7cfc4e6b7c94 100644 run_on_irqstack_cond(__do_softirq, NULL); } +#endif -diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c -index 4f2f54e1281c..17189e47cee8 100644 ---- a/arch/x86/kernel/process_32.c -+++ b/arch/x86/kernel/process_32.c -@@ -38,6 +38,7 @@ - #include <linux/io.h> - #include <linux/kdebug.h> - #include <linux/syscalls.h> -+#include <linux/highmem.h> - - #include <asm/ldt.h> - #include <asm/processor.h> -@@ -126,6 +127,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) - } - EXPORT_SYMBOL_GPL(start_thread); - -+#ifdef CONFIG_PREEMPT_RT -+static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) -+{ -+ int i; -+ -+ /* -+ * Clear @prev's kmap_atomic mappings -+ */ -+ for (i = 0; i < prev_p->kmap_idx; i++) { -+ int idx = i + KM_TYPE_NR * smp_processor_id(); -+ pte_t *ptep = kmap_pte - idx; -+ -+ kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx)); -+ } -+ /* -+ * Restore @next_p's kmap_atomic mappings -+ */ -+ for (i = 0; i < next_p->kmap_idx; i++) { -+ int idx = i + KM_TYPE_NR * smp_processor_id(); -+ -+ if (!pte_none(next_p->kmap_pte[i])) -+ set_pte(kmap_pte - idx, next_p->kmap_pte[i]); -+ } -+} -+#else -+static inline void -+switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } -+#endif -+ - - /* - * switch_to(x,y) should switch tasks from x to y. -@@ -187,6 +217,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - - switch_to_extra(prev_p, next_p); - -+ switch_kmaps(prev_p, next_p); -+ - /* - * Leave lazy mode, flushing any hypercalls made here. - * This must be done before restoring TLS segments so -diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c -index 49d925043171..f70dffc2771f 100644 ---- a/arch/x86/kernel/tsc.c -+++ b/arch/x86/kernel/tsc.c -@@ -54,7 +54,7 @@ struct clocksource *art_related_clocksource; - - struct cyc2ns { - struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ -- seqcount_t seq; /* 32 + 4 = 36 */ -+ seqcount_latch_t seq; /* 32 + 4 = 36 */ - - }; /* fits one cacheline */ - -@@ -73,14 +73,14 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) - preempt_disable_notrace(); - - do { -- seq = this_cpu_read(cyc2ns.seq.sequence); -+ seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); - idx = seq & 1; - - data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); - data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); - data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - -- } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); -+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); - } - - __always_inline void cyc2ns_read_end(void) -@@ -186,7 +186,7 @@ static void __init cyc2ns_init_boot_cpu(void) - { - struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); - -- seqcount_init(&c2n->seq); -+ seqcount_latch_init(&c2n->seq); - __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); - } - -@@ -203,7 +203,7 @@ static void __init cyc2ns_init_secondary_cpus(void) - - for_each_possible_cpu(cpu) { - if (cpu != this_cpu) { -- seqcount_init(&c2n->seq); -+ seqcount_latch_init(&c2n->seq); - c2n = per_cpu_ptr(&cyc2ns, cpu); - c2n->data[0] = data[0]; - c2n->data[1] = data[1]; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c -index ce856e0ece84..257dcd9ba1a0 100644 +index 1b404e4d7dd8..babf6c0a7685 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c -@@ 
-7513,6 +7513,14 @@ int kvm_arch_init(void *opaque) +@@ -7907,6 +7907,14 @@ int kvm_arch_init(void *opaque) goto out; } @@ -2746,84 +2414,6 @@ index ce856e0ece84..257dcd9ba1a0 100644 r = -ENOMEM; x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), __alignof__(struct fpu), SLAB_ACCOUNT, -diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c -index 075fe51317b0..95da91a7c7af 100644 ---- a/arch/x86/mm/highmem_32.c -+++ b/arch/x86/mm/highmem_32.c -@@ -8,12 +8,17 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) - { - unsigned long vaddr; - int idx, type; -+ pte_t pte; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); -- set_pte(kmap_pte-idx, mk_pte(page, prot)); -+ pte = mk_pte(page, prot); -+#ifdef CONFIG_PREEMPT_RT -+ current->kmap_pte[type] = pte; -+#endif -+ set_pte(kmap_pte-idx, pte); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -@@ -50,6 +55,9 @@ void kunmap_atomic_high(void *kvaddr) - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ -+#ifdef CONFIG_PREEMPT_RT -+ current->kmap_pte[type] = __pte(0); -+#endif - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - arch_flush_lazy_mmu_mode(); -diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c -index f60398aeb644..0ef360874c75 100644 ---- a/arch/x86/mm/iomap_32.c -+++ b/arch/x86/mm/iomap_32.c -@@ -46,16 +46,22 @@ EXPORT_SYMBOL_GPL(iomap_free); - - void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) - { -+ pte_t pte = pfn_pte(pfn, prot); - unsigned long vaddr; - int idx, type; - -- preempt_disable(); -+ migrate_disable(); - pagefault_disable(); - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -- set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); -+ WARN_ON(!pte_none(*(kmap_pte - idx))); -+ -+#ifdef CONFIG_PREEMPT_RT -+ current->kmap_pte[type] = pte; -+#endif -+ set_pte(kmap_pte - idx, pte); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -@@ -106,11 +112,14 @@ iounmap_atomic(void __iomem *kvaddr) - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. 
- */ -+#ifdef CONFIG_PREEMPT_RT -+ current->kmap_pte[type] = __pte(0); -+#endif - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - } - - pagefault_enable(); -- preempt_enable(); -+ migrate_enable(); - } - EXPORT_SYMBOL_GPL(iounmap_atomic); diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h index 64c9389254f1..dc846323b1cd 100644 --- a/arch/xtensa/include/asm/spinlock_types.h @@ -2840,43 +2430,188 @@ index 64c9389254f1..dc846323b1cd 100644 #include <asm-generic/qrwlock_types.h> diff --git a/block/blk-mq.c b/block/blk-mq.c -index cdced4aca2e8..e37aa31332b7 100644 +index f285a9123a8b..e284fc612f10 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c -@@ -604,6 +604,7 @@ static void blk_mq_trigger_softirq(struct request *rq) - if (list->next == &rq->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_restore(flags); -+ preempt_check_resched_rt(); - } +@@ -41,7 +41,7 @@ + #include "blk-mq-sched.h" + #include "blk-rq-qos.h" - static int blk_softirq_cpu_dead(unsigned int cpu) -@@ -617,6 +618,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu) - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); -+ preempt_check_resched_rt(); +-static DEFINE_PER_CPU(struct list_head, blk_cpu_done); ++static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); - return 0; + static void blk_mq_poll_stats_start(struct request_queue *q); + static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); +@@ -567,80 +567,29 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) } -@@ -1603,14 +1605,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, - return; - - if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { -- int cpu = get_cpu(); -+ int cpu = get_cpu_light(); - if (cpumask_test_cpu(cpu, hctx->cpumask)) { - __blk_mq_run_hw_queue(hctx); -- put_cpu(); -+ put_cpu_light(); - return; - } + EXPORT_SYMBOL(blk_mq_end_request); -- put_cpu(); -+ put_cpu_light(); - } +-/* +- * Softirq action handler - move entries to local list and loop over them +- * while passing them to the queue registered handler. +- */ +-static __latent_entropy void blk_done_softirq(struct softirq_action *h) ++static void blk_complete_reqs(struct llist_head *list) + { +- struct list_head *cpu_list, local_list; +- +- local_irq_disable(); +- cpu_list = this_cpu_ptr(&blk_cpu_done); +- list_replace_init(cpu_list, &local_list); +- local_irq_enable(); +- +- while (!list_empty(&local_list)) { +- struct request *rq; ++ struct llist_node *entry = llist_reverse_order(llist_del_all(list)); ++ struct request *rq, *next; + +- rq = list_entry(local_list.next, struct request, ipi_list); +- list_del_init(&rq->ipi_list); ++ llist_for_each_entry_safe(rq, next, entry, ipi_list) + rq->q->mq_ops->complete(rq); +- } + } + +-static void blk_mq_trigger_softirq(struct request *rq) ++static __latent_entropy void blk_done_softirq(struct softirq_action *h) + { +- struct list_head *list; +- unsigned long flags; +- +- local_irq_save(flags); +- list = this_cpu_ptr(&blk_cpu_done); +- list_add_tail(&rq->ipi_list, list); +- +- /* +- * If the list only contains our just added request, signal a raise of +- * the softirq. If there are already entries there, someone already +- * raised the irq but it hasn't run yet. 
+- */ +- if (list->next == &rq->ipi_list) +- raise_softirq_irqoff(BLOCK_SOFTIRQ); +- local_irq_restore(flags); ++ blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); + } + + static int blk_softirq_cpu_dead(unsigned int cpu) + { +- /* +- * If a CPU goes away, splice its entries to the current CPU +- * and trigger a run of the softirq +- */ +- local_irq_disable(); +- list_splice_init(&per_cpu(blk_cpu_done, cpu), +- this_cpu_ptr(&blk_cpu_done)); +- raise_softirq_irqoff(BLOCK_SOFTIRQ); +- local_irq_enable(); +- ++ blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); + return 0; + } + +- + static void __blk_mq_complete_request_remote(void *data) + { +- struct request *rq = data; +- +- /* +- * For most of single queue controllers, there is only one irq vector +- * for handling I/O completion, and the only irq's affinity is set +- * to all possible CPUs. On most of ARCHs, this affinity means the irq +- * is handled on one specific CPU. +- * +- * So complete I/O requests in softirq context in case of single queue +- * devices to avoid degrading I/O performance due to irqsoff latency. +- */ +- if (rq->q->nr_hw_queues == 1) +- blk_mq_trigger_softirq(rq); +- else +- rq->q->mq_ops->complete(rq); ++ __raise_softirq_irqoff(BLOCK_SOFTIRQ); + } + + static inline bool blk_mq_complete_need_ipi(struct request *rq) +@@ -669,6 +618,30 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) + return cpu_online(rq->mq_ctx->cpu); + } + ++static void blk_mq_complete_send_ipi(struct request *rq) ++{ ++ struct llist_head *list; ++ unsigned int cpu; ++ ++ cpu = rq->mq_ctx->cpu; ++ list = &per_cpu(blk_cpu_done, cpu); ++ if (llist_add(&rq->ipi_list, list)) { ++ INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); ++ smp_call_function_single_async(cpu, &rq->csd); ++ } ++} ++ ++static void blk_mq_raise_softirq(struct request *rq) ++{ ++ struct llist_head *list; ++ ++ preempt_disable(); ++ list = this_cpu_ptr(&blk_cpu_done); ++ if (llist_add(&rq->ipi_list, list)) ++ raise_softirq(BLOCK_SOFTIRQ); ++ preempt_enable(); ++} ++ + bool blk_mq_complete_request_remote(struct request *rq) + { + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); +@@ -681,15 +654,15 @@ bool blk_mq_complete_request_remote(struct request *rq) + return false; + + if (blk_mq_complete_need_ipi(rq)) { +- INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); +- smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); +- } else { +- if (rq->q->nr_hw_queues > 1) +- return false; +- blk_mq_trigger_softirq(rq); ++ blk_mq_complete_send_ipi(rq); ++ return true; + } + +- return true; ++ if (rq->q->nr_hw_queues == 1) { ++ blk_mq_raise_softirq(rq); ++ return true; ++ } ++ return false; + } + EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); + +@@ -1587,14 +1560,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, + return; + + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { +- int cpu = get_cpu(); ++ int cpu = get_cpu_light(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); +- put_cpu(); ++ put_cpu_light(); + return; + } + +- put_cpu(); ++ put_cpu_light(); + } kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, +@@ -3904,7 +3877,7 @@ static int __init blk_mq_init(void) + int i; + + for_each_possible_cpu(i) +- INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); ++ init_llist_head(&per_cpu(blk_cpu_done, i)); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); + + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, diff --git a/crypto/cryptd.c b/crypto/cryptd.c index 
a1bea0f4baa8..5f8ca8c1f59c 100644 --- a/crypto/cryptd.c @@ -2938,96 +2673,24 @@ index a1bea0f4baa8..5f8ca8c1f59c 100644 if (!req) return; -diff --git a/drivers/base/core.c b/drivers/base/core.c -index bb5806a2bd4c..f90e9f77bf8c 100644 ---- a/drivers/base/core.c -+++ b/drivers/base/core.c -@@ -4061,22 +4061,21 @@ void device_shutdown(void) - */ - - #ifdef CONFIG_PRINTK --static int --create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen) -+static void -+set_dev_info(const struct device *dev, struct dev_printk_info *dev_info) - { - const char *subsys; -- size_t pos = 0; -+ -+ memset(dev_info, 0, sizeof(*dev_info)); - - if (dev->class) - subsys = dev->class->name; - else if (dev->bus) - subsys = dev->bus->name; - else -- return 0; -+ return; - -- pos += snprintf(hdr + pos, hdrlen - pos, "SUBSYSTEM=%s", subsys); -- if (pos >= hdrlen) -- goto overflow; -+ strscpy(dev_info->subsystem, subsys, sizeof(dev_info->subsystem)); - - /* - * Add device identifier DEVICE=: -@@ -4092,41 +4091,28 @@ create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen) - c = 'b'; - else - c = 'c'; -- pos++; -- pos += snprintf(hdr + pos, hdrlen - pos, -- "DEVICE=%c%u:%u", -- c, MAJOR(dev->devt), MINOR(dev->devt)); -+ -+ snprintf(dev_info->device, sizeof(dev_info->device), -+ "%c%u:%u", c, MAJOR(dev->devt), MINOR(dev->devt)); - } else if (strcmp(subsys, "net") == 0) { - struct net_device *net = to_net_dev(dev); - -- pos++; -- pos += snprintf(hdr + pos, hdrlen - pos, -- "DEVICE=n%u", net->ifindex); -+ snprintf(dev_info->device, sizeof(dev_info->device), -+ "n%u", net->ifindex); - } else { -- pos++; -- pos += snprintf(hdr + pos, hdrlen - pos, -- "DEVICE=+%s:%s", subsys, dev_name(dev)); -+ snprintf(dev_info->device, sizeof(dev_info->device), -+ "+%s:%s", subsys, dev_name(dev)); +diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c +index 316a9947541f..e96a4e8a4a10 100644 +--- a/drivers/atm/eni.c ++++ b/drivers/atm/eni.c +@@ -2054,7 +2054,7 @@ static int eni_send(struct atm_vcc *vcc,struct sk_buff *skb) } -- -- if (pos >= hdrlen) -- goto overflow; -- -- return pos; -- --overflow: -- dev_WARN(dev, "device/subsystem name too long"); -- return 0; - } - - int dev_vprintk_emit(int level, const struct device *dev, - const char *fmt, va_list args) - { -- char hdr[128]; -- size_t hdrlen; -+ struct dev_printk_info dev_info; - -- hdrlen = create_syslog_header(dev, hdr, sizeof(hdr)); -+ set_dev_info(dev, &dev_info); - -- return vprintk_emit(0, level, hdrlen ? 
hdr : NULL, hdrlen, fmt, args); -+ return vprintk_emit(0, level, &dev_info, fmt, args); - } - EXPORT_SYMBOL(dev_vprintk_emit); - + submitted++; + ATM_SKB(skb)->vcc = vcc; +- tasklet_disable(&ENI_DEV(vcc->dev)->task); ++ tasklet_disable_in_atomic(&ENI_DEV(vcc->dev)->task); + res = do_tx(skb); + tasklet_enable(&ENI_DEV(vcc->dev)->task); + if (res == enq_ok) return 0; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c -index 9100ac36670a..4bd301dd22c3 100644 +index e2933cb7a82a..cc2aeefa20c8 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c -@@ -56,6 +56,40 @@ static void zram_free_page(struct zram *zram, size_t index); +@@ -59,6 +59,40 @@ static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); @@ -3068,7 +2731,7 @@ index 9100ac36670a..4bd301dd22c3 100644 static int zram_slot_trylock(struct zram *zram, u32 index) { -@@ -71,6 +105,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) +@@ -74,6 +108,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) { bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); } @@ -3076,7 +2739,7 @@ index 9100ac36670a..4bd301dd22c3 100644 static inline bool init_done(struct zram *zram) { -@@ -1158,6 +1193,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) +@@ -1165,6 +1200,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); @@ -3084,40 +2747,8 @@ index 9100ac36670a..4bd301dd22c3 100644 return true; } -@@ -1220,6 +1256,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, - unsigned long handle; - unsigned int size; - void *src, *dst; -+ struct zcomp_strm *zstrm; - - zram_slot_lock(zram, index); - if (zram_test_flag(zram, index, ZRAM_WB)) { -@@ -1250,6 +1287,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, - - size = zram_get_obj_size(zram, index); - -+ zstrm = zcomp_stream_get(zram->comp); - src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); - if (size == PAGE_SIZE) { - dst = kmap_atomic(page); -@@ -1257,14 +1295,13 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, - kunmap_atomic(dst); - ret = 0; - } else { -- struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); - - dst = kmap_atomic(page); - ret = zcomp_decompress(zstrm, src, size, dst); - kunmap_atomic(dst); -- zcomp_stream_put(zram->comp); - } - zs_unmap_object(zram->mem_pool, handle); -+ zcomp_stream_put(zram->comp); - zram_slot_unlock(zram, index); - - /* Should NEVER happen. Return bio error if it does. 
*/ diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h -index f2fd46daa760..7e4dd447e1dd 100644 +index 419a7e8281ee..561c7ba1421f 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -63,6 +63,7 @@ struct zram_table_entry { @@ -3129,7 +2760,7 @@ index f2fd46daa760..7e4dd447e1dd 100644 ktime_t ac_time; #endif diff --git a/drivers/char/random.c b/drivers/char/random.c -index d20ba1b104ca..081a1a4b9d70 100644 +index 5f3b8ac9d97b..ee92e44484a8 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1252,28 +1252,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs) @@ -3179,10 +2810,10 @@ index 1784530b8387..c08cbb306636 100644 static ssize_t tpm_dev_transmit(struct tpm_chip *chip, struct tpm_space *space, u8 *buf, size_t bufsiz) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c -index 0b214963539d..39cee10c59f8 100644 +index 4ed6e660273a..c2bd0d40b5fc 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c -@@ -49,6 +49,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da +@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da return container_of(data, struct tpm_tis_tcg_phy, priv); } @@ -3211,10 +2842,10 @@ index 0b214963539d..39cee10c59f8 100644 + tpm_tis_flush(iobase); +} + - static bool interrupts = true; - module_param(interrupts, bool, 0444); + static int interrupts = -1; + module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); -@@ -146,7 +171,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, +@@ -169,7 +194,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); while (len--) @@ -3223,7 +2854,7 @@ index 0b214963539d..39cee10c59f8 100644 return 0; } -@@ -173,7 +198,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value) +@@ -196,7 +221,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value) { struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); @@ -3232,11 +2863,33 @@ index 0b214963539d..39cee10c59f8 100644 return 0; } +diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c +index 9811c40956e5..17c9d825188b 100644 +--- a/drivers/firewire/ohci.c ++++ b/drivers/firewire/ohci.c +@@ -2545,7 +2545,7 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet) + struct driver_data *driver_data = packet->driver_data; + int ret = -ENOENT; + +- tasklet_disable(&ctx->tasklet); ++ tasklet_disable_in_atomic(&ctx->tasklet); + + if (packet->ack != 0) + goto out; +@@ -3465,7 +3465,7 @@ static int ohci_flush_iso_completions(struct fw_iso_context *base) + struct iso_context *ctx = container_of(base, struct iso_context, base); + int ret = 0; + +- tasklet_disable(&ctx->context.tasklet); ++ tasklet_disable_in_atomic(&ctx->context.tasklet); + + if (!test_and_set_bit_lock(0, &ctx->flushing_completions)) { + context_tasklet((unsigned long)&ctx->context); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c -index 3aa07c3b5136..25a8e81d1742 100644 +index df3f9bcab581..709c65c0a816 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c -@@ -62,7 +62,7 @@ struct mm_struct efi_mm = { +@@ -66,7 +66,7 @@ struct mm_struct efi_mm = { struct workqueue_struct *efi_rts_wq; @@ -3245,7 +2898,7 @@ index 3aa07c3b5136..25a8e81d1742 100644 static int __init setup_noefi(char *arg) 
{ disable_runtime = true; -@@ -93,6 +93,9 @@ static int __init parse_efi_cmdline(char *str) +@@ -97,6 +97,9 @@ static int __init parse_efi_cmdline(char *str) if (parse_option_str(str, "noruntime")) disable_runtime = true; @@ -3256,10 +2909,10 @@ index 3aa07c3b5136..25a8e81d1742 100644 set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); diff --git a/drivers/gpu/drm/i915/display/intel_sprite.c b/drivers/gpu/drm/i915/display/intel_sprite.c -index c89f5f7ccb06..1b9d5e690a9f 100644 +index 3da2544fa1c0..338ed08a2e0e 100644 --- a/drivers/gpu/drm/i915/display/intel_sprite.c +++ b/drivers/gpu/drm/i915/display/intel_sprite.c -@@ -118,7 +118,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) +@@ -122,7 +122,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) "PSR idle timed out 0x%x, atomic update may fail\n", psr_status); @@ -3269,7 +2922,7 @@ index c89f5f7ccb06..1b9d5e690a9f 100644 crtc->debug.min_vbl = min; crtc->debug.max_vbl = max; -@@ -143,11 +144,13 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) +@@ -147,11 +148,13 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) break; } @@ -3285,7 +2938,7 @@ index c89f5f7ccb06..1b9d5e690a9f 100644 } finish_wait(wq, &wait); -@@ -180,7 +183,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) +@@ -184,7 +187,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) return; irq_disable: @@ -3295,7 +2948,7 @@ index c89f5f7ccb06..1b9d5e690a9f 100644 } /** -@@ -218,7 +222,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) +@@ -233,7 +237,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) new_crtc_state->uapi.event = NULL; } @@ -3306,10 +2959,10 @@ index c89f5f7ccb06..1b9d5e690a9f 100644 if (intel_vgpu_active(dev_priv)) return; diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c -index 8ec3eecf3e39..31bb90153041 100644 +index 499b09cb4acf..777fd6010f48 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c -@@ -59,9 +59,10 @@ static int __engine_unpark(struct intel_wakeref *wf) +@@ -73,9 +73,10 @@ static int __engine_unpark(struct intel_wakeref *wf) static inline unsigned long __timeline_mark_lock(struct intel_context *ce) { @@ -3322,7 +2975,7 @@ index 8ec3eecf3e39..31bb90153041 100644 mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_); return flags; -@@ -71,7 +72,8 @@ static inline void __timeline_mark_unlock(struct intel_context *ce, +@@ -85,7 +86,8 @@ static inline void __timeline_mark_unlock(struct intel_context *ce, unsigned long flags) { mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); @@ -3333,10 +2986,10 @@ index 8ec3eecf3e39..31bb90153041 100644 #else diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c -index 1fa67700d8f4..4ee735b91e97 100644 +index 6cdb052e3850..c9c39723bf49 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c -@@ -865,6 +865,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +@@ -878,6 +878,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ @@ -3344,7 +2997,7 @@ index 1fa67700d8f4..4ee735b91e97 100644 /* Get optional system timestamp before query. 
*/ if (stime) -@@ -916,6 +917,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +@@ -929,6 +930,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, *etime = ktime_get(); /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ @@ -3377,10 +3030,10 @@ index a4addcc64978..396b6598694d 100644 TP_PROTO(struct i915_request *rq), TP_ARGS(rq) diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c -index e0ae911ef427..781edf550436 100644 +index 3a6fedad002d..4800dfd8a5fb 100644 --- a/drivers/gpu/drm/radeon/radeon_display.c +++ b/drivers/gpu/drm/radeon/radeon_display.c -@@ -1822,6 +1822,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, +@@ -1813,6 +1813,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, struct radeon_device *rdev = dev->dev_private; /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ @@ -3388,7 +3041,7 @@ index e0ae911ef427..781edf550436 100644 /* Get optional system timestamp before query. */ if (stime) -@@ -1914,6 +1915,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, +@@ -1905,6 +1906,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, *etime = ktime_get(); /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ @@ -3397,7 +3050,7 @@ index e0ae911ef427..781edf550436 100644 /* Decode into vertical and horizontal scanout position. */ *vpos = position & 0x1fff; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h -index 40e2b9f91163..d9de4813ffac 100644 +index 9416e09ebd58..4a5767a15544 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -18,6 +18,7 @@ @@ -3409,7 +3062,7 @@ index 40e2b9f91163..d9de4813ffac 100644 #include "hv_trace.h" diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c -index 946d0aba101f..9447315ccd7c 100644 +index d491fdcee61f..e25ffe09400a 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -22,6 +22,7 @@ @@ -3420,7 +3073,7 @@ index 946d0aba101f..9447315ccd7c 100644 #include <linux/delay.h> #include <linux/notifier.h> -@@ -1303,6 +1304,8 @@ static void vmbus_isr(void) +@@ -1310,6 +1311,8 @@ static void vmbus_isr(void) void *page_addr = hv_cpu->synic_event_page; struct hv_message *msg; union hv_synic_event_flags *event; @@ -3429,15 +3082,34 @@ index 946d0aba101f..9447315ccd7c 100644 bool handled = false; if (unlikely(page_addr == NULL)) -@@ -1347,7 +1350,7 @@ static void vmbus_isr(void) +@@ -1354,7 +1357,7 @@ static void vmbus_isr(void) tasklet_schedule(&hv_cpu->msg_dpc); } -- add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0); -+ add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip); +- add_interrupt_randomness(hv_get_vector(), 0); ++ add_interrupt_randomness(hv_get_vector(), 0, ip); } /* +@@ -1362,7 +1365,8 @@ static void vmbus_isr(void) + * buffer and call into Hyper-V to transfer the data. + */ + static void hv_kmsg_dump(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + size_t bytes_written; + phys_addr_t panic_pa; +@@ -1377,7 +1381,7 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper, + * Write dump contents to the page. No need to synchronize; panic should + * be single-threaded. 
+ */ +- kmsg_dump_get_buffer(dumper, false, hv_panic_page, HV_HYP_PAGE_SIZE, ++ kmsg_dump_get_buffer(iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, + &bytes_written); + if (bytes_written) + hyperv_report_panic_msg(panic_pa, bytes_written); diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index ce9429ca6dde..29ccbd6acf43 100644 --- a/drivers/leds/trigger/Kconfig @@ -3451,10 +3123,10 @@ index ce9429ca6dde..29ccbd6acf43 100644 This allows LEDs to be controlled by active CPUs. This shows the active CPUs across an array of LEDs so you can see which diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c -index 225380efd1e2..7ae33e2edd35 100644 +index 3a90cc0e43ca..6c20c6e9bf1f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c -@@ -2077,8 +2077,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +@@ -2216,8 +2216,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) struct raid5_percpu *percpu; unsigned long cpu; @@ -3465,7 +3137,7 @@ index 225380efd1e2..7ae33e2edd35 100644 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; -@@ -2137,7 +2138,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +@@ -2276,7 +2277,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } @@ -3475,7 +3147,7 @@ index 225380efd1e2..7ae33e2edd35 100644 } static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) -@@ -6902,6 +6904,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) +@@ -7097,6 +7099,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) __func__, cpu); return -ENOMEM; } @@ -3484,10 +3156,10 @@ index 225380efd1e2..7ae33e2edd35 100644 } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h -index 16fc29472f5c..d16edbdcde6e 100644 +index 5c05acf20e1f..665fe138ab4f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h -@@ -627,6 +627,7 @@ struct r5conf { +@@ -635,6 +635,7 @@ struct r5conf { int recovery_disabled; /* per cpu variables */ struct raid5_percpu { @@ -3495,8 +3167,495 @@ index 16fc29472f5c..d16edbdcde6e 100644 struct page *spare_page; /* Used when checking P/Q in raid6 */ void *scribble; /* space for constructing buffer * lists and performing address +diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c +index 774970bfcf85..6bc2c728adb7 100644 +--- a/drivers/mtd/mtdoops.c ++++ b/drivers/mtd/mtdoops.c +@@ -267,7 +267,8 @@ static void find_next_position(struct mtdoops_context *cxt) + } + + static void mtdoops_do_dump(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + struct mtdoops_context *cxt = container_of(dumper, + struct mtdoops_context, dump); +@@ -276,7 +277,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper, + if (reason == KMSG_DUMP_OOPS && !dump_oops) + return; + +- kmsg_dump_get_buffer(dumper, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, ++ kmsg_dump_get_buffer(iter, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, + record_size - MTDOOPS_HEADER_SIZE, NULL); + + if (reason != KMSG_DUMP_OOPS) { +diff --git a/drivers/net/ethernet/chelsio/cxgb/common.h b/drivers/net/ethernet/chelsio/cxgb/common.h +index 6475060649e9..0321be77366c 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/common.h ++++ b/drivers/net/ethernet/chelsio/cxgb/common.h +@@ -238,7 +238,6 @@ struct adapter { + 
int msg_enable; + u32 mmio_len; + +- struct work_struct ext_intr_handler_task; + struct adapter_params params; + + /* Terminator modules. */ +@@ -257,6 +256,7 @@ struct adapter { + + /* guards async operations */ + spinlock_t async_lock ____cacheline_aligned; ++ u32 pending_thread_intr; + u32 slow_intr_mask; + int t1powersave; + }; +@@ -334,8 +334,7 @@ void t1_interrupts_enable(adapter_t *adapter); + void t1_interrupts_disable(adapter_t *adapter); + void t1_interrupts_clear(adapter_t *adapter); + int t1_elmer0_ext_intr_handler(adapter_t *adapter); +-void t1_elmer0_ext_intr(adapter_t *adapter); +-int t1_slow_intr_handler(adapter_t *adapter); ++irqreturn_t t1_slow_intr_handler(adapter_t *adapter); + + int t1_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc); + const struct board_info *t1_get_board_info(unsigned int board_id); +@@ -347,7 +346,6 @@ int t1_get_board_rev(adapter_t *adapter, const struct board_info *bi, + int t1_init_hw_modules(adapter_t *adapter); + int t1_init_sw_modules(adapter_t *adapter, const struct board_info *bi); + void t1_free_sw_modules(adapter_t *adapter); +-void t1_fatal_err(adapter_t *adapter); + void t1_link_changed(adapter_t *adapter, int port_id); + void t1_link_negotiated(adapter_t *adapter, int port_id, int link_stat, + int speed, int duplex, int pause); +diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +index 0e4a0f413960..512da98019c6 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c ++++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +@@ -211,9 +211,10 @@ static int cxgb_up(struct adapter *adapter) + t1_interrupts_clear(adapter); + + adapter->params.has_msi = !disable_msi && !pci_enable_msi(adapter->pdev); +- err = request_irq(adapter->pdev->irq, t1_interrupt, +- adapter->params.has_msi ? 0 : IRQF_SHARED, +- adapter->name, adapter); ++ err = request_threaded_irq(adapter->pdev->irq, t1_interrupt, ++ t1_interrupt_thread, ++ adapter->params.has_msi ? 0 : IRQF_SHARED, ++ adapter->name, adapter); + if (err) { + if (adapter->params.has_msi) + pci_disable_msi(adapter->pdev); +@@ -916,51 +917,6 @@ static void mac_stats_task(struct work_struct *work) + spin_unlock(&adapter->work_lock); + } + +-/* +- * Processes elmer0 external interrupts in process context. +- */ +-static void ext_intr_task(struct work_struct *work) +-{ +- struct adapter *adapter = +- container_of(work, struct adapter, ext_intr_handler_task); +- +- t1_elmer0_ext_intr_handler(adapter); +- +- /* Now reenable external interrupts */ +- spin_lock_irq(&adapter->async_lock); +- adapter->slow_intr_mask |= F_PL_INTR_EXT; +- writel(F_PL_INTR_EXT, adapter->regs + A_PL_CAUSE); +- writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, +- adapter->regs + A_PL_ENABLE); +- spin_unlock_irq(&adapter->async_lock); +-} +- +-/* +- * Interrupt-context handler for elmer0 external interrupts. +- */ +-void t1_elmer0_ext_intr(struct adapter *adapter) +-{ +- /* +- * Schedule a task to handle external interrupts as we require +- * a process context. We disable EXT interrupts in the interim +- * and let the task reenable them when it's done. 
+- */ +- adapter->slow_intr_mask &= ~F_PL_INTR_EXT; +- writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, +- adapter->regs + A_PL_ENABLE); +- schedule_work(&adapter->ext_intr_handler_task); +-} +- +-void t1_fatal_err(struct adapter *adapter) +-{ +- if (adapter->flags & FULL_INIT_DONE) { +- t1_sge_stop(adapter->sge); +- t1_interrupts_disable(adapter); +- } +- pr_alert("%s: encountered fatal error, operation suspended\n", +- adapter->name); +-} +- + static const struct net_device_ops cxgb_netdev_ops = { + .ndo_open = cxgb_open, + .ndo_stop = cxgb_close, +@@ -1062,8 +1018,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) + spin_lock_init(&adapter->async_lock); + spin_lock_init(&adapter->mac_lock); + +- INIT_WORK(&adapter->ext_intr_handler_task, +- ext_intr_task); + INIT_DELAYED_WORK(&adapter->stats_update_task, + mac_stats_task); + +diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c +index 2d9c2b5a690a..cda01f22c71c 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/sge.c ++++ b/drivers/net/ethernet/chelsio/cxgb/sge.c +@@ -940,10 +940,11 @@ void t1_sge_intr_clear(struct sge *sge) + /* + * SGE 'Error' interrupt handler + */ +-int t1_sge_intr_error_handler(struct sge *sge) ++bool t1_sge_intr_error_handler(struct sge *sge) + { + struct adapter *adapter = sge->adapter; + u32 cause = readl(adapter->regs + A_SG_INT_CAUSE); ++ bool wake = false; + + if (adapter->port[0].dev->hw_features & NETIF_F_TSO) + cause &= ~F_PACKET_TOO_BIG; +@@ -967,11 +968,14 @@ int t1_sge_intr_error_handler(struct sge *sge) + sge->stats.pkt_mismatch++; + pr_alert("%s: SGE packet mismatch\n", adapter->name); + } +- if (cause & SGE_INT_FATAL) +- t1_fatal_err(adapter); ++ if (cause & SGE_INT_FATAL) { ++ t1_interrupts_disable(adapter); ++ adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR; ++ wake = true; ++ } + + writel(cause, adapter->regs + A_SG_INT_CAUSE); +- return 0; ++ return wake; + } + + const struct sge_intr_counts *t1_sge_get_intr_counts(const struct sge *sge) +@@ -1619,11 +1623,46 @@ int t1_poll(struct napi_struct *napi, int budget) + return work_done; + } + ++irqreturn_t t1_interrupt_thread(int irq, void *data) ++{ ++ struct adapter *adapter = data; ++ u32 pending_thread_intr; ++ ++ spin_lock_irq(&adapter->async_lock); ++ pending_thread_intr = adapter->pending_thread_intr; ++ adapter->pending_thread_intr = 0; ++ spin_unlock_irq(&adapter->async_lock); ++ ++ if (!pending_thread_intr) ++ return IRQ_NONE; ++ ++ if (pending_thread_intr & F_PL_INTR_EXT) ++ t1_elmer0_ext_intr_handler(adapter); ++ ++ /* This error is fatal, interrupts remain off */ ++ if (pending_thread_intr & F_PL_INTR_SGE_ERR) { ++ pr_alert("%s: encountered fatal error, operation suspended\n", ++ adapter->name); ++ t1_sge_stop(adapter->sge); ++ return IRQ_HANDLED; ++ } ++ ++ spin_lock_irq(&adapter->async_lock); ++ adapter->slow_intr_mask |= F_PL_INTR_EXT; ++ ++ writel(F_PL_INTR_EXT, adapter->regs + A_PL_CAUSE); ++ writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, ++ adapter->regs + A_PL_ENABLE); ++ spin_unlock_irq(&adapter->async_lock); ++ ++ return IRQ_HANDLED; ++} ++ + irqreturn_t t1_interrupt(int irq, void *data) + { + struct adapter *adapter = data; + struct sge *sge = adapter->sge; +- int handled; ++ irqreturn_t handled; + + if (likely(responses_pending(adapter))) { + writel(F_PL_INTR_SGE_DATA, adapter->regs + A_PL_CAUSE); +@@ -1645,10 +1684,10 @@ irqreturn_t t1_interrupt(int irq, void *data) + handled = t1_slow_intr_handler(adapter); + spin_unlock(&adapter->async_lock); + +- 
if (!handled) ++ if (handled == IRQ_NONE) + sge->stats.unhandled_irqs++; + +- return IRQ_RETVAL(handled != 0); ++ return handled; + } + + /* +diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.h b/drivers/net/ethernet/chelsio/cxgb/sge.h +index a1ba591b3431..716705b96f26 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/sge.h ++++ b/drivers/net/ethernet/chelsio/cxgb/sge.h +@@ -74,6 +74,7 @@ struct sge *t1_sge_create(struct adapter *, struct sge_params *); + int t1_sge_configure(struct sge *, struct sge_params *); + int t1_sge_set_coalesce_params(struct sge *, struct sge_params *); + void t1_sge_destroy(struct sge *); ++irqreturn_t t1_interrupt_thread(int irq, void *data); + irqreturn_t t1_interrupt(int irq, void *cookie); + int t1_poll(struct napi_struct *, int); + +@@ -81,7 +82,7 @@ netdev_tx_t t1_start_xmit(struct sk_buff *skb, struct net_device *dev); + void t1_vlan_mode(struct adapter *adapter, netdev_features_t features); + void t1_sge_start(struct sge *); + void t1_sge_stop(struct sge *); +-int t1_sge_intr_error_handler(struct sge *); ++bool t1_sge_intr_error_handler(struct sge *sge); + void t1_sge_intr_enable(struct sge *); + void t1_sge_intr_disable(struct sge *); + void t1_sge_intr_clear(struct sge *); +diff --git a/drivers/net/ethernet/chelsio/cxgb/subr.c b/drivers/net/ethernet/chelsio/cxgb/subr.c +index ea0f8741d7cf..310add28fcf5 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/subr.c ++++ b/drivers/net/ethernet/chelsio/cxgb/subr.c +@@ -170,7 +170,7 @@ void t1_link_changed(adapter_t *adapter, int port_id) + t1_link_negotiated(adapter, port_id, link_ok, speed, duplex, fc); + } + +-static int t1_pci_intr_handler(adapter_t *adapter) ++static bool t1_pci_intr_handler(adapter_t *adapter) + { + u32 pcix_cause; + +@@ -179,9 +179,13 @@ static int t1_pci_intr_handler(adapter_t *adapter) + if (pcix_cause) { + pci_write_config_dword(adapter->pdev, A_PCICFG_INTR_CAUSE, + pcix_cause); +- t1_fatal_err(adapter); /* PCI errors are fatal */ ++ /* PCI errors are fatal */ ++ t1_interrupts_disable(adapter); ++ adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR; ++ pr_alert("%s: PCI error encountered.\n", adapter->name); ++ return true; + } +- return 0; ++ return false; + } + + #ifdef CONFIG_CHELSIO_T1_1G +@@ -210,13 +214,16 @@ static int fpga_phy_intr_handler(adapter_t *adapter) + /* + * Slow path interrupt handler for FPGAs. + */ +-static int fpga_slow_intr(adapter_t *adapter) ++static irqreturn_t fpga_slow_intr(adapter_t *adapter) + { + u32 cause = readl(adapter->regs + A_PL_CAUSE); ++ irqreturn_t ret = IRQ_NONE; + + cause &= ~F_PL_INTR_SGE_DATA; +- if (cause & F_PL_INTR_SGE_ERR) +- t1_sge_intr_error_handler(adapter->sge); ++ if (cause & F_PL_INTR_SGE_ERR) { ++ if (t1_sge_intr_error_handler(adapter->sge)) ++ ret = IRQ_WAKE_THREAD; ++ } + + if (cause & FPGA_PCIX_INTERRUPT_GMAC) + fpga_phy_intr_handler(adapter); +@@ -231,14 +238,19 @@ static int fpga_slow_intr(adapter_t *adapter) + /* Clear TP interrupt */ + writel(tp_cause, adapter->regs + FPGA_TP_ADDR_INTERRUPT_CAUSE); + } +- if (cause & FPGA_PCIX_INTERRUPT_PCIX) +- t1_pci_intr_handler(adapter); ++ if (cause & FPGA_PCIX_INTERRUPT_PCIX) { ++ if (t1_pci_intr_handler(adapter)) ++ ret = IRQ_WAKE_THREAD; ++ } + + /* Clear the interrupts just processed. */ + if (cause) + writel(cause, adapter->regs + A_PL_CAUSE); + +- return cause != 0; ++ if (ret != IRQ_NONE) ++ return ret; ++ ++ return cause == 0 ? IRQ_NONE : IRQ_HANDLED; + } + #endif + +@@ -842,31 +854,45 @@ void t1_interrupts_clear(adapter_t* adapter) + /* + * Slow path interrupt handler for ASICs. 
+ */ +-static int asic_slow_intr(adapter_t *adapter) ++static irqreturn_t asic_slow_intr(adapter_t *adapter) + { + u32 cause = readl(adapter->regs + A_PL_CAUSE); ++ irqreturn_t ret = IRQ_HANDLED; + + cause &= adapter->slow_intr_mask; + if (!cause) +- return 0; +- if (cause & F_PL_INTR_SGE_ERR) +- t1_sge_intr_error_handler(adapter->sge); ++ return IRQ_NONE; ++ if (cause & F_PL_INTR_SGE_ERR) { ++ if (t1_sge_intr_error_handler(adapter->sge)) ++ ret = IRQ_WAKE_THREAD; ++ } + if (cause & F_PL_INTR_TP) + t1_tp_intr_handler(adapter->tp); + if (cause & F_PL_INTR_ESPI) + t1_espi_intr_handler(adapter->espi); +- if (cause & F_PL_INTR_PCIX) +- t1_pci_intr_handler(adapter); +- if (cause & F_PL_INTR_EXT) +- t1_elmer0_ext_intr(adapter); ++ if (cause & F_PL_INTR_PCIX) { ++ if (t1_pci_intr_handler(adapter)) ++ ret = IRQ_WAKE_THREAD; ++ } ++ if (cause & F_PL_INTR_EXT) { ++ /* Wake the threaded interrupt to handle external interrupts as ++ * we require a process context. We disable EXT interrupts in ++ * the interim and let the thread reenable them when it's done. ++ */ ++ adapter->pending_thread_intr |= F_PL_INTR_EXT; ++ adapter->slow_intr_mask &= ~F_PL_INTR_EXT; ++ writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, ++ adapter->regs + A_PL_ENABLE); ++ ret = IRQ_WAKE_THREAD; ++ } + + /* Clear the interrupts just processed. */ + writel(cause, adapter->regs + A_PL_CAUSE); + readl(adapter->regs + A_PL_CAUSE); /* flush writes */ +- return 1; ++ return ret; + } + +-int t1_slow_intr_handler(adapter_t *adapter) ++irqreturn_t t1_slow_intr_handler(adapter_t *adapter) + { + #ifdef CONFIG_CHELSIO_T1_1G + if (!t1_is_asic(adapter)) +diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c +index e3a8858915b3..df0eab479d51 100644 +--- a/drivers/net/ethernet/dlink/sundance.c ++++ b/drivers/net/ethernet/dlink/sundance.c +@@ -963,7 +963,7 @@ static void tx_timeout(struct net_device *dev, unsigned int txqueue) + unsigned long flag; + + netif_stop_queue(dev); +- tasklet_disable(&np->tx_tasklet); ++ tasklet_disable_in_atomic(&np->tx_tasklet); + iowrite16(0, ioaddr + IntrEnable); + printk(KERN_WARNING "%s: Transmit timed out, TxStatus %2.2x " + "TxFrameId %2.2x," +diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c +index e9efe074edc1..f1b9284e0bea 100644 +--- a/drivers/net/ethernet/jme.c ++++ b/drivers/net/ethernet/jme.c +@@ -1265,9 +1265,9 @@ jme_stop_shutdown_timer(struct jme_adapter *jme) + jwrite32f(jme, JME_APMC, apmc); + } + +-static void jme_link_change_tasklet(struct tasklet_struct *t) ++static void jme_link_change_work(struct work_struct *work) + { +- struct jme_adapter *jme = from_tasklet(jme, t, linkch_task); ++ struct jme_adapter *jme = container_of(work, struct jme_adapter, linkch_task); + struct net_device *netdev = jme->dev; + int rc; + +@@ -1510,7 +1510,7 @@ jme_intr_msi(struct jme_adapter *jme, u32 intrstat) + * all other events are ignored + */ + jwrite32(jme, JME_IEVE, intrstat); +- tasklet_schedule(&jme->linkch_task); ++ schedule_work(&jme->linkch_task); + goto out_reenable; + } + +@@ -1832,7 +1832,6 @@ jme_open(struct net_device *netdev) + jme_clear_pm_disable_wol(jme); + JME_NAPI_ENABLE(jme); + +- tasklet_setup(&jme->linkch_task, jme_link_change_tasklet); + tasklet_setup(&jme->txclean_task, jme_tx_clean_tasklet); + tasklet_setup(&jme->rxclean_task, jme_rx_clean_tasklet); + tasklet_setup(&jme->rxempty_task, jme_rx_empty_tasklet); +@@ -1920,7 +1919,7 @@ jme_close(struct net_device *netdev) + + JME_NAPI_DISABLE(jme); + +- tasklet_kill(&jme->linkch_task); ++ 
cancel_work_sync(&jme->linkch_task); + tasklet_kill(&jme->txclean_task); + tasklet_kill(&jme->rxclean_task); + tasklet_kill(&jme->rxempty_task); +@@ -3035,6 +3034,7 @@ jme_init_one(struct pci_dev *pdev, + atomic_set(&jme->rx_empty, 1); + + tasklet_setup(&jme->pcc_task, jme_pcc_tasklet); ++ INIT_WORK(&jme->linkch_task, jme_link_change_work); + jme->dpi.cur = PCC_P1; + + jme->reg_ghc = 0; +diff --git a/drivers/net/ethernet/jme.h b/drivers/net/ethernet/jme.h +index a2c3b00d939d..2af76329b4a2 100644 +--- a/drivers/net/ethernet/jme.h ++++ b/drivers/net/ethernet/jme.h +@@ -411,7 +411,7 @@ struct jme_adapter { + struct tasklet_struct rxempty_task; + struct tasklet_struct rxclean_task; + struct tasklet_struct txclean_task; +- struct tasklet_struct linkch_task; ++ struct work_struct linkch_task; + struct tasklet_struct pcc_task; + unsigned long flags; + u32 reg_txcs; +diff --git a/drivers/net/wireless/ath/ath9k/beacon.c b/drivers/net/wireless/ath/ath9k/beacon.c +index 71e2ada86793..72e2e71aac0e 100644 +--- a/drivers/net/wireless/ath/ath9k/beacon.c ++++ b/drivers/net/wireless/ath/ath9k/beacon.c +@@ -251,7 +251,7 @@ void ath9k_beacon_ensure_primary_slot(struct ath_softc *sc) + int first_slot = ATH_BCBUF; + int slot; + +- tasklet_disable(&sc->bcon_tasklet); ++ tasklet_disable_in_atomic(&sc->bcon_tasklet); + + /* Find first taken slot. */ + for (slot = 0; slot < ATH_BCBUF; slot++) { +diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c +index 6db8d96a78eb..316e8b9b1338 100644 +--- a/drivers/pci/controller/pci-hyperv.c ++++ b/drivers/pci/controller/pci-hyperv.c +@@ -1458,7 +1458,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) + * Prevents hv_pci_onchannelcallback() from running concurrently + * in the tasklet. 
+ */ +- tasklet_disable(&channel->callback_event); ++ tasklet_disable_in_atomic(&channel->callback_event); + + /* + * Since this function is called with IRQ locks held, can't diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c -index 0f9274960dc6..dc97e4f1f4ad 100644 +index 03bf49adaafe..52e0a2486b43 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1452,11 +1452,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev, @@ -3576,10 +3735,10 @@ index 5ea426effa60..0d6b9acc7cf8 100644 list_for_each_entry_safe(fcf, next, &del_list, list) { /* Removes fcf from current list */ diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c -index 96a2952cf626..1cb773a4a02e 100644 +index 841000445b9a..26d661ddc950 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c -@@ -826,10 +826,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, +@@ -825,10 +825,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, } memset(ep, 0, sizeof(*ep)); @@ -3712,10 +3871,10 @@ index cae61d1ebec5..47dd23056271 100644 .device = uart_console_device, .setup = univ8250_console_setup, diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c -index 0d0c80905c58..f8116a8839d8 100644 +index fbcc90c31ca1..b33cb454ce03 100644 --- a/drivers/tty/serial/8250/8250_fsl.c +++ b/drivers/tty/serial/8250/8250_fsl.c -@@ -53,9 +53,18 @@ int fsl8250_handle_irq(struct uart_port *port) +@@ -60,9 +60,18 @@ int fsl8250_handle_irq(struct uart_port *port) /* Stop processing interrupts on input overrun */ if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { @@ -3735,7 +3894,7 @@ index 0d0c80905c58..f8116a8839d8 100644 port->ops->stop_rx(port); } else { diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c -index dde766fa465f..f4cceca82748 100644 +index 988bf6bcce42..bcd26d672539 100644 --- a/drivers/tty/serial/8250/8250_ingenic.c +++ b/drivers/tty/serial/8250/8250_ingenic.c @@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", @@ -3761,7 +3920,7 @@ index dde766fa465f..f4cceca82748 100644 if (ier & UART_IER_MSI) value |= UART_MCR_MDCE | UART_MCR_FCM; diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c -index 7b0dec14c8b8..d323ff7051f6 100644 +index f7d3023f860f..8133713dcf5e 100644 --- a/drivers/tty/serial/8250/8250_mtk.c +++ b/drivers/tty/serial/8250/8250_mtk.c @@ -213,12 +213,37 @@ static void mtk8250_shutdown(struct uart_port *port) @@ -3805,7 +3964,7 @@ index 7b0dec14c8b8..d323ff7051f6 100644 static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c -index c71d647eb87a..4325a15be573 100644 +index b0af13074cd3..b05f8c34b291 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -757,7 +757,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) @@ -3887,7 +4046,7 @@ index c71d647eb87a..4325a15be573 100644 spin_unlock_irqrestore(&port->lock, flags); synchronize_irq(port->irq); -@@ -2768,7 +2761,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, +@@ -2771,7 +2764,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; @@ -3896,7 +4055,7 @@ index c71d647eb87a..4325a15be573 100644 if (up->capabilities & UART_CAP_EFR) { 
unsigned char efr = 0; -@@ -3234,7 +3227,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); +@@ -3237,7 +3230,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); #ifdef CONFIG_SERIAL_8250_CONSOLE @@ -3905,7 +4064,7 @@ index c71d647eb87a..4325a15be573 100644 { struct uart_8250_port *up = up_to_u8250p(port); -@@ -3242,6 +3235,18 @@ static void serial8250_console_putchar(struct uart_port *port, int ch) +@@ -3245,6 +3238,18 @@ static void serial8250_console_putchar(struct uart_port *port, int ch) serial_port_out(port, UART_TX, ch); } @@ -3924,7 +4083,7 @@ index c71d647eb87a..4325a15be573 100644 /* * Restore serial console when h/w power-off detected */ -@@ -3263,6 +3268,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) +@@ -3266,6 +3271,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); } @@ -3957,7 +4116,7 @@ index c71d647eb87a..4325a15be573 100644 /* * Print a string to the serial port trying not to disturb * any possible real use of the port... -@@ -3279,24 +3310,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3282,24 +3313,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, struct uart_port *port = &up->port; unsigned long flags; unsigned int ier; @@ -3984,7 +4143,7 @@ index c71d647eb87a..4325a15be573 100644 /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { -@@ -3310,7 +3329,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3313,7 +3332,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, mdelay(port->rs485.delay_rts_before_send); } @@ -3994,7 +4153,7 @@ index c71d647eb87a..4325a15be573 100644 /* * Finally, wait for transmitter to become empty -@@ -3323,8 +3344,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3326,8 +3347,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, if (em485->tx_stopped) up->rs485_stop_tx(up); } @@ -4004,7 +4163,7 @@ index c71d647eb87a..4325a15be573 100644 /* * The receive handling will happen properly because the -@@ -3336,8 +3356,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3339,8 +3359,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, if (up->msr_saved_flags) serial8250_modem_status(up); @@ -4014,7 +4173,7 @@ index c71d647eb87a..4325a15be573 100644 } static unsigned int probe_baud(struct uart_port *port) -@@ -3357,6 +3376,7 @@ static unsigned int probe_baud(struct uart_port *port) +@@ -3360,6 +3379,7 @@ static unsigned int probe_baud(struct uart_port *port) int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { @@ -4022,7 +4181,7 @@ index c71d647eb87a..4325a15be573 100644 int baud = 9600; int bits = 8; int parity = 'n'; -@@ -3366,6 +3386,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) +@@ -3369,6 +3389,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) if (!port->iobase && !port->membase) return -ENODEV; @@ -4032,10 +4191,10 @@ index c71d647eb87a..4325a15be573 100644 uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c -index 67498594d7d7..e5cf42bbabf3 100644 +index c255476cce28..8131baf3362c 100644 --- a/drivers/tty/serial/amba-pl011.c +++ 
b/drivers/tty/serial/amba-pl011.c -@@ -2198,18 +2198,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) +@@ -2201,18 +2201,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; @@ -4064,7 +4223,7 @@ index 67498594d7d7..e5cf42bbabf3 100644 /* * First save the CR then disable the interrupts -@@ -2235,8 +2241,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) +@@ -2238,8 +2244,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) pl011_write(old_cr, uap, REG_CR); if (locked) @@ -4119,10 +4278,10 @@ index 04f75a44f243..60cbce1995a5 100644 _enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c -index 6df0922e7e30..5b2371b26bf8 100644 +index 80bf4c6f4c7b..a975cfed14f5 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c -@@ -81,7 +81,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, +@@ -82,7 +82,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct inode *inode; struct super_block *sb = parent->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -4132,10 +4291,10 @@ index 6df0922e7e30..5b2371b26bf8 100644 cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); diff --git a/fs/dcache.c b/fs/dcache.c -index ea0485861d93..26a187abf13a 100644 +index d429c984133c..28474a11c8f3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c -@@ -2503,9 +2503,10 @@ EXPORT_SYMBOL(d_rehash); +@@ -2511,9 +2511,10 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { @@ -4148,7 +4307,7 @@ index ea0485861d93..26a187abf13a 100644 return n; cpu_relax(); } -@@ -2513,26 +2514,30 @@ static inline unsigned start_dir_add(struct inode *dir) +@@ -2521,26 +2522,30 @@ static inline unsigned start_dir_add(struct inode *dir) static inline void end_dir_add(struct inode *dir, unsigned n) { @@ -4191,7 +4350,7 @@ index ea0485861d93..26a187abf13a 100644 { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); -@@ -2546,7 +2551,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, +@@ -2554,7 +2559,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, retry: rcu_read_lock(); @@ -4200,7 +4359,7 @@ index ea0485861d93..26a187abf13a 100644 r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { -@@ -2574,7 +2579,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, +@@ -2582,7 +2587,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, } hlist_bl_lock(b); @@ -4209,7 +4368,7 @@ index ea0485861d93..26a187abf13a 100644 hlist_bl_unlock(b); rcu_read_unlock(); goto retry; -@@ -2647,7 +2652,7 @@ void __d_lookup_done(struct dentry *dentry) +@@ -2655,7 +2660,7 @@ void __d_lookup_done(struct dentry *dentry) hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); @@ -4218,39 +4377,8 @@ index ea0485861d93..26a187abf13a 100644 dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); -diff --git a/fs/exec.c b/fs/exec.c -index a91003e28eaa..d4fb18baf1fb 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -1130,11 +1130,24 @@ static int exec_mmap(struct mm_struct *mm) - } - - task_lock(tsk); -- active_mm = tsk->active_mm; - membarrier_exec_mmap(mm); -- tsk->mm = mm; -+ -+ local_irq_disable(); -+ active_mm = tsk->active_mm; - tsk->active_mm = mm; -+ tsk->mm = mm; -+ /* -+ * This 
prevents preemption while active_mm is being loaded and -+ * it and mm are being updated, which could cause problems for -+ * lazy tlb mm refcounting when these are updated by context -+ * switches. Not all architectures can handle irqs off over -+ * activate_mm yet. -+ */ -+ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) -+ local_irq_enable(); - activate_mm(active_mm, mm); -+ if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) -+ local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; - vmacache_flush(tsk); - task_unlock(tsk); diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c -index 90e3f01bd796..ec8fd82117f4 100644 +index 3441ffa740f3..2fcae5cfd272 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -158,7 +158,7 @@ static int fuse_direntplus_link(struct file *file, @@ -4263,11 +4391,11 @@ index 90e3f01bd796..ec8fd82117f4 100644 if (!o->nodeid) { /* diff --git a/fs/inode.c b/fs/inode.c -index 72c4c347afb7..1b27d29265cf 100644 +index 497326faa124..c93acfaa84f8 100644 --- a/fs/inode.c +++ b/fs/inode.c -@@ -158,7 +158,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) - inode->i_bdev = NULL; +@@ -157,7 +157,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) + inode->i_pipe = NULL; inode->i_cdev = NULL; inode->i_link = NULL; - inode->i_dir_seq = 0; @@ -4275,214 +4403,8 @@ index 72c4c347afb7..1b27d29265cf 100644 inode->i_rdev = 0; inode->dirtied_when = 0; -diff --git a/fs/io-wq.c b/fs/io-wq.c -index 414beb543883..f1cf739d63c2 100644 ---- a/fs/io-wq.c -+++ b/fs/io-wq.c -@@ -87,7 +87,7 @@ enum { - */ - struct io_wqe { - struct { -- spinlock_t lock; -+ raw_spinlock_t lock; - struct io_wq_work_list work_list; - unsigned long hash_map; - unsigned flags; -@@ -148,7 +148,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) - - if (current->files != worker->restore_files) { - __acquire(&wqe->lock); -- spin_unlock_irq(&wqe->lock); -+ raw_spin_unlock_irq(&wqe->lock); - dropped_lock = true; - - task_lock(current); -@@ -166,7 +166,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) - if (worker->mm) { - if (!dropped_lock) { - __acquire(&wqe->lock); -- spin_unlock_irq(&wqe->lock); -+ raw_spin_unlock_irq(&wqe->lock); - dropped_lock = true; - } - __set_current_state(TASK_RUNNING); -@@ -220,17 +220,17 @@ static void io_worker_exit(struct io_worker *worker) - worker->flags = 0; - preempt_enable(); - -- spin_lock_irq(&wqe->lock); -+ raw_spin_lock_irq(&wqe->lock); - hlist_nulls_del_rcu(&worker->nulls_node); - list_del_rcu(&worker->all_list); - if (__io_worker_unuse(wqe, worker)) { - __release(&wqe->lock); -- spin_lock_irq(&wqe->lock); -+ raw_spin_lock_irq(&wqe->lock); - } - acct->nr_workers--; - nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + - wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; -- spin_unlock_irq(&wqe->lock); -+ raw_spin_unlock_irq(&wqe->lock); - - /* all workers gone, wq exit can proceed */ - if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) -@@ -504,7 +504,7 @@ static void io_worker_handle_work(struct io_worker *worker) - else if (!wq_list_empty(&wqe->work_list)) - wqe->flags |= IO_WQE_FLAG_STALLED; - -- spin_unlock_irq(&wqe->lock); -+ raw_spin_unlock_irq(&wqe->lock); - if (!work) - break; - io_assign_current_work(worker, work); -@@ -538,17 +538,17 @@ static void io_worker_handle_work(struct io_worker *worker) - io_wqe_enqueue(wqe, linked); - - if (hash != -1U && !next_hashed) { -- spin_lock_irq(&wqe->lock); -+ raw_spin_lock_irq(&wqe->lock); - wqe->hash_map &= ~BIT_ULL(hash); 
- wqe->flags &= ~IO_WQE_FLAG_STALLED; - /* skip unnecessary unlock-lock wqe->lock */ - if (!work) - goto get_next; -- spin_unlock_irq(&wqe->lock); -+ raw_spin_unlock_irq(&wqe->lock); - } - } while (work); - -- spin_lock_irq(&wqe->lock); -+ raw_spin_lock_irq(&wqe->lock); - } while (1); - } - -@@ -563,7 +563,7 @@ static int io_wqe_worker(void *data) - while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - set_current_state(TASK_INTERRUPTIBLE); - loop: -- spin_lock_irq(&wqe->lock); -+ raw_spin_lock_irq(&wqe->lock); - if (io_wqe_run_queue(wqe)) { - __set_current_state(TASK_RUNNING); - io_worker_handle_work(worker); -@@ -574,7 +574,7 @@ static int io_wqe_worker(void *data) - __release(&wqe->lock); - goto loop; - } -- spin_unlock_irq(&wqe->lock); -+ raw_spin_unlock_irq(&wqe->lock); - if (signal_pending(current)) - flush_signals(current); - if (schedule_timeout(WORKER_IDLE_TIMEOUT)) -@@ -586,11 +586,11 @@ static int io_wqe_worker(void *data) - } - - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { -- spin_lock_irq(&wqe->lock); -+ raw_spin_lock_irq(&wqe->lock); - if (!wq_list_empty(&wqe->work_list)) - io_worker_handle_work(worker); - else -- spin_unlock_irq(&wqe->lock); -+ raw_spin_unlock_irq(&wqe->lock); - } - - io_worker_exit(worker); -@@ -630,9 +630,9 @@ void io_wq_worker_sleeping(struct task_struct *tsk) - - worker->flags &= ~IO_WORKER_F_RUNNING; - -- spin_lock_irq(&wqe->lock); -+ raw_spin_lock_irq(&wqe->lock); - io_wqe_dec_running(wqe, worker); -- spin_unlock_irq(&wqe->lock); -+ raw_spin_unlock_irq(&wqe->lock); - } - - static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) -@@ -656,7 +656,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) - return false; - } - -- spin_lock_irq(&wqe->lock); -+ raw_spin_lock_irq(&wqe->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); - list_add_tail_rcu(&worker->all_list, &wqe->all_list); - worker->flags |= IO_WORKER_F_FREE; -@@ -665,7 +665,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) - if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) - worker->flags |= IO_WORKER_F_FIXED; - acct->nr_workers++; -- spin_unlock_irq(&wqe->lock); -+ raw_spin_unlock_irq(&wqe->lock); - - if (index == IO_WQ_ACCT_UNBOUND) - atomic_inc(&wq->user->processes); -@@ -720,12 +720,12 @@ static int io_wq_manager(void *data) - if (!node_online(node)) - continue; - -- spin_lock_irq(&wqe->lock); -+ raw_spin_lock_irq(&wqe->lock); - if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) - fork_worker[IO_WQ_ACCT_BOUND] = true; - if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) - fork_worker[IO_WQ_ACCT_UNBOUND] = true; -- spin_unlock_irq(&wqe->lock); -+ raw_spin_unlock_irq(&wqe->lock); - if (fork_worker[IO_WQ_ACCT_BOUND]) - create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); - if (fork_worker[IO_WQ_ACCT_UNBOUND]) -@@ -821,10 +821,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) - } - - work_flags = work->flags; -- spin_lock_irqsave(&wqe->lock, flags); -+ raw_spin_lock_irqsave(&wqe->lock, flags); - io_wqe_insert_work(wqe, work); - wqe->flags &= ~IO_WQE_FLAG_STALLED; -- spin_unlock_irqrestore(&wqe->lock, flags); -+ raw_spin_unlock_irqrestore(&wqe->lock, flags); - - if ((work_flags & IO_WQ_WORK_CONCURRENT) || - !atomic_read(&acct->nr_running)) -@@ -951,13 +951,13 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, - unsigned long flags; - - retry: -- spin_lock_irqsave(&wqe->lock, flags); -+ raw_spin_lock_irqsave(&wqe->lock, flags); - wq_list_for_each(node, prev, 
&wqe->work_list) { - work = container_of(node, struct io_wq_work, list); - if (!match->fn(work, match->data)) - continue; - io_wqe_remove_pending(wqe, work, prev); -- spin_unlock_irqrestore(&wqe->lock, flags); -+ raw_spin_unlock_irqrestore(&wqe->lock, flags); - io_run_cancel(work, wqe); - match->nr_pending++; - if (!match->cancel_all) -@@ -966,7 +966,7 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, - /* not safe to continue after unlock */ - goto retry; - } -- spin_unlock_irqrestore(&wqe->lock, flags); -+ raw_spin_unlock_irqrestore(&wqe->lock, flags); - } - - static void io_wqe_cancel_running_work(struct io_wqe *wqe, -@@ -1074,7 +1074,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) - } - atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); - wqe->wq = wq; -- spin_lock_init(&wqe->lock); -+ raw_spin_lock_init(&wqe->lock); - INIT_WQ_LIST(&wqe->work_list); - INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_LIST_HEAD(&wqe->all_list); diff --git a/fs/namei.c b/fs/namei.c -index e99e2a9da0f7..8b7cbca327de 100644 +index 78443a85480a..ad687cecc61c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1520,7 +1520,7 @@ static struct dentry *__lookup_slow(const struct qstr *name, @@ -4494,7 +4416,7 @@ index e99e2a9da0f7..8b7cbca327de 100644 /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) -@@ -3018,7 +3018,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, +@@ -3021,7 +3021,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; @@ -4504,14 +4426,14 @@ index e99e2a9da0f7..8b7cbca327de 100644 if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); diff --git a/fs/namespace.c b/fs/namespace.c -index bae0e95b3713..c7e53f6ba7df 100644 +index eed3453ec40a..c073fa5754fc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -14,6 +14,7 @@ #include <linux/mnt_namespace.h> #include <linux/user_namespace.h> #include <linux/namei.h> -+#include <linux/delay.h> ++#include <linux/hrtimer.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/idr.h> @@ -4530,10 +4452,10 @@ index bae0e95b3713..c7e53f6ba7df 100644 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. 
So we must not load that until diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c -index cb52db9a0cfb..b5703ca9492c 100644 +index ef827ae193d2..87484e36ac50 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c -@@ -484,7 +484,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, +@@ -635,7 +635,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); @@ -4542,7 +4464,7 @@ index cb52db9a0cfb..b5703ca9492c 100644 struct dentry *dentry; struct dentry *alias; struct inode *inode; -@@ -1665,7 +1665,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, +@@ -1859,7 +1859,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode) { @@ -4573,24 +4495,8 @@ index b27ebdccef70..f86c98a7ed04 100644 status = -EBUSY; spin_lock(&dentry->d_lock); -diff --git a/fs/proc/array.c b/fs/proc/array.c -index 65ec2029fa80..7052441be967 100644 ---- a/fs/proc/array.c -+++ b/fs/proc/array.c -@@ -382,9 +382,9 @@ static inline void task_context_switch_counts(struct seq_file *m, - static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) - { - seq_printf(m, "Cpus_allowed:\t%*pb\n", -- cpumask_pr_args(task->cpus_ptr)); -+ cpumask_pr_args(&task->cpus_mask)); - seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", -- cpumask_pr_args(task->cpus_ptr)); -+ cpumask_pr_args(&task->cpus_mask)); - } - - static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) diff --git a/fs/proc/base.c b/fs/proc/base.c -index 617db4e0faa0..9e4520113c02 100644 +index bda8e8ece720..c8698a7de321 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -96,6 +96,7 @@ @@ -4601,7 +4507,7 @@ index 617db4e0faa0..9e4520113c02 100644 #include <trace/events/oom.h> #include "internal.h" #include "fd.h" -@@ -2033,7 +2034,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, +@@ -2038,7 +2039,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, child = d_hash_and_lookup(dir, &qname); if (!child) { @@ -4611,10 +4517,10 @@ index 617db4e0faa0..9e4520113c02 100644 if (IS_ERR(child)) goto end_instantiate; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c -index 6c1166ccdaea..dab5f1749ef1 100644 +index d2018f70d1fa..59d715b66468 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c -@@ -685,7 +685,7 @@ static bool proc_sys_fill_cache(struct file *file, +@@ -683,7 +683,7 @@ static bool proc_sys_fill_cache(struct file *file, child = d_lookup(dir, &qname); if (!child) { @@ -4623,6 +4529,29 @@ index 6c1166ccdaea..dab5f1749ef1 100644 child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) return false; +diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c +index 32f64abc277c..7906d0a2fb8a 100644 +--- a/fs/pstore/platform.c ++++ b/fs/pstore/platform.c +@@ -383,7 +383,8 @@ void pstore_record_init(struct pstore_record *record, + * end of the buffer. + */ + static void pstore_dump(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + unsigned long total = 0; + const char *why; +@@ -435,7 +436,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, + dst_size -= header_size; + + /* Write dump contents. 
*/ +- if (!kmsg_dump_get_buffer(dumper, true, dst + header_size, ++ if (!kmsg_dump_get_buffer(iter, true, dst + header_size, + dst_size, &dump_size)) + break; + diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index d683f5e6d791..71c1535db56a 100644 --- a/include/asm-generic/preempt.h @@ -4637,6 +4566,19 @@ index d683f5e6d791..71c1535db56a 100644 extern asmlinkage void preempt_schedule(void); #define __preempt_schedule() preempt_schedule() extern asmlinkage void preempt_schedule_notrace(void); +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index f94ee3089e01..89a444c5a583 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -153,7 +153,7 @@ struct request { + */ + union { + struct hlist_node hash; /* merge hash */ +- struct list_head ipi_list; ++ struct llist_node ipi_list; + }; + + /* diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index a19519f4241d..eed86eb0a1de 100644 --- a/include/linux/bottom_half.h @@ -4662,10 +4604,18 @@ index a19519f4241d..eed86eb0a1de 100644 + #endif /* _LINUX_BH_H */ diff --git a/include/linux/console.h b/include/linux/console.h -index 0670d3491e0e..00d7437a92e1 100644 +index 20874db50bc8..69bfff368294 100644 --- a/include/linux/console.h +++ b/include/linux/console.h -@@ -137,10 +137,12 @@ static inline int con_debug_leave(void) +@@ -16,6 +16,7 @@ + + #include <linux/atomic.h> + #include <linux/types.h> ++#include <linux/printk.h> + + struct vc_data; + struct console_font_op; +@@ -136,10 +137,12 @@ static inline int con_debug_leave(void) #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ #define CON_BRL (32) /* Used for a braille device */ #define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ @@ -4674,20 +4624,23 @@ index 0670d3491e0e..00d7437a92e1 100644 struct console { char name[16]; void (*write)(struct console *, const char *, unsigned); -+ void (*write_atomic)(struct console *, const char *, unsigned); ++ void (*write_atomic)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *, char *, unsigned); struct tty_driver *(*device)(struct console *, int *); void (*unblank)(void); -@@ -150,6 +152,8 @@ struct console { +@@ -149,6 +152,11 @@ struct console { short flags; short index; int cflag; ++#ifdef CONFIG_PRINTK ++ char sync_buf[CONSOLE_LOG_MAX]; ++#endif + atomic64_t printk_seq; + struct task_struct *thread; void *data; struct console *next; }; -@@ -230,4 +234,7 @@ extern void console_init(void); +@@ -229,4 +237,7 @@ extern void console_init(void); void dummycon_register_output_notifier(struct notifier_block *nb); void dummycon_unregister_output_notifier(struct notifier_block *nb); @@ -4695,61 +4648,11 @@ index 0670d3491e0e..00d7437a92e1 100644 +extern void console_atomic_unlock(unsigned int flags); + #endif /* _LINUX_CONSOLE_H */ -diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h -index bf9181cef444..26b29e78dbc6 100644 ---- a/include/linux/cpuhotplug.h -+++ b/include/linux/cpuhotplug.h -@@ -151,6 +151,7 @@ enum cpuhp_state { - CPUHP_AP_ONLINE, - CPUHP_TEARDOWN_CPU, - CPUHP_AP_ONLINE_IDLE, -+ CPUHP_AP_SCHED_WAIT_EMPTY, - CPUHP_AP_SMPBOOT_THREADS, - CPUHP_AP_X86_VDSO_VMA_ONLINE, - CPUHP_AP_IRQ_AFFINITY_ONLINE, -diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h -index f0d895d6ac39..383684e30f12 100644 ---- a/include/linux/cpumask.h -+++ b/include/linux/cpumask.h -@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p, - return 
cpumask_next_and(-1, src1p, src2p); - } - -+static inline int cpumask_any_distribute(const struct cpumask *srcp) -+{ -+ return cpumask_first(srcp); -+} -+ - #define for_each_cpu(cpu, mask) \ - for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) - #define for_each_cpu_not(cpu, mask) \ -@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); - unsigned int cpumask_local_spread(unsigned int i, int node); - int cpumask_any_and_distribute(const struct cpumask *src1p, - const struct cpumask *src2p); -+int cpumask_any_distribute(const struct cpumask *srcp); - - /** - * for_each_cpu - iterate over every cpu in a mask -diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h -index 6594dbc34a37..206bde8308b2 100644 ---- a/include/linux/crash_core.h -+++ b/include/linux/crash_core.h -@@ -55,6 +55,9 @@ phys_addr_t paddr_vmcoreinfo_note(void); - #define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -+#define VMCOREINFO_TYPE_OFFSET(name, field) \ -+ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ -+ (unsigned long)offsetof(name, field)) - #define VMCOREINFO_LENGTH(name, value) \ - vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) - #define VMCOREINFO_NUMBER(name) \ diff --git a/include/linux/dcache.h b/include/linux/dcache.h -index 65d975bf9390..1e23dd02ac4e 100644 +index d7b369fc15d3..d7d8d9a69ecf 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h -@@ -106,7 +106,7 @@ struct dentry { +@@ -107,7 +107,7 @@ struct dentry { union { struct list_head d_lru; /* LRU list */ @@ -4758,7 +4661,7 @@ index 65d975bf9390..1e23dd02ac4e 100644 }; struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ -@@ -238,7 +238,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op +@@ -239,7 +239,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, @@ -4768,73 +4671,38 @@ index 65d975bf9390..1e23dd02ac4e 100644 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern struct dentry * d_exact_alias(struct dentry *, struct inode *); diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h -index e7e45f0cc7da..5a9e3e3769ce 100644 +index 2915f56ad421..5a9e3e3769ce 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h -@@ -2,9 +2,8 @@ - #ifndef __LINUX_DEBUG_LOCKING_H +@@ -3,8 +3,7 @@ #define __LINUX_DEBUG_LOCKING_H --#include <linux/kernel.h> #include <linux/atomic.h> -#include <linux/bug.h> +-#include <linux/printk.h> +#include <linux/cache.h> struct task_struct; -diff --git a/include/linux/delay.h b/include/linux/delay.h -index 1d0e2ce6b6d9..02b37178b54f 100644 ---- a/include/linux/delay.h -+++ b/include/linux/delay.h -@@ -76,4 +76,10 @@ static inline void fsleep(unsigned long usecs) - msleep(DIV_ROUND_UP(usecs, 1000)); - } - -+#ifdef CONFIG_PREEMPT_RT -+extern void cpu_chill(void); -+#else -+# define cpu_chill() cpu_relax() -+#endif -+ - #endif /* defined(_LINUX_DELAY_H) */ -diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h -index 3028b644b4fb..6f009559ee54 100644 ---- a/include/linux/dev_printk.h -+++ b/include/linux/dev_printk.h -@@ -21,6 +21,14 @@ - - 
struct device; - -+#define PRINTK_INFO_SUBSYSTEM_LEN 16 -+#define PRINTK_INFO_DEVICE_LEN 48 -+ -+struct dev_printk_info { -+ char subsystem[PRINTK_INFO_SUBSYSTEM_LEN]; -+ char device[PRINTK_INFO_DEVICE_LEN]; -+}; -+ - #ifdef CONFIG_PRINTK - - __printf(3, 0) __cold diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h -index 159c7476b11b..7f5b22e7c354 100644 +index a104b298019a..1176ed623406 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h -@@ -69,7 +69,7 @@ +@@ -58,7 +58,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ -- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \ -+ _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | \ +- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ++ _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ARCH_EXIT_TO_USER_MODE_WORK) /** diff --git a/include/linux/fs.h b/include/linux/fs.h -index 7519ae003a08..946b42e7c72d 100644 +index 91f3fbe5b57f..6b497344616b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h -@@ -704,7 +704,7 @@ struct inode { - struct block_device *i_bdev; +@@ -698,7 +698,7 @@ struct inode { + struct pipe_inode_info *i_pipe; struct cdev *i_cdev; char *i_link; - unsigned i_dir_seq; @@ -4843,17 +4711,17 @@ index 7519ae003a08..946b42e7c72d 100644 __u32 i_generation; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h -index 754f67ac4326..41408d5e4014 100644 +index 7c9d6a2d7e90..76878b357ffa 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h -@@ -8,6 +8,7 @@ +@@ -6,6 +6,7 @@ + #include <linux/preempt.h> + #include <linux/lockdep.h> #include <linux/ftrace_irq.h> ++#include <linux/sched.h> #include <linux/vtime.h> #include <asm/hardirq.h> -+#include <linux/sched.h> - extern void synchronize_irq(unsigned int irq); - extern bool synchronize_hardirq(unsigned int irq); @@ -115,7 +116,6 @@ extern void rcu_nmi_exit(void); do { \ lockdep_off(); \ @@ -4870,106 +4738,92 @@ index 754f67ac4326..41408d5e4014 100644 arch_nmi_exit(); \ lockdep_on(); \ } while (0) -diff --git a/include/linux/highmem.h b/include/linux/highmem.h -index 14e6202ce47f..aa41a6b3d482 100644 ---- a/include/linux/highmem.h -+++ b/include/linux/highmem.h -@@ -8,6 +8,7 @@ - #include <linux/mm.h> - #include <linux/uaccess.h> - #include <linux/hardirq.h> -+#include <linux/sched.h> - - #include <asm/cacheflush.h> +diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h +index 1bbe96dc8be6..d8b7b42b13d4 100644 +--- a/include/linux/highmem-internal.h ++++ b/include/linux/highmem-internal.h +@@ -90,7 +90,11 @@ static inline void __kunmap_local(void *vaddr) -@@ -83,7 +84,7 @@ static inline void kunmap(struct page *page) - */ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - preempt_disable(); -+ migrate_disable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_disable(); ++ else ++ preempt_disable(); ++ pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); -@@ -153,7 +154,7 @@ static inline void kunmap(struct page *page) + return __kmap_local_page_prot(page, prot); + } +@@ -102,7 +106,11 @@ static inline void *kmap_atomic(struct page *page) - static inline void *kmap_atomic(struct page *page) + static inline void *kmap_atomic_pfn(unsigned long pfn) { - preempt_disable(); -+ migrate_disable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_disable(); ++ else ++ preempt_disable(); ++ pagefault_disable(); - return page_address(page); + return __kmap_local_pfn_prot(pfn, kmap_prot); } 
-@@ -178,32 +179,51 @@ static inline void kunmap_atomic_high(void *addr) - - #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) - -+#ifndef CONFIG_PREEMPT_RT - DECLARE_PER_CPU(int, __kmap_atomic_idx); -+#endif - - static inline int kmap_atomic_idx_push(void) +@@ -111,7 +119,10 @@ static inline void __kunmap_atomic(void *addr) { -+#ifndef CONFIG_PREEMPT_RT - int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; - --#ifdef CONFIG_DEBUG_HIGHMEM -+# ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(in_irq() && !irqs_disabled()); - BUG_ON(idx >= KM_TYPE_NR); --#endif -+# endif - return idx; -+#else -+ current->kmap_idx++; -+ BUG_ON(current->kmap_idx > KM_TYPE_NR); -+ return current->kmap_idx - 1; -+#endif + kunmap_local_indexed(addr); + pagefault_enable(); +- preempt_enable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_enable(); ++ else ++ preempt_enable(); } - static inline int kmap_atomic_idx(void) - { -+#ifndef CONFIG_PREEMPT_RT - return __this_cpu_read(__kmap_atomic_idx) - 1; -+#else -+ return current->kmap_idx - 1; -+#endif - } + unsigned int __nr_free_highpages(void); +@@ -184,7 +195,10 @@ static inline void __kunmap_local(void *addr) - static inline void kmap_atomic_idx_pop(void) + static inline void *kmap_atomic(struct page *page) { --#ifdef CONFIG_DEBUG_HIGHMEM -+#ifndef CONFIG_PREEMPT_RT -+# ifdef CONFIG_DEBUG_HIGHMEM - int idx = __this_cpu_dec_return(__kmap_atomic_idx); - - BUG_ON(idx < 0); --#else -+# else - __this_cpu_dec(__kmap_atomic_idx); -+# endif -+#else -+ current->kmap_idx--; -+# ifdef CONFIG_DEBUG_HIGHMEM -+ BUG_ON(current->kmap_idx < 0); -+# endif +- preempt_disable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_disable(); ++ else ++ preempt_disable(); + pagefault_disable(); + return page_address(page); + } +@@ -205,7 +219,10 @@ static inline void __kunmap_atomic(void *addr) + kunmap_flush_on_unmap(addr); #endif + pagefault_enable(); +- preempt_enable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_enable(); ++ else ++ preempt_enable(); } -@@ -218,7 +238,7 @@ do { \ - BUILD_BUG_ON(__same_type((addr), struct page *)); \ - kunmap_atomic_high(addr); \ - pagefault_enable(); \ -- preempt_enable(); \ -+ migrate_enable(); \ - } while (0) - + static inline unsigned int nr_free_highpages(void) { return 0; } +diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h +index bb5e7b0a4274..e425a26a5ed8 100644 +--- a/include/linux/hrtimer.h ++++ b/include/linux/hrtimer.h +@@ -540,4 +540,10 @@ int hrtimers_dead_cpu(unsigned int cpu); + #define hrtimers_dead_cpu NULL + #endif ++#ifdef CONFIG_PREEMPT_RT ++extern void cpu_chill(void); ++#else ++# define cpu_chill() cpu_relax() ++#endif ++ + #endif diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h -index f9aee3538461..a490d9f801aa 100644 +index bb8ff9083e7d..ed6e49bceff1 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h -@@ -560,7 +560,7 @@ struct softirq_action +@@ -569,7 +569,7 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); @@ -4978,7 +4832,7 @@ index f9aee3538461..a490d9f801aa 100644 void do_softirq_own_stack(void); #else static inline void do_softirq_own_stack(void) -@@ -654,7 +654,7 @@ enum +@@ -663,26 +663,20 @@ enum TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ }; @@ -4987,16 +4841,21 @@ index f9aee3538461..a490d9f801aa 100644 static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); -@@ -665,15 +665,11 @@ static inline void tasklet_unlock(struct tasklet_struct *t) 
- smp_mb__before_atomic(); - clear_bit(TASKLET_STATE_RUN, &(t)->state); } + +-static inline void tasklet_unlock(struct tasklet_struct *t) +-{ +- smp_mb__before_atomic(); +- clear_bit(TASKLET_STATE_RUN, &(t)->state); +-} - -static inline void tasklet_unlock_wait(struct tasklet_struct *t) -{ - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } -} ++void tasklet_unlock(struct tasklet_struct *t); +void tasklet_unlock_wait(struct tasklet_struct *t); ++void tasklet_unlock_spin_wait(struct tasklet_struct *t); #else -#define tasklet_trylock(t) 1 -#define tasklet_unlock_wait(t) do { } while (0) @@ -5004,14 +4863,33 @@ index f9aee3538461..a490d9f801aa 100644 +static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } +static inline void tasklet_unlock(struct tasklet_struct *t) { } +static inline void tasklet_unlock_wait(struct tasklet_struct *t) { } ++static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { } #endif extern void __tasklet_schedule(struct tasklet_struct *t); +@@ -707,6 +701,17 @@ static inline void tasklet_disable_nosync(struct tasklet_struct *t) + smp_mb__after_atomic(); + } + ++/* ++ * Do not use in new code. There is no real reason to invoke this from ++ * atomic contexts. ++ */ ++static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) ++{ ++ tasklet_disable_nosync(t); ++ tasklet_unlock_spin_wait(t); ++ smp_mb(); ++} ++ + static inline void tasklet_disable(struct tasklet_struct *t) + { + tasklet_disable_nosync(t); diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h -index 30823780c192..f941f2d7d71c 100644 +index ec2a47a81e42..dbbef9089789 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h -@@ -55,4 +55,10 @@ static inline void irq_work_run(void) { } +@@ -64,4 +64,10 @@ static inline void irq_work_run(void) { } static inline void irq_work_single(void *arg) { } #endif @@ -5023,7 +4901,7 @@ index 30823780c192..f941f2d7d71c 100644 + #endif /* _LINUX_IRQ_WORK_H */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h -index 5745491303e0..2b9caf39fb07 100644 +index 891b323266df..45d8bda8fd53 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -68,6 +68,7 @@ struct irq_desc { @@ -5035,7 +4913,7 @@ index 5745491303e0..2b9caf39fb07 100644 struct cpumask *percpu_enabled; const struct cpumask *percpu_affinity; diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h -index 3ed4e8771b64..a437b2e70d37 100644 +index 8de0e1373de7..24dd1f92254c 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -71,14 +71,6 @@ do { \ @@ -5075,11 +4953,23 @@ index 3ed4e8771b64..a437b2e70d37 100644 #if defined(CONFIG_IRQSOFF_TRACER) || \ defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); +diff --git a/include/linux/kcov.h b/include/linux/kcov.h +index 4e3037dc1204..55dc338f6bcd 100644 +--- a/include/linux/kcov.h ++++ b/include/linux/kcov.h +@@ -2,6 +2,7 @@ + #ifndef _LINUX_KCOV_H + #define _LINUX_KCOV_H + ++#include <linux/sched.h> + #include <uapi/linux/kcov.h> + + struct task_struct; diff --git a/include/linux/kernel.h b/include/linux/kernel.h -index c25b8e41c0ea..b3b514a1f12f 100644 +index f7902d8c1048..df23f03b3eb9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h -@@ -218,6 +218,10 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); +@@ -107,6 +107,10 @@ extern void __cant_migrate(const char *file, int line); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) 
@@ -5090,16 +4980,113 @@ index c25b8e41c0ea..b3b514a1f12f 100644 /** * cant_sleep - annotation for functions that cannot sleep * -@@ -249,6 +253,7 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); +@@ -150,6 +154,7 @@ extern void __cant_migrate(const char *file, int line); static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) +# define might_sleep_no_state_check() do { might_resched(); } while (0) # define cant_sleep() do { } while (0) + # define cant_migrate() do { } while (0) # define sched_annotate_sleep() do { } while (0) - # define non_block_start() do { } while (0) +diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h +index 3378bcbe585e..86673930c8ea 100644 +--- a/include/linux/kmsg_dump.h ++++ b/include/linux/kmsg_dump.h +@@ -29,6 +29,18 @@ enum kmsg_dump_reason { + KMSG_DUMP_MAX + }; + ++/** ++ * struct kmsg_dumper_iter - iterator for kernel crash message dumper ++ * @active: Flag that specifies if this is currently dumping ++ * @cur_seq: Points to the oldest message to dump (private) ++ * @next_seq: Points after the newest message to dump (private) ++ */ ++struct kmsg_dumper_iter { ++ bool active; ++ u64 cur_seq; ++ u64 next_seq; ++}; ++ + /** + * struct kmsg_dumper - kernel crash message dumper structure + * @list: Entry in the dumper list (private) +@@ -39,33 +51,22 @@ enum kmsg_dump_reason { + */ + struct kmsg_dumper { + struct list_head list; +- void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); ++ void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter); + enum kmsg_dump_reason max_reason; +- bool active; + bool registered; +- +- /* private state of the kmsg iterator */ +- u32 cur_idx; +- u32 next_idx; +- u64 cur_seq; +- u64 next_seq; + }; + + #ifdef CONFIG_PRINTK + void kmsg_dump(enum kmsg_dump_reason reason); + +-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, +- char *line, size_t size, size_t *len); +- +-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, ++bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + char *line, size_t size, size_t *len); + +-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, +- char *buf, size_t size, size_t *len); +- +-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper); ++bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, ++ char *buf, size_t size, size_t *len_out); + +-void kmsg_dump_rewind(struct kmsg_dumper *dumper); ++void kmsg_dump_rewind(struct kmsg_dumper_iter *iter); + + int kmsg_dump_register(struct kmsg_dumper *dumper); + +@@ -77,30 +78,19 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason) + { + } + +-static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, +- bool syslog, const char *line, +- size_t size, size_t *len) +-{ +- return false; +-} +- +-static inline bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, ++static inline bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + const char *line, size_t size, size_t *len) + { + return false; + } + +-static inline bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, ++static inline bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + char *buf, size_t size, size_t *len) + { + return false; + } + +-static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) +-{ +-} +- +-static inline void 
kmsg_dump_rewind(struct kmsg_dumper *dumper) ++static inline void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) + { + } + diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h -index 4a8795b21d77..998d2c34cf0c 100644 +index 4a8795b21d77..271f911f2803 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -7,33 +7,90 @@ @@ -5203,8 +5190,12 @@ index 4a8795b21d77..998d2c34cf0c 100644 static inline void local_lock_acquire(local_lock_t *l) { lock_map_acquire(&l->dep_map); -@@ -55,26 +112,55 @@ static inline void local_lock_release(local_lock_t *l) { } +@@ -53,21 +110,50 @@ static inline void local_lock_acquire(local_lock_t *l) { } + static inline void local_lock_release(local_lock_t *l) { } + #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ ++#ifdef CONFIG_PREEMPT_RT ++ #define __local_lock(lock) \ do { \ - preempt_disable(); \ @@ -5217,8 +5208,6 @@ index 4a8795b21d77..998d2c34cf0c 100644 + local_lock_release(this_cpu_ptr(lock)); \ + migrate_enable(); \ + } while (0) -+ -+#ifdef CONFIG_PREEMPT_RT + #define __local_lock_irq(lock) \ do { \ @@ -5232,10 +5221,9 @@ index 4a8795b21d77..998d2c34cf0c 100644 - local_irq_save(flags); \ + migrate_disable(); \ + flags = 0; \ - local_lock_acquire(this_cpu_ptr(lock)); \ - } while (0) - --#define __local_unlock(lock) \ ++ local_lock_acquire(this_cpu_ptr(lock)); \ ++ } while (0) ++ +#define __local_unlock_irq(lock) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ @@ -5243,14 +5231,23 @@ index 4a8795b21d77..998d2c34cf0c 100644 + } while (0) + +#define __local_unlock_irqrestore(lock, flags) \ - do { \ - local_lock_release(this_cpu_ptr(lock)); \ -- preempt_enable(); \ ++ do { \ ++ local_lock_release(this_cpu_ptr(lock)); \ + migrate_enable(); \ + } while (0) + +#else + ++#define __local_lock(lock) \ ++ do { \ ++ preempt_disable(); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +@@ -77,6 +163,18 @@ static inline void local_lock_release(local_lock_t *l) { } + preempt_enable(); \ + } while (0) + +#define __local_lock_irq(lock) \ + do { \ + local_irq_disable(); \ @@ -5261,33 +5258,19 @@ index 4a8795b21d77..998d2c34cf0c 100644 + do { \ + local_irq_save(flags); \ + local_lock_acquire(this_cpu_ptr(lock)); \ - } while (0) - ++ } while (0) ++ #define __local_unlock_irq(lock) \ -@@ -88,3 +174,5 @@ static inline void local_lock_release(local_lock_t *l) { } + do { \ + local_lock_release(this_cpu_ptr(lock)); \ +@@ -88,3 +186,5 @@ static inline void local_lock_release(local_lock_t *l) { } local_lock_release(this_cpu_ptr(lock)); \ local_irq_restore(flags); \ } while (0) + +#endif -diff --git a/include/linux/mhi.h b/include/linux/mhi.h -index c4a940d98912..2b4ed30d4729 100644 ---- a/include/linux/mhi.h -+++ b/include/linux/mhi.h -@@ -9,10 +9,9 @@ - #include <linux/device.h> - #include <linux/dma-direction.h> - #include <linux/mutex.h> --#include <linux/rwlock_types.h> - #include <linux/skbuff.h> - #include <linux/slab.h> --#include <linux/spinlock_types.h> -+#include <linux/spinlock.h> - #include <linux/wait.h> - #include <linux/workqueue.h> - diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index ed028af3cb19..cbdc39fea2ff 100644 +index 2f395ab624f3..8492582d486c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ @@ -5297,8 +5280,8 @@ index ed028af3cb19..cbdc39fea2ff 100644 +#include <linux/rcupdate.h> #include <linux/page-flags-layout.h> #include <linux/workqueue.h> - -@@ -548,6 +549,9 @@ struct mm_struct { + #include <linux/seqlock.h> +@@ 
-555,6 +556,9 @@ struct mm_struct { bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; @@ -5374,10 +5357,10 @@ index dcd185cbfe79..90f090efcb58 100644 #endif /* __LINUX_MUTEX_H */ diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h new file mode 100644 -index 000000000000..7179367bfb5e +index 000000000000..f0b2e07cd5c5 --- /dev/null +++ b/include/linux/mutex_rt.h -@@ -0,0 +1,131 @@ +@@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __LINUX_MUTEX_RT_H +#define __LINUX_MUTEX_RT_H @@ -5409,7 +5392,6 @@ index 000000000000..7179367bfb5e + +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key); +extern void __lockfunc _mutex_lock(struct mutex *lock); -+extern void __lockfunc _mutex_lock_io(struct mutex *lock); +extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); +extern int __lockfunc _mutex_lock_killable(struct mutex *lock); @@ -5426,7 +5408,7 @@ index 000000000000..7179367bfb5e +#define mutex_lock_killable(l) _mutex_lock_killable(l) +#define mutex_trylock(l) _mutex_trylock(l) +#define mutex_unlock(l) _mutex_unlock(l) -+#define mutex_lock_io(l) _mutex_lock_io(l); ++#define mutex_lock_io(l) _mutex_lock_io_nested(l, 0); + +#define __mutex_owner(l) ((l)->lock.owner) + @@ -5457,7 +5439,7 @@ index 000000000000..7179367bfb5e +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable(l) +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) -+# define mutex_lock_io_nested(l, s) _mutex_lock_io(l) ++# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s) +#endif + +# define mutex_init(mutex) \ @@ -5510,10 +5492,10 @@ index 000000000000..7179367bfb5e + +#endif diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h -index 69cb46f7b8d2..fc682adab66a 100644 +index 3327239fa2f9..f596a16a5f7b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h -@@ -1670,7 +1670,7 @@ struct nfs_unlinkdata { +@@ -1684,7 +1684,7 @@ struct nfs_unlinkdata { struct nfs_removeargs args; struct nfs_removeres res; struct dentry *dentry; @@ -5522,8 +5504,39 @@ index 69cb46f7b8d2..fc682adab66a 100644 const struct cred *cred; struct nfs_fattr dir_attr; long timeout; +diff --git a/include/linux/notifier.h b/include/linux/notifier.h +index 2fb373a5c1ed..723bc2df6388 100644 +--- a/include/linux/notifier.h ++++ b/include/linux/notifier.h +@@ -58,7 +58,7 @@ struct notifier_block { + }; + + struct atomic_notifier_head { +- spinlock_t lock; ++ raw_spinlock_t lock; + struct notifier_block __rcu *head; + }; + +@@ -78,7 +78,7 @@ struct srcu_notifier_head { + }; + + #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ +- spin_lock_init(&(name)->lock); \ ++ raw_spin_lock_init(&(name)->lock); \ + (name)->head = NULL; \ + } while (0) + #define BLOCKING_INIT_NOTIFIER_HEAD(name) do { \ +@@ -95,7 +95,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); + cleanup_srcu_struct(&(name)->srcu); + + #define ATOMIC_NOTIFIER_INIT(name) { \ +- .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ ++ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .head = NULL } + #define BLOCKING_NOTIFIER_INIT(name) { \ + .rwsem = __RWSEM_INITIALIZER((name).rwsem), \ diff --git a/include/linux/pid.h b/include/linux/pid.h -index 176d6cf80e7c..4daecc34c097 100644 +index fa10acb8d6a4..2f86f84e9fc1 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -3,6 +3,7 @@ @@ -5535,46 +5548,23 @@ index 176d6cf80e7c..4daecc34c097 100644 #include 
<linux/refcount.h> diff --git a/include/linux/preempt.h b/include/linux/preempt.h -index 7d9c1c0e149c..8a47b9b1bade 100644 +index 69cc8b64aa3a..af39859f02ee 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h -@@ -77,10 +77,14 @@ - /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ - #include <asm/preempt.h> +@@ -79,7 +79,11 @@ --#define hardirq_count() (preempt_count() & HARDIRQ_MASK) + #define nmi_count() (preempt_count() & NMI_MASK) + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) -#define softirq_count() (preempt_count() & SOFTIRQ_MASK) --#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ -- | NMI_MASK)) -+#define pc_nmi_count() (preempt_count() & NMI_MASK) -+#define hardirq_count() (preempt_count() & HARDIRQ_MASK) +#ifdef CONFIG_PREEMPT_RT +# define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) +#else +# define softirq_count() (preempt_count() & SOFTIRQ_MASK) +#endif -+#define irq_count() (pc_nmi_count() | hardirq_count() | softirq_count()) - - /* - * Are we doing bottom half or hardware interrupt processing? -@@ -95,13 +99,12 @@ - * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really - * should not be used in new code. - */ -+#define in_nmi() (pc_nmi_count()) - #define in_irq() (hardirq_count()) --#define in_softirq() (softirq_count()) - #define in_interrupt() (irq_count()) -+#define in_softirq() (softirq_count()) - #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) --#define in_nmi() (preempt_count() & NMI_MASK) --#define in_task() (!(preempt_count() & \ -- (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) -+#define in_task() (!(irq_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + #define irq_count() (nmi_count() | hardirq_count() | softirq_count()) /* - * The preempt_count offset after preempt_disable(); -@@ -115,7 +118,11 @@ +@@ -117,7 +121,11 @@ /* * The preempt_count offset after spin_lock() */ @@ -5586,7 +5576,7 @@ index 7d9c1c0e149c..8a47b9b1bade 100644 /* * The preempt_count offset needed for things like: -@@ -164,6 +171,20 @@ extern void preempt_count_sub(int val); +@@ -166,6 +174,20 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) @@ -5607,7 +5597,7 @@ index 7d9c1c0e149c..8a47b9b1bade 100644 #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ -@@ -172,13 +193,25 @@ do { \ +@@ -174,13 +196,25 @@ do { \ barrier(); \ } while (0) @@ -5634,7 +5624,7 @@ index 7d9c1c0e149c..8a47b9b1bade 100644 #define preemptible() (preempt_count() == 0 && !irqs_disabled()) -@@ -203,6 +236,18 @@ do { \ +@@ -205,6 +239,18 @@ do { \ __preempt_schedule(); \ } while (0) @@ -5653,7 +5643,7 @@ index 7d9c1c0e149c..8a47b9b1bade 100644 #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ -@@ -210,6 +255,12 @@ do { \ +@@ -212,6 +258,12 @@ do { \ preempt_count_dec(); \ } while (0) @@ -5666,15 +5656,20 @@ index 7d9c1c0e149c..8a47b9b1bade 100644 #define preempt_enable_notrace() \ do { \ barrier(); \ -@@ -248,6 +299,7 @@ do { \ +@@ -250,8 +302,12 @@ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() +#define preempt_check_resched_rt() barrier() #define preemptible() 0 ++#define preempt_lazy_disable() barrier() ++#define preempt_lazy_enable() barrier() ++ #endif /* CONFIG_PREEMPT_COUNT */ -@@ -268,10 +320,22 @@ do { \ + + #ifdef MODULE +@@ -270,10 +326,22 @@ do { \ } while (0) #define 
preempt_fold_need_resched() \ do { \ @@ -5698,72 +5693,12 @@ index 7d9c1c0e149c..8a47b9b1bade 100644 #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; -@@ -322,6 +386,80 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, +@@ -386,8 +454,15 @@ extern void migrate_enable(void); - #endif + #else -+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) -+ -+/* -+ * Migrate-Disable and why it is undesired. -+ * -+ * When a preempted task becomes elegible to run under the ideal model (IOW it -+ * becomes one of the M highest priority tasks), it might still have to wait -+ * for the preemptee's migrate_disable() section to complete. Thereby suffering -+ * a reduction in bandwidth in the exact duration of the migrate_disable() -+ * section. -+ * -+ * Per this argument, the change from preempt_disable() to migrate_disable() -+ * gets us: -+ * -+ * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() -+ * it would have had to wait for the lower priority task. -+ * -+ * - a lower priority tasks; which under preempt_disable() could've instantly -+ * migrated away when another CPU becomes available, is now constrained -+ * by the ability to push the higher priority task away, which might itself be -+ * in a migrate_disable() section, reducing it's available bandwidth. -+ * -+ * IOW it trades latency / moves the interference term, but it stays in the -+ * system, and as long as it remains unbounded, the system is not fully -+ * deterministic. -+ * -+ * -+ * The reason we have it anyway. -+ * -+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a -+ * number of primitives into becoming preemptible, they would also allow -+ * migration. This turns out to break a bunch of per-cpu usage. To this end, -+ * all these primitives employ migirate_disable() to restore this implicit -+ * assumption. -+ * -+ * This is a 'temporary' work-around at best. The correct solution is getting -+ * rid of the above assumptions and reworking the code to employ explicit -+ * per-cpu locking or short preempt-disable regions. -+ * -+ * The end goal must be to get rid of migrate_disable(), alternatively we need -+ * a schedulability theory that does not depend on abritrary migration. -+ * -+ * -+ * Notes on the implementation. -+ * -+ * The implementation is particularly tricky since existing code patterns -+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block. -+ * This means that it cannot use cpus_read_lock() to serialize against hotplug, -+ * nor can it easily migrate itself into a pending affinity mask change on -+ * migrate_enable(). -+ * -+ * -+ * Note: even non-work-conserving schedulers like semi-partitioned depends on -+ * migration, so migrate_disable() is not only a problem for -+ * work-conserving schedulers. 
-+ * -+ */ -+extern void migrate_disable(void); -+extern void migrate_enable(void); -+ -+#elif defined(CONFIG_PREEMPT_RT) -+ +-static inline void migrate_disable(void) { } +-static inline void migrate_enable(void) { } +static inline void migrate_disable(void) +{ + preempt_lazy_disable(); @@ -5773,29 +5708,32 @@ index 7d9c1c0e149c..8a47b9b1bade 100644 +{ + preempt_lazy_enable(); +} -+ -+#else /* !CONFIG_PREEMPT_RT */ -+ - /** - * migrate_disable - Prevent migration of the current task - * -@@ -352,4 +490,6 @@ static __always_inline void migrate_enable(void) - preempt_enable(); - } -+#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */ -+ - #endif /* __LINUX_PREEMPT_H */ + #endif /* CONFIG_SMP */ + diff --git a/include/linux/printk.h b/include/linux/printk.h -index 34c1a7be3e01..c49d5bb3f8ff 100644 +index fe7eb2351610..7e4352467d83 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h -@@ -147,22 +147,12 @@ static inline __printf(1, 2) __cold - void early_printk(const char *s, ...) { } - #endif +@@ -46,6 +46,12 @@ static inline const char *printk_skip_headers(const char *buffer) --#ifdef CONFIG_PRINTK_NMI --extern void printk_nmi_enter(void); + #define CONSOLE_EXT_LOG_MAX 8192 + ++/* ++ * The maximum size of a record formatted for console printing ++ * (i.e. with the prefix prepended to every line). ++ */ ++#define CONSOLE_LOG_MAX 4096 ++ + /* printk's without a loglevel use this.. */ + #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT + +@@ -149,18 +155,6 @@ static inline __printf(1, 2) __cold + void early_printk(const char *s, ...) { } + #endif + +-#ifdef CONFIG_PRINTK_NMI +-extern void printk_nmi_enter(void); -extern void printk_nmi_exit(void); -extern void printk_nmi_direct_enter(void); -extern void printk_nmi_direct_exit(void); @@ -5805,18 +5743,11 @@ index 34c1a7be3e01..c49d5bb3f8ff 100644 -static inline void printk_nmi_direct_enter(void) { } -static inline void printk_nmi_direct_exit(void) { } -#endif /* PRINTK_NMI */ -+struct dev_printk_info; +- + struct dev_printk_info; #ifdef CONFIG_PRINTK --asmlinkage __printf(5, 0) -+asmlinkage __printf(4, 0) - int vprintk_emit(int facility, int level, -- const char *dict, size_t dictlen, -+ const struct dev_printk_info *dev_info, - const char *fmt, va_list args); - - asmlinkage __printf(1, 0) -@@ -203,8 +193,6 @@ __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); +@@ -207,8 +201,6 @@ __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack(void) __cold; @@ -5825,7 +5756,7 @@ index 34c1a7be3e01..c49d5bb3f8ff 100644 #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) -@@ -268,14 +256,6 @@ static inline void show_regs_print_info(const char *log_lvl) +@@ -272,14 +264,6 @@ static inline void show_regs_print_info(const char *log_lvl) static inline void dump_stack(void) { } @@ -5840,6 +5771,15 @@ index 34c1a7be3e01..c49d5bb3f8ff 100644 #endif extern int kptr_restrict; +@@ -497,6 +481,8 @@ extern int kptr_restrict; + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) + #endif + ++bool pr_flush(int timeout_ms, bool reset_on_progress); ++ + /* + * ratelimited messages with local ratelimit_state, + * no local ratelimit_state used in the !PRINTK case diff --git a/include/linux/random.h b/include/linux/random.h index f45b8be3e3c4..0e41d0527809 100644 --- a/include/linux/random.h @@ -5900,30 +5840,6 @@ index d7db17996322..c33b0e16d04b 100644 #define RB_ROOT_CACHED 
(struct rb_root_cached) { {NULL, }, NULL } /* Same as rb_first(), but O(1) */ -diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h -index 7d012faa509a..3d1a9e716b80 100644 ---- a/include/linux/rbtree_latch.h -+++ b/include/linux/rbtree_latch.h -@@ -42,8 +42,8 @@ struct latch_tree_node { - }; - - struct latch_tree_root { -- seqcount_t seq; -- struct rb_root tree[2]; -+ seqcount_latch_t seq; -+ struct rb_root tree[2]; - }; - - /** -@@ -206,7 +206,7 @@ latch_tree_find(void *key, struct latch_tree_root *root, - do { - seq = raw_read_seqcount_latch(&root->seq); - node = __lt_find(key, root, seq & 1, ops->comp); -- } while (read_seqcount_retry(&root->seq, seq)); -+ } while (read_seqcount_latch_retry(&root->seq, seq)); - - return node; - } diff --git a/include/linux/rbtree_type.h b/include/linux/rbtree_type.h new file mode 100644 index 000000000000..77a89dd2c7c6 @@ -5962,7 +5878,7 @@ index 000000000000..77a89dd2c7c6 + +#endif diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h -index d15d46db61f7..76d19f339419 100644 +index fd02c5fa60cb..8b06b9b16111 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -52,6 +52,11 @@ void __rcu_read_unlock(void); @@ -5977,7 +5893,7 @@ index d15d46db61f7..76d19f339419 100644 #else /* #ifdef CONFIG_PREEMPT_RCU */ -@@ -70,6 +75,8 @@ static inline int rcu_preempt_depth(void) +@@ -77,6 +82,8 @@ static inline int rcu_preempt_depth(void) return 0; } @@ -5986,7 +5902,7 @@ index d15d46db61f7..76d19f339419 100644 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ -@@ -312,7 +319,8 @@ static inline void rcu_preempt_sleep_check(void) { } +@@ -326,7 +333,8 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ @@ -5997,7 +5913,7 @@ index d15d46db61f7..76d19f339419 100644 RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h -index 6fd615a0eea9..5308cd7ddddf 100644 +index 6fd615a0eea9..b02009f53026 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -14,11 +14,15 @@ @@ -6091,14 +6007,13 @@ index 6fd615a0eea9..5308cd7ddddf 100644 #define DEFINE_RT_MUTEX(mutexname) \ struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) -@@ -115,9 +112,7 @@ extern void rt_mutex_lock(struct rt_mutex *lock); +@@ -115,9 +112,6 @@ extern void rt_mutex_lock(struct rt_mutex *lock); #endif extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); -extern int rt_mutex_timed_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *timeout); - -+extern int rt_mutex_lock_killable(struct rt_mutex *lock); extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); @@ -6296,10 +6211,10 @@ index 000000000000..4762391d659b +#endif diff --git a/include/linux/rwsem-rt.h b/include/linux/rwsem-rt.h new file mode 100644 -index 000000000000..7f7e748ef522 +index 000000000000..0ba8aae9a198 --- /dev/null +++ b/include/linux/rwsem-rt.h -@@ -0,0 +1,69 @@ +@@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef _LINUX_RWSEM_RT_H +#define _LINUX_RWSEM_RT_H @@ -6359,6 +6274,7 @@ index 000000000000..7f7e748ef522 +} + +extern void __down_read(struct rw_semaphore *sem); ++extern int __down_read_interruptible(struct rw_semaphore *sem); +extern int __down_read_killable(struct rw_semaphore *sem); +extern int __down_read_trylock(struct rw_semaphore *sem); +extern void 
__down_write(struct rw_semaphore *sem); @@ -6370,7 +6286,7 @@ index 000000000000..7f7e748ef522 + +#endif diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h -index 25e3fde85617..9101367852bc 100644 +index 4c715be48717..9323af8a9244 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -16,6 +16,11 @@ @@ -6400,18 +6316,18 @@ index 25e3fde85617..9101367852bc 100644 * lock for reading */ diff --git a/include/linux/sched.h b/include/linux/sched.h -index afe01e232935..c72ae6627e96 100644 +index 6e3a5eeec509..183e9d90841c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -34,6 +34,7 @@ - #include <linux/rseq.h> - #include <linux/seqlock.h> - #include <linux/kcsan.h> -+#include <asm/kmap_types.h> - - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; -@@ -110,12 +111,8 @@ struct task_group; +@@ -14,7 +14,6 @@ + #include <linux/pid.h> + #include <linux/sem.h> + #include <linux/shm.h> +-#include <linux/kcov.h> + #include <linux/mutex.h> + #include <linux/plist.h> + #include <linux/hrtimer.h> +@@ -113,12 +112,8 @@ struct io_uring_task; __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) @@ -6424,7 +6340,7 @@ index afe01e232935..c72ae6627e96 100644 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP /* -@@ -139,6 +136,9 @@ struct task_group; +@@ -142,6 +137,9 @@ struct io_uring_task; smp_store_mb(current->state, (state_value)); \ } while (0) @@ -6434,7 +6350,7 @@ index afe01e232935..c72ae6627e96 100644 #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ -@@ -192,6 +192,9 @@ struct task_group; +@@ -195,6 +193,9 @@ struct io_uring_task; #define set_current_state(state_value) \ smp_store_mb(current->state, (state_value)) @@ -6444,7 +6360,7 @@ index afe01e232935..c72ae6627e96 100644 /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must -@@ -638,6 +641,8 @@ struct task_struct { +@@ -656,6 +657,8 @@ struct task_struct { #endif /* -1 unrunnable, 0 runnable, >0 stopped: */ volatile long state; @@ -6453,19 +6369,7 @@ index afe01e232935..c72ae6627e96 100644 /* * This begins the randomizable portion of task_struct. 
Only -@@ -713,6 +718,11 @@ struct task_struct { - int nr_cpus_allowed; - const cpumask_t *cpus_ptr; - cpumask_t cpus_mask; -+ void *migration_pending; -+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) -+ unsigned short migration_disabled; -+#endif -+ unsigned short migration_flags; - - #ifdef CONFIG_PREEMPT_RCU - int rcu_read_lock_nesting; -@@ -941,11 +951,16 @@ struct task_struct { +@@ -982,11 +985,16 @@ struct task_struct { /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; @@ -6482,7 +6386,7 @@ index afe01e232935..c72ae6627e96 100644 unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; -@@ -972,6 +987,7 @@ struct task_struct { +@@ -1014,6 +1022,7 @@ struct task_struct { raw_spinlock_t pi_lock; struct wake_q_node wake_q; @@ -6490,7 +6394,7 @@ index afe01e232935..c72ae6627e96 100644 #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ -@@ -999,6 +1015,9 @@ struct task_struct { +@@ -1041,6 +1050,9 @@ struct task_struct { int softirq_context; int irq_config; #endif @@ -6500,20 +6404,7 @@ index afe01e232935..c72ae6627e96 100644 #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL -@@ -1280,6 +1299,12 @@ struct task_struct { - unsigned int sequential_io; - unsigned int sequential_io_avg; - #endif -+#ifdef CONFIG_PREEMPT_RT -+# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32 -+ int kmap_idx; -+ pte_t kmap_pte[KM_TYPE_NR]; -+# endif -+#endif - #ifdef CONFIG_DEBUG_ATOMIC_SLEEP - unsigned long task_state_change; - #endif -@@ -1722,6 +1747,7 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr); +@@ -1775,6 +1787,7 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); @@ -6521,7 +6412,7 @@ index afe01e232935..c72ae6627e96 100644 extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP -@@ -1812,6 +1838,89 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) +@@ -1865,6 +1878,89 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } @@ -6611,23 +6502,8 @@ index afe01e232935..c72ae6627e96 100644 /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. 
The return -diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h -index 9a62ffdd296f..412cdaba33eb 100644 ---- a/include/linux/sched/hotplug.h -+++ b/include/linux/sched/hotplug.h -@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu); - extern int sched_cpu_deactivate(unsigned int cpu); - - #ifdef CONFIG_HOTPLUG_CPU -+extern int sched_cpu_wait_empty(unsigned int cpu); - extern int sched_cpu_dying(unsigned int cpu); - #else -+# define sched_cpu_wait_empty NULL - # define sched_cpu_dying NULL - #endif - diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h -index f889e332912f..1a08d3d41805 100644 +index 1ae08b8462a4..4c74089aea20 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm) @@ -6642,12 +6518,12 @@ index f889e332912f..1a08d3d41805 100644 + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else -+# define mmdrop_delayed(mm) mmdrop(mm) ++# define mmdrop_delayed(mm) mmdrop(mm) +#endif + - /* - * This has to be called after a get_task_mm()/mmget_not_zero() - * followed by taking the mmap_lock for writing before modifying the + /** + * mmget() - Pin the address space associated with a &struct mm_struct. + * @mm: The address space to pin. diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index e5af028c08b4..994c25640e15 100644 --- a/include/linux/sched/rt.h @@ -6696,868 +6572,149 @@ index 26a2013ac39c..6e2dff721547 100644 +} #endif /* _LINUX_SCHED_WAKE_Q_H */ -diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h -index 962d9768945f..f73c7eb68f27 100644 ---- a/include/linux/seqlock.h -+++ b/include/linux/seqlock.h -@@ -17,6 +17,7 @@ - #include <linux/kcsan-checks.h> - #include <linux/lockdep.h> - #include <linux/mutex.h> -+#include <linux/ww_mutex.h> - #include <linux/preempt.h> - #include <linux/spinlock.h> +diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h +index 9e655055112d..ffef674deda7 100644 +--- a/include/linux/serial_8250.h ++++ b/include/linux/serial_8250.h +@@ -7,6 +7,7 @@ + #ifndef _LINUX_SERIAL_8250_H + #define _LINUX_SERIAL_8250_H -@@ -53,7 +54,7 @@ - * - * If the write serialization mechanism is one of the common kernel - * locking primitives, use a sequence counter with associated lock -- * (seqcount_LOCKTYPE_t) instead. -+ * (seqcount_LOCKNAME_t) instead. - * - * If it's desired to automatically handle the sequence counter writer - * serialization and non-preemptibility requirements, use a sequential -@@ -117,7 +118,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) - #define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) } ++#include <linux/atomic.h> + #include <linux/serial_core.h> + #include <linux/serial_reg.h> + #include <linux/platform_device.h> +@@ -125,6 +126,8 @@ struct uart_8250_port { + #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA + unsigned char msr_saved_flags; - /* -- * Sequence counters with associated locks (seqcount_LOCKTYPE_t) -+ * Sequence counters with associated locks (seqcount_LOCKNAME_t) - * - * A sequence counter which associates the lock used for writer - * serialization at initialization time. 
This enables lockdep to validate -@@ -131,37 +132,59 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) - * See Documentation/locking/seqlock.rst - */ ++ atomic_t console_printing; ++ + struct uart_8250_dma *dma; + const struct uart_8250_ops *ops; --#ifdef CONFIG_LOCKDEP -+/* -+ * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot -+ * disable preemption. It can lead to higher latencies, and the write side -+ * sections will not be able to acquire locks which become sleeping locks -+ * (e.g. spinlock_t). -+ * -+ * To remain preemptible while avoiding a possible livelock caused by the -+ * reader preempting the writer, use a different technique: let the reader -+ * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the -+ * case, acquire then release the associated LOCKNAME writer serialization -+ * lock. This will allow any possibly-preempted writer to make progress -+ * until the end of its writer serialization lock critical section. -+ * -+ * This lock-unlock technique must be implemented for all of PREEMPT_RT -+ * sleeping locks. See Documentation/locking/locktypes.rst -+ */ -+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) - #define __SEQ_LOCK(expr) expr - #else - #define __SEQ_LOCK(expr) - #endif +@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up); + void serial8250_set_defaults(struct uart_8250_port *up); + void serial8250_console_write(struct uart_8250_port *up, const char *s, + unsigned int count); ++void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, ++ unsigned int count); + int serial8250_console_setup(struct uart_port *port, char *options, bool probe); + int serial8250_console_exit(struct uart_port *port); - /** -- * typedef seqcount_LOCKNAME_t - sequence counter with LOCKTYPR associated -+ * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated - * @seqcount: The real sequence counter -- * @lock: Pointer to the associated spinlock -+ * @lock: Pointer to the associated lock - * -- * A plain sequence counter with external writer synchronization by a -- * spinlock. The spinlock is associated to the sequence count in the -+ * A plain sequence counter with external writer synchronization by -+ * LOCKNAME @lock. The lock is associated to the sequence counter in the - * static initializer or init function. This enables lockdep to validate - * that the write side critical section is properly serialized. -+ * -+ * LOCKNAME: raw_spinlock, spinlock, rwlock, mutex, or ww_mutex. 
- */ +diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h +index d82b6f396588..12b2e41d8f47 100644 +--- a/include/linux/shmem_fs.h ++++ b/include/linux/shmem_fs.h +@@ -31,7 +31,7 @@ struct shmem_sb_info { + struct percpu_counter used_blocks; /* How many are allocated */ + unsigned long max_inodes; /* How many inodes are allowed */ + unsigned long free_inodes; /* How many are left for allocation */ +- spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ ++ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ + umode_t mode; /* Mount mode for root directory */ + unsigned char huge; /* Whether to try for hugepages */ + kuid_t uid; /* Mount uid for root directory */ +diff --git a/include/linux/signal.h b/include/linux/signal.h +index 205526c4003a..d47a86790edc 100644 +--- a/include/linux/signal.h ++++ b/include/linux/signal.h +@@ -265,6 +265,7 @@ static inline void init_sigpending(struct sigpending *sig) + } --/** -+/* - * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t - * @s: Pointer to the seqcount_LOCKNAME_t instance -- * @lock: Pointer to the associated LOCKTYPE -+ * @lock: Pointer to the associated lock - */ + extern void flush_sigqueue(struct sigpending *queue); ++extern void flush_task_sigqueue(struct task_struct *tsk); - /* -- * SEQCOUNT_LOCKTYPE() - Instantiate seqcount_LOCKNAME_t and helpers -- * @locktype: actual typename -- * @lockname: name -+ * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers -+ * seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t -+ * -+ * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t -+ * @locktype: LOCKNAME canonical C data type - * @preemptible: preemptibility of above locktype - * @lockmember: argument for lockdep_assert_held() -+ * @lockbase: associated lock release function (prefix only) -+ * @lock_acquire: associated lock acquisition function (full call) - */ --#define SEQCOUNT_LOCKTYPE(locktype, lockname, preemptible, lockmember) \ -+#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \ - typedef struct seqcount_##lockname { \ - seqcount_t seqcount; \ - __SEQ_LOCK(locktype *lock); \ -@@ -175,19 +198,45 @@ seqcount_##lockname##_init(seqcount_##lockname##_t *s, locktype *lock) \ - } \ - \ - static __always_inline seqcount_t * \ --__seqcount_##lockname##_ptr(seqcount_##lockname##_t *s) \ -+__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ - { \ - return &s->seqcount; \ - } \ - \ -+static __always_inline unsigned \ -+__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ -+{ \ -+ unsigned seq = READ_ONCE(s->seqcount.sequence); \ -+ \ -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ -+ return seq; \ -+ \ -+ if (preemptible && unlikely(seq & 1)) { \ -+ __SEQ_LOCK(lock_acquire); \ -+ __SEQ_LOCK(lockbase##_unlock(s->lock)); \ -+ \ -+ /* \ -+ * Re-read the sequence counter since the (possibly \ -+ * preempted) writer made progress. 
\ -+ */ \ -+ seq = READ_ONCE(s->seqcount.sequence); \ -+ } \ -+ \ -+ return seq; \ -+} \ -+ \ - static __always_inline bool \ --__seqcount_##lockname##_preemptible(seqcount_##lockname##_t *s) \ -+__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ - { \ -- return preemptible; \ -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ -+ return preemptible; \ -+ \ -+ /* PREEMPT_RT relies on the above LOCK+UNLOCK */ \ -+ return false; \ - } \ - \ - static __always_inline void \ --__seqcount_##lockname##_assert(seqcount_##lockname##_t *s) \ -+__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ - { \ - __SEQ_LOCK(lockdep_assert_held(lockmember)); \ - } -@@ -196,50 +245,56 @@ __seqcount_##lockname##_assert(seqcount_##lockname##_t *s) \ - * __seqprop() for seqcount_t - */ + /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ + static inline int valid_signal(unsigned long sig) +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index 5f60c9e907c9..7800b6c2115e 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -295,6 +295,7 @@ struct sk_buff_head { --static inline seqcount_t *__seqcount_ptr(seqcount_t *s) -+static inline seqcount_t *__seqprop_ptr(seqcount_t *s) - { - return s; + __u32 qlen; + spinlock_t lock; ++ raw_spinlock_t raw_lock; + }; + + struct sk_buff; +@@ -1890,6 +1891,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) + __skb_queue_head_init(list); } --static inline bool __seqcount_preemptible(seqcount_t *s) -+static inline unsigned __seqprop_sequence(const seqcount_t *s) ++static inline void skb_queue_head_init_raw(struct sk_buff_head *list) +{ -+ return READ_ONCE(s->sequence); ++ raw_spin_lock_init(&list->raw_lock); ++ __skb_queue_head_init(list); +} + -+static inline bool __seqprop_preemptible(const seqcount_t *s) - { - return false; - } - --static inline void __seqcount_assert(seqcount_t *s) -+static inline void __seqprop_assert(const seqcount_t *s) + static inline void skb_queue_head_init_class(struct sk_buff_head *list, + struct lock_class_key *class) { - lockdep_assert_preemption_disabled(); - } - --SEQCOUNT_LOCKTYPE(raw_spinlock_t, raw_spinlock, false, s->lock) --SEQCOUNT_LOCKTYPE(spinlock_t, spinlock, false, s->lock) --SEQCOUNT_LOCKTYPE(rwlock_t, rwlock, false, s->lock) --SEQCOUNT_LOCKTYPE(struct mutex, mutex, true, s->lock) --SEQCOUNT_LOCKTYPE(struct ww_mutex, ww_mutex, true, &s->lock->base) -+#define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT) +diff --git a/include/linux/smp.h b/include/linux/smp.h +index 70c6f6284dcf..4c602ca3bc13 100644 +--- a/include/linux/smp.h ++++ b/include/linux/smp.h +@@ -238,6 +238,9 @@ static inline int get_boot_cpu_id(void) + #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) + #define put_cpu() preempt_enable() --/** -+SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, s->lock, raw_spin, raw_spin_lock(s->lock)) -+SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, s->lock, spin, spin_lock(s->lock)) -+SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, s->lock, read, read_lock(s->lock)) -+SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex, mutex_lock(s->lock)) -+SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mutex, ww_mutex_lock(s->lock, NULL)) ++#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); }) ++#define put_cpu_light() migrate_enable() + -+/* - * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t - * @name: Name of the seqcount_LOCKNAME_t instance -- * @lock: Pointer to the 
associated LOCKTYPE -+ * @lock: Pointer to the associated LOCKNAME - */ - --#define SEQCOUNT_LOCKTYPE_ZERO(seq_name, assoc_lock) { \ -+#define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) { \ - .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ - __SEQ_LOCK(.lock = (assoc_lock)) \ - } - --#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) --#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) --#define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) --#define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) --#define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKTYPE_ZERO(name, lock) -- -+#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) -+#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) -+#define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) -+#define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) -+#define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) - - #define __seqprop_case(s, lockname, prop) \ -- seqcount_##lockname##_t: __seqcount_##lockname##_##prop((void *)(s)) -+ seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s)) - - #define __seqprop(s, prop) _Generic(*(s), \ -- seqcount_t: __seqcount_##prop((void *)(s)), \ -+ seqcount_t: __seqprop_##prop((void *)(s)), \ - __seqprop_case((s), raw_spinlock, prop), \ - __seqprop_case((s), spinlock, prop), \ - __seqprop_case((s), rwlock, prop), \ -@@ -247,12 +302,13 @@ SEQCOUNT_LOCKTYPE(struct ww_mutex, ww_mutex, true, &s->lock->base) - __seqprop_case((s), ww_mutex, prop)) - - #define __seqcount_ptr(s) __seqprop(s, ptr) -+#define __seqcount_sequence(s) __seqprop(s, sequence) - #define __seqcount_lock_preemptible(s) __seqprop(s, preemptible) - #define __seqcount_assert_lock_held(s) __seqprop(s, assert) + /* + * Callback to arch code if there's nosmp or maxcpus=0 on the + * boot command line: +diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h +index 79897841a2cc..c3c70291b46c 100644 +--- a/include/linux/spinlock.h ++++ b/include/linux/spinlock.h +@@ -309,7 +309,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) + }) - /** - * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * - * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb() - * barrier. 
Callers should ensure that smp_rmb() or equivalent ordering is -@@ -265,56 +321,45 @@ SEQCOUNT_LOCKTYPE(struct ww_mutex, ww_mutex, true, &s->lock->base) - * Return: count to be passed to read_seqcount_retry() - */ - #define __read_seqcount_begin(s) \ -- __read_seqcount_t_begin(__seqcount_ptr(s)) -- --static inline unsigned __read_seqcount_t_begin(const seqcount_t *s) --{ -- unsigned ret; -- --repeat: -- ret = READ_ONCE(s->sequence); -- if (unlikely(ret & 1)) { -- cpu_relax(); -- goto repeat; -- } -- kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); -- return ret; --} -+({ \ -+ unsigned seq; \ -+ \ -+ while ((seq = __seqcount_sequence(s)) & 1) \ -+ cpu_relax(); \ -+ \ -+ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ -+ seq; \ -+}) + /* Include rwlock functions */ +-#include <linux/rwlock.h> ++#ifdef CONFIG_PREEMPT_RT ++# include <linux/rwlock_rt.h> ++#else ++# include <linux/rwlock.h> ++#endif - /** - * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * - * Return: count to be passed to read_seqcount_retry() - */ - #define raw_read_seqcount_begin(s) \ -- raw_read_seqcount_t_begin(__seqcount_ptr(s)) -- --static inline unsigned raw_read_seqcount_t_begin(const seqcount_t *s) --{ -- unsigned ret = __read_seqcount_t_begin(s); -- smp_rmb(); -- return ret; --} -+({ \ -+ unsigned seq = __read_seqcount_begin(s); \ -+ \ -+ smp_rmb(); \ -+ seq; \ -+}) + /* + * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: +@@ -320,6 +324,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) + # include <linux/spinlock_api_up.h> + #endif - /** - * read_seqcount_begin() - begin a seqcount_t read critical section -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * - * Return: count to be passed to read_seqcount_retry() ++#ifdef CONFIG_PREEMPT_RT ++# include <linux/spinlock_rt.h> ++#else /* PREEMPT_RT */ ++ + /* + * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ - #define read_seqcount_begin(s) \ -- read_seqcount_t_begin(__seqcount_ptr(s)) -- --static inline unsigned read_seqcount_t_begin(const seqcount_t *s) --{ -- seqcount_lockdep_reader_access(s); -- return raw_read_seqcount_t_begin(s); --} -+({ \ -+ seqcount_lockdep_reader_access(__seqcount_ptr(s)); \ -+ raw_read_seqcount_begin(s); \ -+}) +@@ -454,6 +462,8 @@ static __always_inline int spin_is_contended(spinlock_t *lock) - /** - * raw_read_seqcount() - read the raw seqcount_t counter value -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * - * raw_read_seqcount opens a read critical section of the given - * seqcount_t, without any lockdep checking, and without checking or -@@ -324,20 +369,18 @@ static inline unsigned read_seqcount_t_begin(const seqcount_t *s) - * Return: count to be passed to read_seqcount_retry() - */ - #define raw_read_seqcount(s) \ -- raw_read_seqcount_t(__seqcount_ptr(s)) -- --static inline unsigned raw_read_seqcount_t(const seqcount_t *s) --{ -- unsigned ret = READ_ONCE(s->sequence); -- smp_rmb(); -- kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); -- return ret; --} -+({ \ -+ unsigned seq = __seqcount_sequence(s); \ -+ \ -+ smp_rmb(); \ -+ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ -+ seq; \ -+}) + #define 
assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) - /** - * raw_seqcount_begin() - begin a seqcount_t read critical section w/o - * lockdep and w/o counter stabilization -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * - * raw_seqcount_begin opens a read critical section of the given - * seqcount_t. Unlike read_seqcount_begin(), this function will not wait -@@ -352,20 +395,17 @@ static inline unsigned raw_read_seqcount_t(const seqcount_t *s) - * Return: count to be passed to read_seqcount_retry() - */ - #define raw_seqcount_begin(s) \ -- raw_seqcount_t_begin(__seqcount_ptr(s)) -- --static inline unsigned raw_seqcount_t_begin(const seqcount_t *s) --{ -- /* -- * If the counter is odd, let read_seqcount_retry() fail -- * by decrementing the counter. -- */ -- return raw_read_seqcount_t(s) & ~1; --} -+({ \ -+ /* \ -+ * If the counter is odd, let read_seqcount_retry() fail \ -+ * by decrementing the counter. \ -+ */ \ -+ raw_read_seqcount(s) & ~1; \ -+}) ++#endif /* !PREEMPT_RT */ ++ + /* + * Pull the atomic_t declaration: + * (asm-mips/atomic.h needs above definitions) +diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h +index 19a9be9d97ee..da38149f2843 100644 +--- a/include/linux/spinlock_api_smp.h ++++ b/include/linux/spinlock_api_smp.h +@@ -187,6 +187,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) + return 0; + } - /** - * __read_seqcount_retry() - end a seqcount_t read section w/o barrier -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * @start: count, from read_seqcount_begin() - * - * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb() -@@ -389,7 +429,7 @@ static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) - - /** - * read_seqcount_retry() - end a seqcount_t read critical section -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * @start: count, from read_seqcount_begin() - * - * read_seqcount_retry closes the read critical section of given -@@ -409,7 +449,7 @@ static inline int read_seqcount_t_retry(const seqcount_t *s, unsigned start) - - /** - * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - */ - #define raw_write_seqcount_begin(s) \ - do { \ -@@ -428,7 +468,7 @@ static inline void raw_write_seqcount_t_begin(seqcount_t *s) - - /** - * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - */ - #define raw_write_seqcount_end(s) \ - do { \ -@@ -448,7 +488,7 @@ static inline void raw_write_seqcount_t_end(seqcount_t *s) - /** - * write_seqcount_begin_nested() - start a seqcount_t write section with - * custom lockdep nesting level -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * @subclass: lockdep nesting level - * - * See Documentation/locking/lockdep-design.rst -@@ -471,7 +511,7 @@ static inline void write_seqcount_t_begin_nested(seqcount_t 
*s, int subclass) - - /** - * write_seqcount_begin() - start a seqcount_t write side critical section -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * - * write_seqcount_begin opens a write side critical section of the given - * seqcount_t. -@@ -497,7 +537,7 @@ static inline void write_seqcount_t_begin(seqcount_t *s) - - /** - * write_seqcount_end() - end a seqcount_t write side critical section -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * - * The write section must've been opened with write_seqcount_begin(). - */ -@@ -517,7 +557,7 @@ static inline void write_seqcount_t_end(seqcount_t *s) - - /** - * raw_write_seqcount_barrier() - do a seqcount_t write barrier -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * - * This can be used to provide an ordering guarantee instead of the usual - * consistency guarantee. It is one wmb cheaper, because it can collapse -@@ -571,7 +611,7 @@ static inline void raw_write_seqcount_t_barrier(seqcount_t *s) - /** - * write_seqcount_invalidate() - invalidate in-progress seqcount_t read - * side operations -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants - * - * After write_seqcount_invalidate, no seqcount_t read side operations - * will complete successfully and see data older than this. -@@ -587,34 +627,73 @@ static inline void write_seqcount_t_invalidate(seqcount_t *s) - kcsan_nestable_atomic_end(); - } - --/** -- * raw_read_seqcount_latch() - pick even/odd seqcount_t latch data copy -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+/* -+ * Latch sequence counters (seqcount_latch_t) - * -- * Use seqcount_t latching to switch between two storage places protected -- * by a sequence counter. Doing so allows having interruptible, preemptible, -- * seqcount_t write side critical sections. -+ * A sequence counter variant where the counter even/odd value is used to -+ * switch between two copies of protected data. This allows the read path, -+ * typically NMIs, to safely interrupt the write side critical section. -+ * -+ * As the write sections are fully preemptible, no special handling for -+ * PREEMPT_RT is needed. -+ */ -+typedef struct { -+ seqcount_t seqcount; -+} seqcount_latch_t; -+ -+/** -+ * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t -+ * @seq_name: Name of the seqcount_latch_t instance -+ */ -+#define SEQCNT_LATCH_ZERO(seq_name) { \ -+ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ -+} -+ -+/** -+ * seqcount_latch_init() - runtime initializer for seqcount_latch_t -+ * @s: Pointer to the seqcount_latch_t instance -+ */ -+static inline void seqcount_latch_init(seqcount_latch_t *s) -+{ -+ seqcount_init(&s->seqcount); -+} -+ -+/** -+ * raw_read_seqcount_latch() - pick even/odd latch data copy -+ * @s: Pointer to seqcount_latch_t - * -- * Check raw_write_seqcount_latch() for more details and a full reader and -- * writer usage example. -+ * See raw_write_seqcount_latch() for details and a full reader/writer -+ * usage example. - * - * Return: sequence counter raw value. Use the lowest bit as an index for -- * picking which data copy to read. 
The full counter value must then be -- * checked with read_seqcount_retry(). -+ * picking which data copy to read. The full counter must then be checked -+ * with read_seqcount_latch_retry(). - */ --#define raw_read_seqcount_latch(s) \ -- raw_read_seqcount_t_latch(__seqcount_ptr(s)) -+static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) -+{ -+ /* -+ * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). -+ * Due to the dependent load, a full smp_rmb() is not needed. -+ */ -+ return READ_ONCE(s->seqcount.sequence); -+} - --static inline int raw_read_seqcount_t_latch(seqcount_t *s) -+/** -+ * read_seqcount_latch_retry() - end a seqcount_latch_t read section -+ * @s: Pointer to seqcount_latch_t -+ * @start: count, from raw_read_seqcount_latch() -+ * -+ * Return: true if a read section retry is required, else false -+ */ -+static inline int -+read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) - { -- /* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */ -- int seq = READ_ONCE(s->sequence); /* ^^^ */ -- return seq; -+ return read_seqcount_retry(&s->seqcount, start); - } - - /** -- * raw_write_seqcount_latch() - redirect readers to even/odd copy -- * @s: Pointer to seqcount_t or any of the seqcount_locktype_t variants -+ * raw_write_seqcount_latch() - redirect latch readers to even/odd copy -+ * @s: Pointer to seqcount_latch_t - * - * The latch technique is a multiversion concurrency control method that allows - * queries during non-atomic modifications. If you can guarantee queries never -@@ -633,7 +712,7 @@ static inline int raw_read_seqcount_t_latch(seqcount_t *s) - * The basic form is a data structure like:: - * - * struct latch_struct { -- * seqcount_t seq; -+ * seqcount_latch_t seq; - * struct data_struct data[2]; - * }; - * -@@ -643,13 +722,13 @@ static inline int raw_read_seqcount_t_latch(seqcount_t *s) - * void latch_modify(struct latch_struct *latch, ...) - * { - * smp_wmb(); // Ensure that the last data[1] update is visible -- * latch->seq++; -+ * latch->seq.sequence++; - * smp_wmb(); // Ensure that the seqcount update is visible - * - * modify(latch->data[0], ...); - * - * smp_wmb(); // Ensure that the data[0] update is visible -- * latch->seq++; -+ * latch->seq.sequence++; - * smp_wmb(); // Ensure that the seqcount update is visible - * - * modify(latch->data[1], ...); -@@ -668,8 +747,8 @@ static inline int raw_read_seqcount_t_latch(seqcount_t *s) - * idx = seq & 0x01; - * entry = data_query(latch->data[idx], ...); - * -- * // read_seqcount_retry() includes needed smp_rmb() -- * } while (read_seqcount_retry(&latch->seq, seq)); -+ * // This includes needed smp_rmb() -+ * } while (read_seqcount_latch_retry(&latch->seq, seq)); - * - * return entry; - * } -@@ -688,19 +767,16 @@ static inline int raw_read_seqcount_t_latch(seqcount_t *s) - * to miss an entire modification sequence, once it resumes it might - * observe the new entry. - * -- * NOTE: -+ * NOTE2: - * - * When data is a dynamic data structure; one should use regular RCU - * patterns to manage the lifetimes of the objects within. 
- */ --#define raw_write_seqcount_latch(s) \ -- raw_write_seqcount_t_latch(__seqcount_ptr(s)) -- --static inline void raw_write_seqcount_t_latch(seqcount_t *s) -+static inline void raw_write_seqcount_latch(seqcount_latch_t *s) - { -- smp_wmb(); /* prior stores before incrementing "sequence" */ -- s->sequence++; -- smp_wmb(); /* increment "sequence" before following stores */ -+ smp_wmb(); /* prior stores before incrementing "sequence" */ -+ s->seqcount.sequence++; -+ smp_wmb(); /* increment "sequence" before following stores */ - } - - /* -@@ -714,13 +790,17 @@ static inline void raw_write_seqcount_t_latch(seqcount_t *s) - * - Documentation/locking/seqlock.rst - */ - typedef struct { -- struct seqcount seqcount; -+ /* -+ * Make sure that readers don't starve writers on PREEMPT_RT: use -+ * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK(). -+ */ -+ seqcount_spinlock_t seqcount; - spinlock_t lock; - } seqlock_t; - - #define __SEQLOCK_UNLOCKED(lockname) \ - { \ -- .seqcount = SEQCNT_ZERO(lockname), \ -+ .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \ - .lock = __SPIN_LOCK_UNLOCKED(lockname) \ - } - -@@ -730,8 +810,8 @@ typedef struct { - */ - #define seqlock_init(sl) \ - do { \ -- seqcount_init(&(sl)->seqcount); \ - spin_lock_init(&(sl)->lock); \ -+ seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock); \ - } while (0) - - /** -@@ -778,6 +858,12 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) - return read_seqcount_retry(&sl->seqcount, start); - } - -+/* -+ * For all seqlock_t write side functions, use write_seqcount_*t*_begin() -+ * instead of the generic write_seqcount_begin(). This way, no redundant -+ * lockdep_assert_held() checks are added. -+ */ -+ - /** - * write_seqlock() - start a seqlock_t write side critical section - * @sl: Pointer to seqlock_t -@@ -794,7 +880,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) - static inline void write_seqlock(seqlock_t *sl) - { - spin_lock(&sl->lock); -- write_seqcount_t_begin(&sl->seqcount); -+ write_seqcount_t_begin(&sl->seqcount.seqcount); - } - - /** -@@ -806,7 +892,7 @@ static inline void write_seqlock(seqlock_t *sl) - */ - static inline void write_sequnlock(seqlock_t *sl) - { -- write_seqcount_t_end(&sl->seqcount); -+ write_seqcount_t_end(&sl->seqcount.seqcount); - spin_unlock(&sl->lock); - } - -@@ -820,7 +906,7 @@ static inline void write_sequnlock(seqlock_t *sl) - static inline void write_seqlock_bh(seqlock_t *sl) - { - spin_lock_bh(&sl->lock); -- write_seqcount_t_begin(&sl->seqcount); -+ write_seqcount_t_begin(&sl->seqcount.seqcount); - } - - /** -@@ -833,7 +919,7 @@ static inline void write_seqlock_bh(seqlock_t *sl) - */ - static inline void write_sequnlock_bh(seqlock_t *sl) - { -- write_seqcount_t_end(&sl->seqcount); -+ write_seqcount_t_end(&sl->seqcount.seqcount); - spin_unlock_bh(&sl->lock); - } - -@@ -847,7 +933,7 @@ static inline void write_sequnlock_bh(seqlock_t *sl) - static inline void write_seqlock_irq(seqlock_t *sl) - { - spin_lock_irq(&sl->lock); -- write_seqcount_t_begin(&sl->seqcount); -+ write_seqcount_t_begin(&sl->seqcount.seqcount); - } - - /** -@@ -859,7 +945,7 @@ static inline void write_seqlock_irq(seqlock_t *sl) - */ - static inline void write_sequnlock_irq(seqlock_t *sl) - { -- write_seqcount_t_end(&sl->seqcount); -+ write_seqcount_t_end(&sl->seqcount.seqcount); - spin_unlock_irq(&sl->lock); - } - -@@ -868,7 +954,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) - unsigned long flags; - - 
spin_lock_irqsave(&sl->lock, flags); -- write_seqcount_t_begin(&sl->seqcount); -+ write_seqcount_t_begin(&sl->seqcount.seqcount); - return flags; - } - -@@ -897,7 +983,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) - static inline void - write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) - { -- write_seqcount_t_end(&sl->seqcount); -+ write_seqcount_t_end(&sl->seqcount.seqcount); - spin_unlock_irqrestore(&sl->lock, flags); - } - -diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h -index 2b70f736b091..68d756373b53 100644 ---- a/include/linux/serial_8250.h -+++ b/include/linux/serial_8250.h -@@ -7,6 +7,7 @@ - #ifndef _LINUX_SERIAL_8250_H - #define _LINUX_SERIAL_8250_H - -+#include <linux/atomic.h> - #include <linux/serial_core.h> - #include <linux/serial_reg.h> - #include <linux/platform_device.h> -@@ -125,6 +126,8 @@ struct uart_8250_port { - #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA - unsigned char msr_saved_flags; - -+ atomic_t console_printing; -+ - struct uart_8250_dma *dma; - const struct uart_8250_ops *ops; - -@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up); - void serial8250_set_defaults(struct uart_8250_port *up); - void serial8250_console_write(struct uart_8250_port *up, const char *s, - unsigned int count); -+void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, -+ unsigned int count); - int serial8250_console_setup(struct uart_port *port, char *options, bool probe); - int serial8250_console_exit(struct uart_port *port); - -diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h -index a5a5d1d4d7b1..0470d1582b09 100644 ---- a/include/linux/shmem_fs.h -+++ b/include/linux/shmem_fs.h -@@ -31,7 +31,7 @@ struct shmem_sb_info { - struct percpu_counter used_blocks; /* How many are allocated */ - unsigned long max_inodes; /* How many inodes are allowed */ - unsigned long free_inodes; /* How many are left for allocation */ -- spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ -+ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ - umode_t mode; /* Mount mode for root directory */ - unsigned char huge; /* Whether to try for hugepages */ - kuid_t uid; /* Mount uid for root directory */ -diff --git a/include/linux/signal.h b/include/linux/signal.h -index 7bbc0e9cf084..3030d984a144 100644 ---- a/include/linux/signal.h -+++ b/include/linux/signal.h -@@ -263,6 +263,7 @@ static inline void init_sigpending(struct sigpending *sig) - } - - extern void flush_sigqueue(struct sigpending *queue); -+extern void flush_task_sigqueue(struct task_struct *tsk); - - /* Test if 'sig' is valid signal. 
Use this instead of testing _NSIG directly */ - static inline int valid_signal(unsigned long sig) -diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h -index 04a18e01b362..187ea2e1854e 100644 ---- a/include/linux/skbuff.h -+++ b/include/linux/skbuff.h -@@ -295,6 +295,7 @@ struct sk_buff_head { - - __u32 qlen; - spinlock_t lock; -+ raw_spinlock_t raw_lock; - }; - - struct sk_buff; -@@ -1884,6 +1885,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) - __skb_queue_head_init(list); - } - -+static inline void skb_queue_head_init_raw(struct sk_buff_head *list) -+{ -+ raw_spin_lock_init(&list->raw_lock); -+ __skb_queue_head_init(list); -+} -+ - static inline void skb_queue_head_init_class(struct sk_buff_head *list, - struct lock_class_key *class) - { -diff --git a/include/linux/smp.h b/include/linux/smp.h -index 80d557ef8a11..47d666fa4fba 100644 ---- a/include/linux/smp.h -+++ b/include/linux/smp.h -@@ -236,6 +236,9 @@ static inline int get_boot_cpu_id(void) - #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) - #define put_cpu() preempt_enable() - -+#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); }) -+#define put_cpu_light() migrate_enable() -+ - /* - * Callback to arch code if there's nosmp or maxcpus=0 on the - * boot command line: -diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h -index f2f12d746dbd..38ff58065dfb 100644 ---- a/include/linux/spinlock.h -+++ b/include/linux/spinlock.h -@@ -309,7 +309,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) - }) - - /* Include rwlock functions */ --#include <linux/rwlock.h> -+#ifdef CONFIG_PREEMPT_RT -+# include <linux/rwlock_rt.h> -+#else -+# include <linux/rwlock.h> -+#endif - - /* - * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: -@@ -320,6 +324,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) - # include <linux/spinlock_api_up.h> - #endif - -+#ifdef CONFIG_PREEMPT_RT -+# include <linux/spinlock_rt.h> -+#else /* PREEMPT_RT */ -+ - /* - * Map the spin_lock functions to the raw variants for PREEMPT_RT=n - */ -@@ -454,6 +462,8 @@ static __always_inline int spin_is_contended(spinlock_t *lock) - - #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) - -+#endif /* !PREEMPT_RT */ -+ - /* - * Pull the atomic_t declaration: - * (asm-mips/atomic.h needs above definitions) -diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h -index 19a9be9d97ee..da38149f2843 100644 ---- a/include/linux/spinlock_api_smp.h -+++ b/include/linux/spinlock_api_smp.h -@@ -187,6 +187,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) - return 0; - } - --#include <linux/rwlock_api_smp.h> -+#ifndef CONFIG_PREEMPT_RT -+# include <linux/rwlock_api_smp.h> -+#endif +-#include <linux/rwlock_api_smp.h> ++#ifndef CONFIG_PREEMPT_RT ++# include <linux/rwlock_api_smp.h> ++#endif #endif /* __LINUX_SPINLOCK_API_SMP_H */ diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h @@ -7987,64 +7144,32 @@ index 000000000000..446da786e5d5 + +#endif diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h -index c09b6407ae1b..b0243ba07fb7 100644 +index c09b6407ae1b..d9b371fa13e0 100644 --- a/include/linux/spinlock_types_up.h +++ b/include/linux/spinlock_types_up.h -@@ -1,10 +1,6 @@ +@@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_TYPES_UP_H #define __LINUX_SPINLOCK_TYPES_UP_H -#ifndef __LINUX_SPINLOCK_TYPES_H --# error "please don't include this 
file directly" --#endif -- - /* - * include/linux/spinlock_types_up.h - spinlock type definitions for UP - * -diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h -index 76d8b09384a7..30577c3aecf8 100644 ---- a/include/linux/stop_machine.h -+++ b/include/linux/stop_machine.h -@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg); - struct cpu_stop_work { - struct list_head list; /* cpu_stopper->works */ - cpu_stop_fn_t fn; -+ unsigned long caller; - void *arg; - struct cpu_stop_done *done; - }; -@@ -36,6 +37,8 @@ void stop_machine_park(int cpu); - void stop_machine_unpark(int cpu); - void stop_machine_yield(const struct cpumask *cpumask); - -+extern void print_stop_info(const char *log_lvl, struct task_struct *task); -+ - #else /* CONFIG_SMP */ - - #include <linux/workqueue.h> -@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, - return false; - } - -+static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { } -+ - #endif /* CONFIG_SMP */ ++#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__LINUX_RT_MUTEX_H) + # error "please don't include this file directly" + #endif - /* diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h -index e93e249a4e9b..c88b9cecc78a 100644 +index 9b2158c69275..8d1cac4052f2 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h -@@ -97,7 +97,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) - #define test_thread_flag(flag) \ - test_ti_thread_flag(current_thread_info(), flag) +@@ -149,7 +149,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) + clear_ti_thread_flag(task_thread_info(t), TIF_##fl) + #endif /* !CONFIG_GENERIC_ENTRY */ -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#ifdef CONFIG_PREEMPT_LAZY +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \ + test_thread_flag(TIF_NEED_RESCHED_LAZY)) +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) -+#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY)) ++#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY) + +#else +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) @@ -8055,7 +7180,7 @@ index e93e249a4e9b..c88b9cecc78a 100644 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h -index 5c6943354049..75e3acd59635 100644 +index d321fe5ad1a1..89c3f7162267 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -67,6 +67,8 @@ struct trace_entry { @@ -8067,6 +7192,100 @@ index 5c6943354049..75e3acd59635 100644 }; #define TRACE_EVENT_TYPE_MAX \ +@@ -148,17 +150,78 @@ enum print_line_t { + + enum print_line_t trace_handle_return(struct trace_seq *s); + +-void tracing_generic_entry_update(struct trace_entry *entry, +- unsigned short type, +- unsigned long flags, +- int pc); ++static inline void tracing_generic_entry_update(struct trace_entry *entry, ++ unsigned short type, ++ unsigned int trace_ctx) ++{ ++ entry->preempt_count = trace_ctx & 0xff; ++ entry->migrate_disable = (trace_ctx >> 8) & 0xff; ++ entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff; ++ entry->pid = current->pid; ++ entry->type = type; ++ entry->flags = trace_ctx >> 24; ++} ++ ++unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); ++ ++enum trace_flag_type { ++ TRACE_FLAG_IRQS_OFF = 0x01, ++ TRACE_FLAG_IRQS_NOSUPPORT = 
0x02, ++ TRACE_FLAG_NEED_RESCHED = 0x04, ++ TRACE_FLAG_HARDIRQ = 0x08, ++ TRACE_FLAG_SOFTIRQ = 0x10, ++ TRACE_FLAG_PREEMPT_RESCHED = 0x20, ++ TRACE_FLAG_NMI = 0x40, ++ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80, ++}; ++ ++#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT ++static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) ++{ ++ unsigned int irq_status = irqs_disabled_flags(irqflags) ? ++ TRACE_FLAG_IRQS_OFF : 0; ++ return tracing_gen_ctx_irq_test(irq_status); ++} ++static inline unsigned int tracing_gen_ctx(void) ++{ ++ unsigned long irqflags; ++ ++ local_save_flags(irqflags); ++ return tracing_gen_ctx_flags(irqflags); ++} ++#else ++ ++static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) ++{ ++ return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); ++} ++static inline unsigned int tracing_gen_ctx(void) ++{ ++ return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); ++} ++#endif ++ ++static inline unsigned int tracing_gen_ctx_dec(void) ++{ ++ unsigned int trace_ctx; ++ ++ trace_ctx = tracing_gen_ctx(); ++ /* ++ * Subtract one from the preeption counter if preemption is enabled, ++ * see trace_event_buffer_reserve()for details. ++ */ ++ if (IS_ENABLED(CONFIG_PREEMPTION)) ++ trace_ctx--; ++ return trace_ctx; ++} ++ + struct trace_event_file; + + struct ring_buffer_event * + trace_event_buffer_lock_reserve(struct trace_buffer **current_buffer, + struct trace_event_file *trace_file, + int type, unsigned long len, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + + #define TRACE_RECORD_CMDLINE BIT(0) + #define TRACE_RECORD_TGID BIT(1) +@@ -232,8 +295,7 @@ struct trace_event_buffer { + struct ring_buffer_event *event; + struct trace_event_file *trace_file; + void *entry; +- unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + struct pt_regs *regs; + }; + diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index c6abb79501b3..72bf38b97df8 100644 --- a/include/linux/u64_stats_sync.h @@ -8196,7 +7415,7 @@ index c6abb79501b3..72bf38b97df8 100644 #endif return __u64_stats_fetch_retry(syncp, start); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h -index 7557c1070fd7..ed4b63184327 100644 +index 773135fc6e19..1a2cedbc72e6 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -63,7 +63,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); @@ -8220,7 +7439,7 @@ index 7557c1070fd7..ed4b63184327 100644 static inline void count_vm_events(enum vm_event_item item, long delta) diff --git a/include/linux/wait.h b/include/linux/wait.h -index 27fb99cfeb02..93b42387b4c6 100644 +index fe10e8570a52..e9ce878a4906 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -10,6 +10,7 @@ @@ -8250,6 +7469,33 @@ index 850424e5d030..8ef2feb0d8fe 100644 struct ww_acquire_ctx { struct task_struct *task; unsigned long stamp; +diff --git a/include/linux/zpool.h b/include/linux/zpool.h +index 51bf43076165..e8997010612a 100644 +--- a/include/linux/zpool.h ++++ b/include/linux/zpool.h +@@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *pool); + * @malloc: allocate mem from a pool. + * @free: free mem from a pool. + * @shrink: shrink the pool. ++ * @sleep_mapped: whether zpool driver can sleep during map. + * @map: map a handle. + * @unmap: unmap a handle. + * @total_size: get total size of a pool. 
+@@ -100,6 +101,7 @@ struct zpool_driver { + int (*shrink)(void *pool, unsigned int pages, + unsigned int *reclaimed); + ++ bool sleep_mapped; + void *(*map)(void *pool, unsigned long handle, + enum zpool_mapmode mm); + void (*unmap)(void *pool, unsigned long handle); +@@ -112,5 +114,6 @@ void zpool_register_driver(struct zpool_driver *driver); + int zpool_unregister_driver(struct zpool_driver *driver); + + bool zpool_evictable(struct zpool *pool); ++bool zpool_can_sleep_mapped(struct zpool *pool); + + #endif diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 1424e02cef90..163f8415e5db 100644 --- a/include/net/gen_stats.h @@ -8319,7 +7565,7 @@ index 000000000000..67710bace741 + +#endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h -index d60e7c39d60c..40be4443b6bd 100644 +index 5b490b5591df..b2aecc88c796 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -10,6 +10,7 @@ @@ -8339,7 +7585,7 @@ index d60e7c39d60c..40be4443b6bd 100644 struct gnet_stats_queue qstats; unsigned long state; struct Qdisc *next_sched; -@@ -138,7 +139,11 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) +@@ -141,7 +142,11 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); @@ -8351,7 +7597,7 @@ index d60e7c39d60c..40be4443b6bd 100644 } static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) -@@ -162,17 +167,35 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) +@@ -165,17 +170,35 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) } else if (qdisc_is_running(qdisc)) { return false; } @@ -8362,7 +7608,7 @@ index d60e7c39d60c..40be4443b6bd 100644 + * Variant of write_seqcount_t_begin() telling lockdep that a + * trylock was attempted. 
+ */ -+ raw_write_seqcount_t_begin(s); ++ do_raw_write_seqcount_begin(s); + seqcount_acquire(&s->dep_map, 0, 1, _RET_IP_); + return true; + } @@ -8387,7 +7633,7 @@ index d60e7c39d60c..40be4443b6bd 100644 if (qdisc->flags & TCQ_F_NOLOCK) spin_unlock(&qdisc->seqlock); } -@@ -547,7 +570,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) +@@ -538,7 +561,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) return qdisc_lock(root); } @@ -8396,44 +7642,11 @@ index d60e7c39d60c..40be4443b6bd 100644 { struct Qdisc *root = qdisc_root_sleeping(qdisc); -diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h -index fec25b9cfbaf..f9bf9d8f30b8 100644 ---- a/include/trace/events/sched.h -+++ b/include/trace/events/sched.h -@@ -646,6 +646,18 @@ DECLARE_TRACE(sched_update_nr_running_tp, - TP_PROTO(struct rq *rq, int change), - TP_ARGS(rq, change)); - -+DECLARE_TRACE(sched_migrate_disable_tp, -+ TP_PROTO(struct task_struct *p), -+ TP_ARGS(p)); -+ -+DECLARE_TRACE(sched_migrate_enable_tp, -+ TP_PROTO(struct task_struct *p), -+ TP_ARGS(p)); -+ -+DECLARE_TRACE(sched_migrate_pull_tp, -+ TP_PROTO(struct task_struct *p), -+ TP_ARGS(p)); -+ - #endif /* _TRACE_SCHED_H */ - - /* This part must be outside protection */ diff --git a/init/Kconfig b/init/Kconfig -index d6a0b31b13dc..c48887283f88 100644 +index 29ad68325028..77d356fa8668 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -682,7 +682,8 @@ config IKHEADERS - - config LOG_BUF_SHIFT - int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" -- range 12 25 -+ range 12 25 if !H8300 -+ range 12 19 if H8300 - default 17 - depends on PRINTK - help -@@ -964,6 +965,7 @@ config CFS_BANDWIDTH +@@ -968,6 +968,7 @@ config CFS_BANDWIDTH config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" depends on CGROUP_SCHED @@ -8441,7 +7654,7 @@ index d6a0b31b13dc..c48887283f88 100644 default n help This feature lets you explicitly allocate real CPU bandwidth -@@ -1871,6 +1873,7 @@ choice +@@ -1884,6 +1885,7 @@ choice config SLAB bool "SLAB" @@ -8449,7 +7662,7 @@ index d6a0b31b13dc..c48887283f88 100644 select HAVE_HARDENED_USERCOPY_ALLOCATOR help The regular slab allocator that is established and known to work -@@ -1891,6 +1894,7 @@ config SLUB +@@ -1904,6 +1906,7 @@ config SLUB config SLOB depends on EXPERT bool "SLOB (Simple Allocator)" @@ -8457,15 +7670,15 @@ index d6a0b31b13dc..c48887283f88 100644 help SLOB replaces the stock allocator with a drastically simpler allocator. SLOB is generally more space efficient but -@@ -1957,7 +1961,7 @@ config SHUFFLE_PAGE_ALLOCATOR +@@ -1969,7 +1972,7 @@ config SHUFFLE_PAGE_ALLOCATOR + Say Y if unsure. 
config SLUB_CPU_PARTIAL - default y -- depends on SLUB && SMP -+ depends on SLUB && SMP && !PREEMPT_RT +- default y ++ default y if !PREEMPT_RT + depends on SLUB && SMP bool "SLUB per cpu partial cache" help - Per cpu partial caches accelerate objects allocation and freeing diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 3de8fd11873b..4198f0273ecd 100644 --- a/kernel/Kconfig.locks @@ -8480,7 +7693,7 @@ index 3de8fd11873b..4198f0273ecd 100644 config ARCH_HAS_MMIOWB bool diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index bf82259cff96..c1203c14fee9 100644 +index bf82259cff96..b5cd1e278eb5 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,5 +1,11 @@ @@ -8495,8 +7708,16 @@ index bf82259cff96..c1203c14fee9 100644 choice prompt "Preemption Model" default PREEMPT_NONE +@@ -59,6 +65,7 @@ config PREEMPT_RT + bool "Fully Preemptible Kernel (Real-Time)" + depends on EXPERT && ARCH_SUPPORTS_RT + select PREEMPTION ++ select RT_MUTEXES + help + This option turns the kernel into a real-time kernel by replacing + various locking primitives (spinlocks, rwlocks, etc.) with diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index 642415b8c3c9..daca5e74d75e 100644 +index 53c70c470a38..8f4b2b9aa06c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -345,7 +345,7 @@ void cpuset_read_unlock(void) @@ -8508,7 +7729,7 @@ index 642415b8c3c9..daca5e74d75e 100644 static struct workqueue_struct *cpuset_migrate_mm_wq; -@@ -1257,7 +1257,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, +@@ -1280,7 +1280,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. 
*/ @@ -8517,7 +7738,7 @@ index 642415b8c3c9..daca5e74d75e 100644 if (adding) { cpumask_or(parent->subparts_cpus, parent->subparts_cpus, tmp->addmask); -@@ -1276,7 +1276,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, +@@ -1299,7 +1299,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, } parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); @@ -8526,7 +7747,7 @@ index 642415b8c3c9..daca5e74d75e 100644 return cmd == partcmd_update; } -@@ -1381,7 +1381,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) +@@ -1404,7 +1404,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) continue; rcu_read_unlock(); @@ -8535,7 +7756,7 @@ index 642415b8c3c9..daca5e74d75e 100644 cpumask_copy(cp->effective_cpus, tmp->new_cpus); if (cp->nr_subparts_cpus && -@@ -1412,7 +1412,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) +@@ -1435,7 +1435,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) = cpumask_weight(cp->subparts_cpus); } } @@ -8544,7 +7765,7 @@ index 642415b8c3c9..daca5e74d75e 100644 WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); -@@ -1530,7 +1530,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, +@@ -1553,7 +1553,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return -EINVAL; } @@ -8553,7 +7774,7 @@ index 642415b8c3c9..daca5e74d75e 100644 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); /* -@@ -1541,7 +1541,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, +@@ -1564,7 +1564,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, cs->cpus_allowed); cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); } @@ -8562,7 +7783,7 @@ index 642415b8c3c9..daca5e74d75e 100644 update_cpumasks_hier(cs, &tmp); -@@ -1735,9 +1735,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) +@@ -1758,9 +1758,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) continue; rcu_read_unlock(); @@ -8574,7 +7795,7 @@ index 642415b8c3c9..daca5e74d75e 100644 WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); -@@ -1805,9 +1805,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, +@@ -1828,9 +1828,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; @@ -8586,7 +7807,7 @@ index 642415b8c3c9..daca5e74d75e 100644 /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); -@@ -1898,9 +1898,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, +@@ -1921,9 +1921,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) || (is_spread_page(cs) != is_spread_page(trialcs))); @@ -8598,7 +7819,7 @@ index 642415b8c3c9..daca5e74d75e 100644 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) rebuild_sched_domains_locked(); -@@ -2409,7 +2409,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) +@@ -2432,7 +2432,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) cpuset_filetype_t type = seq_cft(sf)->private; int ret = 0; @@ -8607,7 +7828,7 @@ index 642415b8c3c9..daca5e74d75e 100644 switch (type) { case FILE_CPULIST: -@@ -2431,7 +2431,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) 
+@@ -2454,7 +2454,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) ret = -EINVAL; } @@ -8616,7 +7837,7 @@ index 642415b8c3c9..daca5e74d75e 100644 return ret; } -@@ -2744,14 +2744,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) +@@ -2767,14 +2767,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); @@ -8633,7 +7854,7 @@ index 642415b8c3c9..daca5e74d75e 100644 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; -@@ -2778,12 +2778,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) +@@ -2801,12 +2801,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) } rcu_read_unlock(); @@ -8648,7 +7869,7 @@ index 642415b8c3c9..daca5e74d75e 100644 out_unlock: percpu_up_write(&cpuset_rwsem); put_online_cpus(); -@@ -2839,7 +2839,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) +@@ -2862,7 +2862,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { percpu_down_write(&cpuset_rwsem); @@ -8657,7 +7878,7 @@ index 642415b8c3c9..daca5e74d75e 100644 if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); -@@ -2850,7 +2850,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) +@@ -2873,7 +2873,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) top_cpuset.mems_allowed = top_cpuset.effective_mems; } @@ -8666,7 +7887,7 @@ index 642415b8c3c9..daca5e74d75e 100644 percpu_up_write(&cpuset_rwsem); } -@@ -2947,12 +2947,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, +@@ -2970,12 +2970,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, { bool is_empty; @@ -8681,7 +7902,7 @@ index 642415b8c3c9..daca5e74d75e 100644 /* * Don't call update_tasks_cpumask() if the cpuset becomes empty, -@@ -2989,10 +2989,10 @@ hotplug_update_tasks(struct cpuset *cs, +@@ -3012,10 +3012,10 @@ hotplug_update_tasks(struct cpuset *cs, if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; @@ -8694,7 +7915,7 @@ index 642415b8c3c9..daca5e74d75e 100644 if (cpus_updated) update_tasks_cpumask(cs); -@@ -3147,7 +3147,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) +@@ -3170,7 +3170,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { @@ -8703,7 +7924,7 @@ index 642415b8c3c9..daca5e74d75e 100644 if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); /* -@@ -3167,17 +3167,17 @@ static void cpuset_hotplug_workfn(struct work_struct *work) +@@ -3190,17 +3190,17 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } } cpumask_copy(top_cpuset.effective_cpus, &new_cpus); @@ -8724,7 +7945,7 @@ index 642415b8c3c9..daca5e74d75e 100644 update_tasks_nodemask(&top_cpuset); } -@@ -3278,11 +3278,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) +@@ -3301,11 +3301,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { unsigned long flags; @@ -8738,7 +7959,7 @@ index 642415b8c3c9..daca5e74d75e 100644 } /** -@@ -3343,11 +3343,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) +@@ -3366,11 +3366,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) nodemask_t mask; unsigned long flags; @@ -8752,7 +7973,7 @@ index 642415b8c3c9..daca5e74d75e 100644 return mask; } -@@ -3439,14 +3439,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) +@@ -3462,14 +3462,14 @@ bool __cpuset_node_allowed(int node, 
gfp_t gfp_mask) return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ @@ -8793,38 +8014,46 @@ index d51175cedfca..b424f3157b34 100644 /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || -diff --git a/kernel/cpu.c b/kernel/cpu.c -index 6ff2578ecf17..fa535eaa4826 100644 ---- a/kernel/cpu.c -+++ b/kernel/cpu.c -@@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { - .name = "ap:online", - }, - /* -- * Handled on controll processor until the plugged processor manages -+ * Handled on control processor until the plugged processor manages - * this itself. - */ - [CPUHP_TEARDOWN_CPU] = { -@@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states[] = { - .teardown.single = takedown_cpu, - .cant_stop = true, - }, -+ -+ [CPUHP_AP_SCHED_WAIT_EMPTY] = { -+ .name = "sched:waitempty", -+ .startup.single = NULL, -+ .teardown.single = sched_cpu_wait_empty, -+ }, -+ - /* Handle smpboot threads park/unpark */ - [CPUHP_AP_SMPBOOT_THREADS] = { - .name = "smpboot/threads:online", +diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c +index 930ac1b25ec7..dbf1d126ac5e 100644 +--- a/kernel/debug/kdb/kdb_main.c ++++ b/kernel/debug/kdb/kdb_main.c +@@ -2101,7 +2101,7 @@ static int kdb_dmesg(int argc, const char **argv) + int adjust = 0; + int n = 0; + int skip = 0; +- struct kmsg_dumper dumper = { .active = 1 }; ++ struct kmsg_dumper_iter iter = { .active = 1 }; + size_t len; + char buf[201]; + +@@ -2126,8 +2126,8 @@ static int kdb_dmesg(int argc, const char **argv) + kdb_set(2, setargs); + } + +- kmsg_dump_rewind_nolock(&dumper); +- while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) ++ kmsg_dump_rewind(&iter); ++ while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL)) + n++; + + if (lines < 0) { +@@ -2159,8 +2159,8 @@ static int kdb_dmesg(int argc, const char **argv) + if (skip >= n || skip < 0) + return 0; + +- kmsg_dump_rewind_nolock(&dumper); +- while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { ++ kmsg_dump_rewind(&iter); ++ while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) { + if (skip) { + skip--; + continue; diff --git a/kernel/entry/common.c b/kernel/entry/common.c -index 6fdb6105e6d6..adbfcef76991 100644 +index f9d491b17b78..50ba2857c286 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c -@@ -148,9 +148,17 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, +@@ -158,9 +158,17 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_enable_exit_to_user(ti_work); @@ -8843,7 +8072,7 @@ index 6fdb6105e6d6..adbfcef76991 100644 if (ti_work & _TIF_UPROBE) uprobe_notify_resume(regs); -@@ -354,7 +362,7 @@ void irqentry_exit_cond_resched(void) +@@ -381,7 +389,7 @@ void irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); @@ -8853,10 +8082,10 @@ index 6fdb6105e6d6..adbfcef76991 100644 } } diff --git a/kernel/exit.c b/kernel/exit.c -index 733e80f334e7..a1756dcc17f2 100644 +index 04029e35e69a..346f7b76ceca 100644 --- a/kernel/exit.c +++ b/kernel/exit.c -@@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk) +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 
*/ @@ -8866,7 +8095,7 @@ index 733e80f334e7..a1756dcc17f2 100644 spin_unlock(&sighand->siglock); diff --git a/kernel/fork.c b/kernel/fork.c -index da8d360fb032..2cf99526192e 100644 +index a2addc21d63f..11e5d05c9640 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ @@ -8877,7 +8106,7 @@ index da8d360fb032..2cf99526192e 100644 #include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> -@@ -287,7 +288,7 @@ static inline void free_thread_stack(struct task_struct *tsk) +@@ -288,7 +289,7 @@ static inline void free_thread_stack(struct task_struct *tsk) return; } @@ -8886,7 +8115,7 @@ index da8d360fb032..2cf99526192e 100644 return; } #endif -@@ -687,6 +688,19 @@ void __mmdrop(struct mm_struct *mm) +@@ -689,6 +690,19 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); @@ -8906,7 +8135,7 @@ index da8d360fb032..2cf99526192e 100644 static void mmdrop_async_fn(struct work_struct *work) { struct mm_struct *mm; -@@ -728,6 +742,15 @@ void __put_task_struct(struct task_struct *tsk) +@@ -730,6 +744,15 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); @@ -8919,10 +8148,10 @@ index da8d360fb032..2cf99526192e 100644 + /* Task is done with its stack. */ + put_task_stack(tsk); + + io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); - security_task_free(tsk); -@@ -924,6 +947,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) +@@ -927,6 +950,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; @@ -8930,7 +8159,7 @@ index da8d360fb032..2cf99526192e 100644 account_kernel_stack(tsk, 1); -@@ -1970,6 +1994,7 @@ static __latent_entropy struct task_struct *copy_process( +@@ -1993,6 +2017,7 @@ static __latent_entropy struct task_struct *copy_process( spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); @@ -8939,10 +8168,10 @@ index da8d360fb032..2cf99526192e 100644 p->utime = p->stime = p->gtime = 0; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME diff --git a/kernel/futex.c b/kernel/futex.c -index a5876694a60e..1ca3581043fe 100644 +index 45a13eb8894e..a1af87a21c03 100644 --- a/kernel/futex.c +++ b/kernel/futex.c -@@ -1479,6 +1479,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ +@@ -1497,6 +1497,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ struct task_struct *new_owner; bool postunlock = false; DEFINE_WAKE_Q(wake_q); @@ -8950,14 +8179,15 @@ index a5876694a60e..1ca3581043fe 100644 int ret = 0; new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); -@@ -1538,13 +1539,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ - pi_state->owner = new_owner; - raw_spin_unlock(&new_owner->pi_lock); +@@ -1546,14 +1547,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ + * not fail. 
+ */ + pi_state_update_owner(pi_state, new_owner); +- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); ++ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q, ++ &wake_sleeper_q); + } -- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); -- -+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q, -+ &wake_sleeper_q); out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); @@ -8967,7 +8197,7 @@ index a5876694a60e..1ca3581043fe 100644 return ret; } -@@ -2145,6 +2146,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, +@@ -2154,6 +2156,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, */ requeue_pi_wake_futex(this, &key2, hb2); continue; @@ -8984,7 +8214,7 @@ index a5876694a60e..1ca3581043fe 100644 } else if (ret) { /* * rt_mutex_start_proxy_lock() detected a -@@ -2830,7 +2841,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, +@@ -2847,7 +2859,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, goto no_block; } @@ -8993,16 +8223,16 @@ index a5876694a60e..1ca3581043fe 100644 /* * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not -@@ -3171,7 +3182,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, +@@ -3172,7 +3184,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + { struct hrtimer_sleeper timeout, *to; - struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; + struct futex_hash_bucket *hb, *hb2; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; int res, ret; -@@ -3192,7 +3203,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, +@@ -3193,7 +3205,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ @@ -9011,7 +8241,7 @@ index a5876694a60e..1ca3581043fe 100644 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) -@@ -3223,20 +3234,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, +@@ -3224,20 +3236,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); @@ -9078,7 +8308,7 @@ index a5876694a60e..1ca3581043fe 100644 /* Check if the requeue code acquired the second futex for us. */ if (!q.rt_waiter) { -@@ -3245,7 +3291,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, +@@ -3246,14 +3293,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * did a lock-steal - fix up the PI-state in that case. */ if (q.pi_state && (q.pi_state->owner != current)) { @@ -9086,18 +8316,18 @@ index a5876694a60e..1ca3581043fe 100644 + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; -@@ -3256,7 +3303,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + /* + * Drop the reference to the pi state which * the requeue_pi() code acquired for us. 
*/ put_pi_state(q.pi_state); - spin_unlock(q.lock_ptr); + spin_unlock(&hb2->lock); - } - } else { - struct rt_mutex *pi_mutex; -@@ -3270,7 +3317,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ++ + /* + * Adjust the return value. It's either -EFAULT or + * success (1) but the caller expects 0 for success. +@@ -3272,7 +3321,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); @@ -9130,10 +8360,19 @@ index 762a928e18f9..7929fcdb7817 100644 if (!noirqdebug) note_interrupt(desc, retval); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c -index 5df903fccb60..881e13ec9709 100644 +index dec3f73e8db9..b279a8683c38 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c -@@ -1175,6 +1175,12 @@ static int irq_thread(void *data) +@@ -1225,6 +1225,8 @@ static int irq_thread(void *data) + irqreturn_t (*handler_fn)(struct irq_desc *desc, + struct irqaction *action); + ++ sched_set_fifo(current); ++ + if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; +@@ -1245,6 +1247,12 @@ static int irq_thread(void *data) if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); @@ -9146,7 +8385,16 @@ index 5df903fccb60..881e13ec9709 100644 wake_threads_waitq(desc); } -@@ -2711,7 +2717,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); +@@ -1390,8 +1398,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) + if (IS_ERR(t)) + return PTR_ERR(t); + +- sched_set_fifo(t); +- + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code +@@ -2781,7 +2787,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); * This call sets the internal irqchip state of an interrupt, * depending on the value of @which. 
* @@ -9182,7 +8430,7 @@ index f865e5f4d382..dc7311dd74b1 100644 printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); diff --git a/kernel/irq_work.c b/kernel/irq_work.c -index eca83965b631..8183d30e1bb1 100644 +index e8da1e71583a..c3455910196f 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -18,6 +18,7 @@ @@ -9193,48 +8441,58 @@ index eca83965b631..8183d30e1bb1 100644 #include <asm/processor.h> -@@ -52,13 +53,19 @@ void __weak arch_irq_work_raise(void) +@@ -52,13 +53,27 @@ void __weak arch_irq_work_raise(void) /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { -+ struct llist_head *list; -+ bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT); -+ -+ lazy_work = atomic_read(&work->flags) & IRQ_WORK_LAZY; -+ - /* If the work is "lazy", handle it from next tick if any */ -- if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { -- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && +- /* If the work is "lazy", handle it from next tick if any */ +- if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { +- if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - arch_irq_work_raise(); - } else { -- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) -+ if (lazy_work || (realtime && !(atomic_read(&work->flags) & IRQ_WORK_HARD_IRQ))) +- if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) ++ struct llist_head *list; ++ bool lazy_work; ++ int work_flags; ++ ++ work_flags = atomic_read(&work->node.a_flags); ++ if (work_flags & IRQ_WORK_LAZY) ++ lazy_work = true; ++ else if (IS_ENABLED(CONFIG_PREEMPT_RT) && ++ !(work_flags & IRQ_WORK_HARD_IRQ)) ++ lazy_work = true; ++ else ++ lazy_work = false; ++ ++ if (lazy_work) + list = this_cpu_ptr(&lazy_list); + else + list = this_cpu_ptr(&raised_list); + -+ if (llist_add(&work->llnode, list)) { ++ if (llist_add(&work->node.llist, list)) { ++ /* If the work is "lazy", handle it from next tick if any */ + if (!lazy_work || tick_nohz_tick_stopped()) arch_irq_work_raise(); } } -@@ -102,7 +109,13 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) +@@ -102,7 +117,14 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); -- __smp_call_single_queue(cpu, &work->llnode); +- __smp_call_single_queue(cpu, &work->node.llist); + -+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->flags) & IRQ_WORK_HARD_IRQ)) { -+ if (llist_add(&work->llnode, &per_cpu(lazy_list, cpu))) ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { ++ if (llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) ++ /* && tick_nohz_tick_stopped_cpu(cpu) */ + arch_send_call_function_single_ipi(cpu); + } else { -+ __smp_call_single_queue(cpu, &work->llnode); ++ __smp_call_single_queue(cpu, &work->node.llist); + } } else { __irq_work_queue_local(work); } -@@ -120,9 +133,8 @@ bool irq_work_needs_cpu(void) +@@ -120,9 +142,8 @@ bool irq_work_needs_cpu(void) raised = this_cpu_ptr(&raised_list); lazy = this_cpu_ptr(&lazy_list); @@ -9246,7 +8504,7 @@ index eca83965b631..8183d30e1bb1 100644 /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); -@@ -160,8 +172,12 @@ static void irq_work_run_list(struct llist_head *list) +@@ -165,8 +186,12 @@ static void irq_work_run_list(struct llist_head *list) struct 
irq_work *work, *tmp; struct llist_node *llnode; @@ -9260,7 +8518,7 @@ index eca83965b631..8183d30e1bb1 100644 if (llist_empty(list)) return; -@@ -177,7 +193,16 @@ static void irq_work_run_list(struct llist_head *list) +@@ -182,7 +207,16 @@ static void irq_work_run_list(struct llist_head *list) void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); @@ -9278,7 +8536,7 @@ index eca83965b631..8183d30e1bb1 100644 } EXPORT_SYMBOL_GPL(irq_work_run); -@@ -187,8 +212,17 @@ void irq_work_tick(void) +@@ -192,8 +226,17 @@ void irq_work_tick(void) if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); @@ -9297,10 +8555,10 @@ index eca83965b631..8183d30e1bb1 100644 /* * Synchronize against the irq_work @entry, ensures the entry is not diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c -index c19c0dad1ebe..c85754463de6 100644 +index aa919585c24b..01935bb729de 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c -@@ -978,7 +978,6 @@ void crash_kexec(struct pt_regs *regs) +@@ -977,7 +977,6 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ @@ -9338,6 +8596,54 @@ index 35859da8bd4f..dfff31ed644a 100644 #endif NULL }; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 1578973c5740..bb0602597ffd 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -243,6 +243,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); + + static int kthread(void *_create) + { ++ static const struct sched_param param = { .sched_priority = 0 }; + /* Copy data: it's on kthread's stack */ + struct kthread_create_info *create = _create; + int (*threadfn)(void *data) = create->threadfn; +@@ -273,6 +274,13 @@ static int kthread(void *_create) + init_completion(&self->parked); + current->vfork_done = &self->exited; + ++ /* ++ * The new thread inherited kthreadd's priority and CPU mask. Reset ++ * back to default in case they have been changed. ++ */ ++ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); ++ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); ++ + /* OK, tell user we're spawned, wait for stop or wakeup */ + __set_current_state(TASK_UNINTERRUPTIBLE); + create->result = current; +@@ -370,7 +378,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + } + task = create->result; + if (!IS_ERR(task)) { +- static const struct sched_param param = { .sched_priority = 0 }; + char name[TASK_COMM_LEN]; + + /* +@@ -379,13 +386,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + */ + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(task, name); +- /* +- * root may have changed our (kthreadd's) priority or CPU mask. +- * The kernel thread should not inherit these properties. 
+- */ +- sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); +- set_cpus_allowed_ptr(task, +- housekeeping_cpumask(HK_FLAG_KTHREAD)); + } + kfree(create); + return task; diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6d11cfb9b41f..c7fbf737e16e 100644 --- a/kernel/locking/Makefile @@ -9378,10 +8684,10 @@ index 6d11cfb9b41f..c7fbf737e16e 100644 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index 2facbbd146ec..b870708af581 100644 +index ff0003146262..746ba441ed7a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c -@@ -4896,6 +4896,7 @@ static void check_flags(unsigned long flags) +@@ -5292,6 +5292,7 @@ static noinstr void check_flags(unsigned long flags) } } @@ -9389,7 +8695,7 @@ index 2facbbd146ec..b870708af581 100644 /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only -@@ -4910,6 +4911,7 @@ static void check_flags(unsigned long flags) +@@ -5306,6 +5307,7 @@ static noinstr void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } @@ -9399,10 +8705,10 @@ index 2facbbd146ec..b870708af581 100644 print_irqtrace_events(current); diff --git a/kernel/locking/mutex-rt.c b/kernel/locking/mutex-rt.c new file mode 100644 -index 000000000000..35b06711997d +index 000000000000..2b849e6b9b4a --- /dev/null +++ b/kernel/locking/mutex-rt.c -@@ -0,0 +1,222 @@ +@@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Real-Time Preemption Support @@ -9470,6 +8776,7 @@ index 000000000000..35b06711997d +#include <linux/fs.h> +#include <linux/futex.h> +#include <linux/hrtimer.h> ++#include <linux/blkdev.h> + +#include "rtmutex_common.h" + @@ -9490,29 +8797,43 @@ index 000000000000..35b06711997d +} +EXPORT_SYMBOL(__mutex_do_init); + ++static int _mutex_lock_blk_flush(struct mutex *lock, int state) ++{ ++ /* ++ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too ++ * late if one of the callbacks needs to acquire a sleeping lock. 
++ */ ++ if (blk_needs_flush_plug(current)) ++ blk_schedule_flush_plug(current); ++ return __rt_mutex_lock_state(&lock->lock, state); ++} ++ +void __lockfunc _mutex_lock(struct mutex *lock) +{ + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); -+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); ++ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(_mutex_lock); + -+void __lockfunc _mutex_lock_io(struct mutex *lock) ++void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass) +{ + int token; + + token = io_schedule_prepare(); -+ _mutex_lock(lock); ++ ++ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); ++ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); ++ + io_schedule_finish(token); +} -+EXPORT_SYMBOL_GPL(_mutex_lock_io); ++EXPORT_SYMBOL_GPL(_mutex_lock_io_nested); + +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); -+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE); ++ ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; @@ -9524,7 +8845,7 @@ index 000000000000..35b06711997d + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); -+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE); ++ ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; @@ -9535,27 +8856,14 @@ index 000000000000..35b06711997d +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) +{ + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); -+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); ++ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(_mutex_lock_nested); + -+void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ -+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); -+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); -+ -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL_GPL(_mutex_lock_io_nested); -+ +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_); -+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); ++ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(_mutex_lock_nest_lock); + @@ -9564,7 +8872,7 @@ index 000000000000..35b06711997d + int ret; + + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); -+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE); ++ ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; @@ -9576,7 +8884,7 @@ index 000000000000..35b06711997d + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); -+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE); ++ ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; @@ -9790,7 +9098,7 @@ index fc549713bba3..659e93e256c6 100644 - debug_rt_mutex_print_deadlock(w); -} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index cfdd5b93264d..ef22e1b52f8c 100644 +index 2f8cd616d3b2..4ea87d6c9ab7 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -8,6 +8,11 @@ @@ -10492,15 +9800,15 @@ index cfdd5b93264d..ef22e1b52f8c 100644 + * Not quite done after calling ww_acquire_done() ? 
+ */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); -+ + +- rt_mutex_init_waiter(&waiter); + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); - -- rt_mutex_init_waiter(&waiter); ++ + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? @@ -10791,42 +10099,37 @@ index cfdd5b93264d..ef22e1b52f8c 100644 } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); -@@ -1535,35 +2035,31 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) +@@ -1534,36 +2034,17 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) + return __rt_mutex_slowtrylock(lock); } - /** +-/** - * rt_mutex_timed_lock - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller -+ * rt_mutex_lock_killable - lock a rt_mutex killable - * - * @lock: the rt_mutex to be locked +- * +- * @lock: the rt_mutex to be locked - * @timeout: timeout structure or NULL (no timeout) - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal +- * +- * Returns: +- * 0 on success +- * -EINTR when interrupted by a signal - * -ETIMEDOUT when the timeout expired - */ +- */ -int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) -+int __sched rt_mutex_lock_killable(struct rt_mutex *lock) ++int __sched __rt_mutex_trylock(struct rt_mutex *lock) { - int ret; - - might_sleep(); -+ return rt_mutex_lock_state(lock, 0, TASK_KILLABLE); -+} -+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); - +- - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - RT_MUTEX_MIN_CHAINWALK, - rt_mutex_slowlock); - if (ret) - mutex_release(&lock->dep_map, _RET_IP_); -+int __sched __rt_mutex_trylock(struct rt_mutex *lock) -+{ +#ifdef CONFIG_PREEMPT_RT + if (WARN_ON_ONCE(in_irq() || in_nmi())) +#else @@ -10841,7 +10144,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 /** * rt_mutex_trylock - try to lock a rt_mutex -@@ -1580,10 +2076,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) +@@ -1580,10 +2061,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) { int ret; @@ -10853,7 +10156,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 if (ret) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); -@@ -1591,6 +2084,11 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) +@@ -1591,6 +2069,11 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) } EXPORT_SYMBOL_GPL(rt_mutex_trylock); @@ -10865,7 +10168,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 /** * rt_mutex_unlock - unlock a rt_mutex * -@@ -1599,16 +2097,13 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); +@@ -1599,16 +2082,13 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); void __sched rt_mutex_unlock(struct rt_mutex *lock) { mutex_release(&lock->dep_map, _RET_IP_); @@ -10886,7 +10189,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 { lockdep_assert_held(&lock->wait_lock); -@@ -1625,23 +2120,35 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, +@@ -1625,23 +2105,35 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, * avoid inversion prior to the wakeup. preempt_disable() * therein pairs with rt_mutex_postunlock(). 
*/ @@ -10925,7 +10228,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 } /** -@@ -1655,9 +2162,6 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +@@ -1655,9 +2147,6 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) void rt_mutex_destroy(struct rt_mutex *lock) { WARN_ON(rt_mutex_is_locked(lock)); @@ -10935,7 +10238,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 } EXPORT_SYMBOL_GPL(rt_mutex_destroy); -@@ -1680,7 +2184,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, +@@ -1680,7 +2169,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, if (name && key) debug_rt_mutex_init(lock, name, key); } @@ -10944,7 +10247,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 /** * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a -@@ -1700,6 +2204,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, +@@ -1700,6 +2189,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { __rt_mutex_init(lock, NULL, NULL); @@ -10959,7 +10262,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); } -@@ -1723,6 +2235,26 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, +@@ -1722,6 +2219,26 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock) rt_mutex_set_owner(lock, NULL); } @@ -10986,7 +10289,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 /** * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task * @lock: the rt_mutex to take -@@ -1753,6 +2285,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, +@@ -1752,6 +2269,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (try_to_take_rt_mutex(lock, task, NULL)) return 1; @@ -11021,7 +10324,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, RT_MUTEX_FULL_CHAINWALK); -@@ -1767,7 +2327,8 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, +@@ -1766,7 +2311,8 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, ret = 0; } @@ -11031,7 +10334,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 return ret; } -@@ -1852,12 +2413,15 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, +@@ -1851,12 +2397,15 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, raw_spin_lock_irq(&lock->wait_lock); /* sleep on the mutex */ set_current_state(TASK_INTERRUPTIBLE); @@ -11048,7 +10351,7 @@ index cfdd5b93264d..ef22e1b52f8c 100644 raw_spin_unlock_irq(&lock->wait_lock); return ret; -@@ -1919,3 +2483,97 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, +@@ -1918,3 +2467,97 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, return cleanup; } @@ -11167,7 +10470,7 @@ index 732f96abf462..338ccd29119a 100644 enum rtmutex_chainwalk walk) { diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h -index d1d62f942be2..407f3da146cb 100644 +index ca6fb489007b..248a7d91583b 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -15,6 +15,7 @@ @@ -11192,7 +10495,7 @@ index d1d62f942be2..407f3da146cb 100644 u64 deadline; }; -@@ -130,12 +127,15 @@ enum rtmutex_chainwalk { +@@ -130,11 +127,14 @@ enum rtmutex_chainwalk { /* * PI-futex support (proxy locking functions, etc.): */ @@ -11202,14 +10505,13 @@ index d1d62f942be2..407f3da146cb 100644 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); - extern void 
rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner); + extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savetate); extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); -@@ -153,9 +153,27 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex *l); +@@ -152,9 +152,27 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, @@ -11242,7 +10544,7 @@ index d1d62f942be2..407f3da146cb 100644 # include "rtmutex-debug.h" diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c new file mode 100644 -index 000000000000..4cd72a2968a6 +index 000000000000..3d2d1f14b513 --- /dev/null +++ b/kernel/locking/rwlock-rt.c @@ -0,0 +1,334 @@ @@ -11309,7 +10611,7 @@ index 000000000000..4cd72a2968a6 + lock->rtmutex.save_state = 1; +} + -+int __read_rt_trylock(struct rt_rw_lock *lock) ++static int __read_rt_trylock(struct rt_rw_lock *lock) +{ + int r, old; + @@ -11582,15 +10884,16 @@ index 000000000000..4cd72a2968a6 +EXPORT_SYMBOL(__rt_rwlock_init); diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c new file mode 100644 -index 000000000000..bca7a448206d +index 000000000000..274172d5bb3a --- /dev/null +++ b/kernel/locking/rwsem-rt.c -@@ -0,0 +1,292 @@ +@@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/rwsem.h> +#include <linux/sched/debug.h> +#include <linux/sched/signal.h> +#include <linux/export.h> ++#include <linux/blkdev.h> + +#include "rtmutex_common.h" + @@ -11675,6 +10978,13 @@ index 000000000000..bca7a448206d + if (__down_read_trylock(sem)) + return 0; + ++ /* ++ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too ++ * late if one of the callbacks needs to acquire a sleeping lock. ++ */ ++ if (blk_needs_flush_plug(current)) ++ blk_schedule_flush_plug(current); ++ + might_sleep(); + raw_spin_lock_irq(&m->wait_lock); + /* @@ -11744,6 +11054,17 @@ index 000000000000..bca7a448206d + WARN_ON_ONCE(ret); +} + ++int __down_read_interruptible(struct rw_semaphore *sem) ++{ ++ int ret; ++ ++ ret = __down_read_common(sem, TASK_INTERRUPTIBLE); ++ if (likely(!ret)) ++ return ret; ++ WARN_ONCE(ret != -EINTR, "Unexpected state: %d\n", ret); ++ return -EINTR; ++} ++ +int __down_read_killable(struct rw_semaphore *sem) +{ + int ret; @@ -11797,6 +11118,13 @@ index 000000000000..bca7a448206d + struct rt_mutex *m = &sem->rtmutex; + unsigned long flags; + ++ /* ++ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too ++ * late if one of the callbacks needs to acquire a sleeping lock. 
++ */ ++ if (blk_needs_flush_plug(current)) ++ blk_schedule_flush_plug(current); ++ + /* Take the rtmutex as a first step */ + if (__rt_mutex_lock_state(m, state)) + return -EINTR; @@ -11879,7 +11207,7 @@ index 000000000000..bca7a448206d + __up_write_unlock(sem, WRITER_BIAS - 1, flags); +} diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index f11b9bd3431d..fce8a6e3fa7c 100644 +index ba67600c7b2c..084948b9e03f 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -28,6 +28,7 @@ @@ -11890,7 +11218,7 @@ index f11b9bd3431d..fce8a6e3fa7c 100644 #include "lock_events.h" /* -@@ -1482,6 +1483,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) +@@ -1343,6 +1344,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) if (tmp & RWSEM_FLAG_WAITERS) rwsem_downgrade_wake(sem); } @@ -11898,7 +11226,7 @@ index f11b9bd3431d..fce8a6e3fa7c 100644 /* * lock for reading -@@ -1617,7 +1619,9 @@ void down_read_non_owner(struct rw_semaphore *sem) +@@ -1506,7 +1508,9 @@ void down_read_non_owner(struct rw_semaphore *sem) { might_sleep(); __down_read(sem); @@ -11908,7 +11236,7 @@ index f11b9bd3431d..fce8a6e3fa7c 100644 } EXPORT_SYMBOL(down_read_non_owner); -@@ -1646,7 +1650,9 @@ EXPORT_SYMBOL(down_write_killable_nested); +@@ -1535,7 +1539,9 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { @@ -11986,11 +11314,105 @@ index b9d93087ee66..72e306e0e8a3 100644 } + +#endif +diff --git a/kernel/notifier.c b/kernel/notifier.c +index 1b019cbca594..c20782f07643 100644 +--- a/kernel/notifier.c ++++ b/kernel/notifier.c +@@ -142,9 +142,9 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, + unsigned long flags; + int ret; + +- spin_lock_irqsave(&nh->lock, flags); ++ raw_spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_register(&nh->head, n); +- spin_unlock_irqrestore(&nh->lock, flags); ++ raw_spin_unlock_irqrestore(&nh->lock, flags); + return ret; + } + EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); +@@ -164,9 +164,9 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, + unsigned long flags; + int ret; + +- spin_lock_irqsave(&nh->lock, flags); ++ raw_spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_unregister(&nh->head, n); +- spin_unlock_irqrestore(&nh->lock, flags); ++ raw_spin_unlock_irqrestore(&nh->lock, flags); + synchronize_rcu(); + return ret; + } +@@ -182,9 +182,9 @@ int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, + * Musn't use RCU; because then the notifier list can + * change between the up and down traversal. + */ +- spin_lock_irqsave(&nh->lock, flags); ++ raw_spin_lock_irqsave(&nh->lock, flags); + ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); +- spin_unlock_irqrestore(&nh->lock, flags); ++ raw_spin_unlock_irqrestore(&nh->lock, flags); + + return ret; + } diff --git a/kernel/panic.c b/kernel/panic.c -index aef8872ba843..d563542bc7eb 100644 +index 332736a72a58..a14e2f5a9f55 100644 --- a/kernel/panic.c +++ b/kernel/panic.c -@@ -247,7 +247,6 @@ void panic(const char *fmt, ...) +@@ -177,12 +177,28 @@ static void panic_print_sys_info(void) + void panic(const char *fmt, ...) 
+ { + static char buf[1024]; ++ va_list args2; + va_list args; + long i, i_next = 0, len; + int state = 0; + int old_cpu, this_cpu; + bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + ++ console_verbose(); ++ pr_emerg("Kernel panic - not syncing:\n"); ++ va_start(args2, fmt); ++ va_copy(args, args2); ++ vprintk(fmt, args2); ++ va_end(args2); ++#ifdef CONFIG_DEBUG_BUGVERBOSE ++ /* ++ * Avoid nested stack-dumping if a panic occurs during oops processing ++ */ ++ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) ++ dump_stack(); ++#endif ++ pr_flush(1000, true); ++ + /* + * Disable local interrupts. This will prevent panic_smp_self_stop + * from deadlocking the first cpu that invokes the panic, since +@@ -213,24 +229,13 @@ void panic(const char *fmt, ...) + if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) + panic_smp_self_stop(); + +- console_verbose(); + bust_spinlocks(1); +- va_start(args, fmt); + len = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + +- pr_emerg("Kernel panic - not syncing: %s\n", buf); +-#ifdef CONFIG_DEBUG_BUGVERBOSE +- /* +- * Avoid nested stack-dumping if a panic occurs during oops processing +- */ +- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) +- dump_stack(); +-#endif +- + /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left +@@ -247,7 +252,6 @@ void panic(const char *fmt, ...) * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!_crash_kexec_post_notifiers) { @@ -11998,7 +11420,7 @@ index aef8872ba843..d563542bc7eb 100644 __crash_kexec(NULL); /* -@@ -271,8 +270,6 @@ void panic(const char *fmt, ...) +@@ -271,8 +275,6 @@ void panic(const char *fmt, ...) 
*/ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); @@ -12007,7 +11429,7 @@ index aef8872ba843..d563542bc7eb 100644 kmsg_dump(KMSG_DUMP_PANIC); /* -@@ -542,9 +539,11 @@ static u64 oops_id; +@@ -542,9 +544,11 @@ static u64 oops_id; static int init_oops_id(void) { @@ -12019,19 +11441,27 @@ index aef8872ba843..d563542bc7eb 100644 oops_id++; return 0; +@@ -555,6 +559,7 @@ static void print_oops_end_marker(void) + { + init_oops_id(); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); ++ pr_flush(1000, true); + } + + /* diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile -index 4d052fc6bcde..59cb24e25f00 100644 +index eee3dc9b60a9..59cb24e25f00 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile -@@ -1,4 +1,4 @@ +@@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y = printk.o -obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o -+obj-$(CONFIG_PRINTK) += printk_ringbuffer.o + obj-$(CONFIG_PRINTK) += printk_ringbuffer.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h deleted file mode 100644 -index 660f9a6bf73a..000000000000 +index 3a8fd491758c..000000000000 --- a/kernel/printk/internal.h +++ /dev/null @@ -1,74 +0,0 @@ @@ -12051,9 +11481,9 @@ index 660f9a6bf73a..000000000000 - -extern raw_spinlock_t logbuf_lock; - --__printf(5, 0) +-__printf(4, 0) -int vprintk_store(int facility, int level, -- const char *dict, size_t dictlen, +- const struct dev_printk_info *dev_info, - const char *fmt, va_list args); - -__printf(1, 0) int vprintk_default(const char *fmt, va_list args); @@ -12110,44 +11540,28 @@ index 660f9a6bf73a..000000000000 -static inline bool printk_percpu_data_ready(void) { return false; } -#endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c -index 9b75f6bfc333..78a277ea5c35 100644 +index 5a95c688621f..a5fc854977bb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c -@@ -44,9 +44,12 @@ +@@ -44,6 +44,9 @@ #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> +#include <linux/kthread.h> ++#include <linux/kdb.h> +#include <linux/clocksource.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> -+#include <linux/kdb.h> - - #include <linux/uaccess.h> - #include <asm/sections.h> -@@ -55,9 +58,9 @@ - #define CREATE_TRACE_POINTS - #include <trace/events/printk.h> - -+#include "printk_ringbuffer.h" +@@ -58,7 +61,6 @@ + #include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" -#include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ -@@ -77,6 +80,9 @@ EXPORT_SYMBOL(ignore_console_lock_warning); - int oops_in_progress; - EXPORT_SYMBOL(oops_in_progress); - -+/* Set to enable sync mode. Once set, it is never cleared. 
*/ -+static bool sync_mode; -+ - /* - * console_sem protects the console_drivers list, and also - * provides serialisation for access to the entire console -@@ -224,19 +230,7 @@ static int nr_ext_console_drivers; +@@ -225,19 +227,7 @@ static int nr_ext_console_drivers; static int __down_trylock_console_sem(unsigned long ip) { @@ -12168,7 +11582,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; -@@ -245,13 +239,9 @@ static int __down_trylock_console_sem(unsigned long ip) +@@ -246,13 +236,9 @@ static int __down_trylock_console_sem(unsigned long ip) static void __up_console_sem(unsigned long ip) { @@ -12182,7 +11596,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 } #define up_console_sem() __up_console_sem(_RET_IP_) -@@ -265,11 +255,6 @@ static void __up_console_sem(unsigned long ip) +@@ -266,11 +252,6 @@ static void __up_console_sem(unsigned long ip) */ static int console_locked, console_suspended; @@ -12194,120 +11608,20 @@ index 9b75f6bfc333..78a277ea5c35 100644 /* * Array of consoles built from command line options (console=) */ -@@ -294,30 +279,22 @@ enum con_msg_format_flags { - static int console_msg_format = MSG_FORMAT_DEFAULT; - - /* -- * The printk log buffer consists of a chain of concatenated variable -- * length records. Every record starts with a record header, containing -- * the overall length of the record. -- * -- * The heads to the first and last entry in the buffer, as well as the -- * sequence numbers of these entries are maintained when messages are -- * stored. -+ * The printk log buffer consists of a sequenced collection of records, each -+ * containing variable length message text. Every record also contains its -+ * own meta-data (@info). - * -- * If the heads indicate available messages, the length in the header -- * tells the start next message. A length == 0 for the next message -- * indicates a wrap-around to the beginning of the buffer. -+ * Every record meta-data carries the timestamp in microseconds, as well as -+ * the standard userspace syslog level and syslog facility. The usual kernel -+ * messages use LOG_KERN; userspace-injected messages always carry a matching -+ * syslog facility, by default LOG_USER. The origin of every message can be -+ * reliably determined that way. - * -- * Every record carries the monotonic timestamp in microseconds, as well as -- * the standard userspace syslog level and syslog facility. The usual -- * kernel messages use LOG_KERN; userspace-injected messages always carry -- * a matching syslog facility, by default LOG_USER. The origin of every -- * message can be reliably determined that way. -+ * The human readable log message of a record is available in @text, the -+ * length of the message text in @text_len. The stored message is not -+ * terminated. - * -- * The human readable log message directly follows the message header. The -- * length of the message text is stored in the header, the stored message -- * is not terminated. -- * -- * Optionally, a message can carry a dictionary of properties (key/value pairs), -- * to provide userspace with a machine-readable message context. -+ * Optionally, a record can carry a dictionary of properties (key/value -+ * pairs), to provide userspace with a machine-readable message context. 
- * - * Examples for well-defined, commonly used property names are: - * DEVICE=b12:8 device identifier -@@ -327,25 +304,22 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; - * +sound:card0 subsystem:devname - * SUBSYSTEM=pci driver-core subsystem name - * -- * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value -- * follows directly after a '=' character. Every property is terminated by -- * a '\0' character. The last property is not terminated. -- * -- * Example of a message structure: -- * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec -- * 0008 34 00 record is 52 bytes long -- * 000a 0b 00 text is 11 bytes long -- * 000c 1f 00 dictionary is 23 bytes long -- * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) -- * 0010 69 74 27 73 20 61 20 6c "it's a l" -- * 69 6e 65 "ine" -- * 001b 44 45 56 49 43 "DEVIC" -- * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" -- * 52 49 56 45 52 3d 62 75 "RIVER=bu" -- * 67 "g" -- * 0032 00 00 00 padding to next message header -- * -- * The 'struct printk_log' buffer header must never be directly exported to -+ * Valid characters in property names are [a-zA-Z0-9.-_]. Property names -+ * and values are terminated by a '\0' character. -+ * -+ * Example of record values: -+ * record.text_buf = "it's a line" (unterminated) -+ * record.info.seq = 56 -+ * record.info.ts_nsec = 36863 -+ * record.info.text_len = 11 -+ * record.info.facility = 0 (LOG_KERN) -+ * record.info.flags = 0 -+ * record.info.level = 3 (LOG_ERR) -+ * record.info.caller_id = 299 (task 299) -+ * record.info.dev_info.subsystem = "pci" (terminated) -+ * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) -+ * -+ * The 'struct printk_info' buffer must never be directly exported to - * userspace, it is a kernel-private implementation detail that might - * need to be changed in the future, when the requirements change. - * -@@ -365,82 +339,23 @@ enum log_flags { +@@ -355,61 +336,43 @@ enum log_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; --struct printk_log { -- u64 ts_nsec; /* timestamp in nanoseconds */ -- u16 len; /* length of entire record */ -- u16 text_len; /* length of text buffer */ -- u16 dict_len; /* length of dictionary buffer */ -- u8 facility; /* syslog facility */ -- u8 flags:5; /* internal record flags */ -- u8 level:3; /* syslog level */ --#ifdef CONFIG_PRINTK_CALLER -- u32 caller_id; /* thread id or processor id */ --#endif --} --#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS --__packed __aligned(4) --#endif --; -- -/* - * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken - * within the scheduler's rq lock. It must be released before calling - * console_unlock() or anything else that might wake up a process. - */ -DEFINE_RAW_SPINLOCK(logbuf_lock); -- ++#ifdef CONFIG_PRINTK ++/* syslog_lock protects syslog_* variables and write access to clear_seq. */ ++static DEFINE_SPINLOCK(syslog_lock); + -/* - * Helper macros to lock/unlock logbuf_lock and switch between - * printk-safe/unsafe modes. @@ -12335,75 +11649,50 @@ index 9b75f6bfc333..78a277ea5c35 100644 - raw_spin_unlock(&logbuf_lock); \ - printk_safe_exit_irqrestore(flags); \ - } while (0) -+/* The syslog_lock protects syslog_* variables. 
*/ -+static DEFINE_SPINLOCK(syslog_lock); -+#define syslog_lock_irq() spin_lock_irq(&syslog_lock) -+#define syslog_unlock_irq() spin_unlock_irq(&syslog_lock) -+#define syslog_lock_irqsave(flags) spin_lock_irqsave(&syslog_lock, flags) -+#define syslog_unlock_irqrestore(flags) spin_unlock_irqrestore(&syslog_lock, flags) ++/* Set to enable sync mode. Once set, it is never cleared. */ ++static bool sync_mode; - #ifdef CONFIG_PRINTK +-#ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); +/* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; --static u32 syslog_idx; static size_t syslog_partial; static bool syslog_time; --/* index and sequence number of the first record stored in the buffer */ --static u64 log_first_seq; --static u32 log_first_idx; -- --/* index and sequence number of the next record to store in the buffer */ --static u64 log_next_seq; --static u32 log_next_idx; -- -/* the next printk record to write to the console */ -static u64 console_seq; --static u32 console_idx; -static u64 exclusive_console_stop_seq; -- - /* the next printk record to read after the last 'clear' command */ +-static unsigned long console_dropped; ++struct latched_seq { ++ seqcount_latch_t latch; ++ u64 val[2]; ++}; + +-/* the next printk record to read after the last 'clear' command */ -static u64 clear_seq; --static u32 clear_idx; -+static atomic64_t clear_seq = ATOMIC64_INIT(0); ++/* ++ * The next printk record to read after the last 'clear' command. There are ++ * two copies (updated with seqcount_latch) so that reads can locklessly ++ * access a valid value. Writers are synchronized by @syslog_lock. ++ */ ++static struct latched_seq clear_seq = { ++ .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), ++ .val[0] = 0, ++ .val[1] = 0, ++}; #ifdef CONFIG_PRINTK_CALLER #define PREFIX_MAX 48 -@@ -453,13 +368,30 @@ static u32 clear_idx; - #define LOG_FACILITY(v) ((v) >> 3 & 0xff) - - /* record buffer */ --#define LOG_ALIGN __alignof__(struct printk_log) -+#define LOG_ALIGN __alignof__(unsigned long) - #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) - #define LOG_BUF_LEN_MAX (u32)(1 << 31) - static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); - static char *log_buf = __log_buf; - static u32 log_buf_len = __LOG_BUF_LEN; - -+/* -+ * Define the average message size. This only affects the number of -+ * descriptors that will be available. Underestimating is better than -+ * overestimating (too many available descriptors is better than not enough). -+ */ -+#define PRB_AVGBITS 5 /* 32 character average length */ -+ -+#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS -+#error CONFIG_LOG_BUF_SHIFT value too small. -+#endif -+_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, -+ PRB_AVGBITS, &__log_buf[0]); -+ -+static struct printk_ringbuffer printk_rb_dynamic; -+ -+static struct printk_ringbuffer *prb = &printk_rb_static; + #else + #define PREFIX_MAX 32 + #endif + - /* - * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before - * per_cpu_areas are initialised. 
This variable is set to true when -@@ -467,7 +399,7 @@ static u32 log_buf_len = __LOG_BUF_LEN; ++/* the maximum size allowed to be reserved for a record */ + #define LOG_LINE_MAX (1024 - PREFIX_MAX) + + #define LOG_LEVEL(v) ((v) & 0x07) +@@ -447,11 +410,36 @@ static struct printk_ringbuffer *prb = &printk_rb_static; */ static bool __printk_percpu_data_ready __read_mostly; @@ -12412,350 +11701,51 @@ index 9b75f6bfc333..78a277ea5c35 100644 { return __printk_percpu_data_ready; } -@@ -484,108 +416,6 @@ u32 log_buf_len_get(void) - return log_buf_len; - } --/* human readable text of the record */ --static char *log_text(const struct printk_log *msg) --{ -- return (char *)msg + sizeof(struct printk_log); --} -- --/* optional key/value pair dictionary attached to the record */ --static char *log_dict(const struct printk_log *msg) --{ -- return (char *)msg + sizeof(struct printk_log) + msg->text_len; --} -- --/* get record by index; idx must point to valid msg */ --static struct printk_log *log_from_idx(u32 idx) --{ -- struct printk_log *msg = (struct printk_log *)(log_buf + idx); -- -- /* -- * A length == 0 record is the end of buffer marker. Wrap around and -- * read the message at the start of the buffer. -- */ -- if (!msg->len) -- return (struct printk_log *)log_buf; -- return msg; --} -- --/* get next record; idx must point to valid msg */ --static u32 log_next(u32 idx) --{ -- struct printk_log *msg = (struct printk_log *)(log_buf + idx); -- -- /* length == 0 indicates the end of the buffer; wrap */ -- /* -- * A length == 0 record is the end of buffer marker. Wrap around and -- * read the message at the start of the buffer as *this* one, and -- * return the one after that. -- */ -- if (!msg->len) { -- msg = (struct printk_log *)log_buf; -- return msg->len; -- } -- return idx + msg->len; --} -- --/* -- * Check whether there is enough free space for the given message. -- * -- * The same values of first_idx and next_idx mean that the buffer -- * is either empty or full. -- * -- * If the buffer is empty, we must respect the position of the indexes. -- * They cannot be reset to the beginning of the buffer. -- */ --static int logbuf_has_space(u32 msg_size, bool empty) --{ -- u32 free; -- -- if (log_next_idx > log_first_idx || empty) -- free = max(log_buf_len - log_next_idx, log_first_idx); -- else -- free = log_first_idx - log_next_idx; -- -- /* -- * We need space also for an empty header that signalizes wrapping -- * of the buffer. -- */ -- return free >= msg_size + sizeof(struct printk_log); --} -- --static int log_make_free_space(u32 msg_size) --{ -- while (log_first_seq < log_next_seq && -- !logbuf_has_space(msg_size, false)) { -- /* drop old messages until we have enough contiguous space */ -- log_first_idx = log_next(log_first_idx); -- log_first_seq++; -- } -- -- if (clear_seq < log_first_seq) { -- clear_seq = log_first_seq; -- clear_idx = log_first_idx; -- } -- -- /* sequence numbers are equal, so the log buffer is empty */ -- if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) -- return 0; -- -- return -ENOMEM; --} -- --/* compute the message size including the padding bytes */ --static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) --{ -- u32 size; -- -- size = sizeof(struct printk_log) + text_len + dict_len; -- *pad_len = (-size) & (LOG_ALIGN - 1); -- size += *pad_len; -- -- return size; --} -- - /* - * Define how much of the log buffer we could take at maximum. The value - * must be greater than two. 
Note that only half of the buffer is available -@@ -594,84 +424,23 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) - #define MAX_LOG_TAKE_PART 4 - static const char trunc_msg[] = "<truncated>"; - --static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, -- u16 *dict_len, u32 *pad_len) -+static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) - { - /* - * The message should not take the whole buffer. Otherwise, it might - * get removed too soon. - */ - u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; -+ - if (*text_len > max_text_len) - *text_len = max_text_len; -- /* enable the warning message */ -- *trunc_msg_len = strlen(trunc_msg); -- /* disable the "dict" completely */ -- *dict_len = 0; -- /* compute the size again, count also the warning message */ -- return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); --} -- --/* insert record into the buffer, discard old ones, update heads */ --static int log_store(u32 caller_id, int facility, int level, -- enum log_flags flags, u64 ts_nsec, -- const char *dict, u16 dict_len, -- const char *text, u16 text_len) --{ -- struct printk_log *msg; -- u32 size, pad_len; -- u16 trunc_msg_len = 0; -- -- /* number of '\0' padding bytes to next message */ -- size = msg_used_size(text_len, dict_len, &pad_len); -- -- if (log_make_free_space(size)) { -- /* truncate the message if it is too long for empty buffer */ -- size = truncate_msg(&text_len, &trunc_msg_len, -- &dict_len, &pad_len); -- /* survive when the log buffer is too small for trunc_msg */ -- if (log_make_free_space(size)) -- return 0; -- } -- -- if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { -- /* -- * This message + an additional empty header does not fit -- * at the end of the buffer. Add an empty header with len == 0 -- * to signify a wrap around. -- */ -- memset(log_buf + log_next_idx, 0, sizeof(struct printk_log)); -- log_next_idx = 0; -- } - -- /* fill message */ -- msg = (struct printk_log *)(log_buf + log_next_idx); -- memcpy(log_text(msg), text, text_len); -- msg->text_len = text_len; -- if (trunc_msg_len) { -- memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); -- msg->text_len += trunc_msg_len; -- } -- memcpy(log_dict(msg), dict, dict_len); -- msg->dict_len = dict_len; -- msg->facility = facility; -- msg->level = level & 7; -- msg->flags = flags & 0x1f; -- if (ts_nsec > 0) -- msg->ts_nsec = ts_nsec; -+ /* enable the warning message (if there is room) */ -+ *trunc_msg_len = strlen(trunc_msg); -+ if (*text_len >= *trunc_msg_len) -+ *text_len -= *trunc_msg_len; - else -- msg->ts_nsec = local_clock(); --#ifdef CONFIG_PRINTK_CALLER -- msg->caller_id = caller_id; --#endif -- memset(log_dict(msg) + dict_len, 0, pad_len); -- msg->len = size; -- -- /* insert message */ -- log_next_idx += msg->len; -- log_next_seq++; -- -- return msg->text_len; -+ *trunc_msg_len = 0; - } - - int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); -@@ -723,13 +492,13 @@ static void append_char(char **pp, char *e, char c) - *(*pp)++ = c; - } - --static ssize_t msg_print_ext_header(char *buf, size_t size, -- struct printk_log *msg, u64 seq) -+static ssize_t info_print_ext_header(char *buf, size_t size, -+ struct printk_info *info) - { -- u64 ts_usec = msg->ts_nsec; -+ u64 ts_usec = info->ts_nsec; - char caller[20]; - #ifdef CONFIG_PRINTK_CALLER -- u32 id = msg->caller_id; -+ u32 id = info->caller_id; - - snprintf(caller, sizeof(caller), ",caller=%c%u", - id & 0x80000000 ? 
'C' : 'T', id & ~0x80000000); -@@ -740,13 +509,13 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, - do_div(ts_usec, 1000); - - return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", -- (msg->facility << 3) | msg->level, seq, ts_usec, -- msg->flags & LOG_CONT ? 'c' : '-', caller); -+ (info->facility << 3) | info->level, info->seq, -+ ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); - } - --static ssize_t msg_print_ext_body(char *buf, size_t size, -- char *dict, size_t dict_len, -- char *text, size_t text_len) -+static ssize_t msg_add_ext_text(char *buf, size_t size, -+ const char *text, size_t text_len, -+ unsigned char endc) - { - char *p = buf, *e = buf + size; - size_t i; -@@ -760,45 +529,56 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, - else - append_char(&p, e, c); - } -- append_char(&p, e, '\n'); -+ append_char(&p, e, endc); - -- if (dict_len) { -- bool line = true; -+ return p - buf; -+} - -- for (i = 0; i < dict_len; i++) { -- unsigned char c = dict[i]; -+static ssize_t msg_add_dict_text(char *buf, size_t size, -+ const char *key, const char *val) ++/* Must be called under syslog_lock. */ ++static void latched_seq_write(struct latched_seq *ls, u64 val) +{ -+ size_t val_len = strlen(val); -+ ssize_t len; - -- if (line) { -- append_char(&p, e, ' '); -- line = false; -- } -+ if (!val_len) -+ return 0; - -- if (c == '\0') { -- append_char(&p, e, '\n'); -- line = true; -- continue; -- } -+ len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ -+ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); -+ len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); - -- if (c < ' ' || c >= 127 || c == '\\') { -- p += scnprintf(p, e - p, "\\x%02x", c); -- continue; -- } -+ return len; ++ raw_write_seqcount_latch(&ls->latch); ++ ls->val[0] = val; ++ raw_write_seqcount_latch(&ls->latch); ++ ls->val[1] = val; +} - -- append_char(&p, e, c); -- } -- append_char(&p, e, '\n'); -- } -+static ssize_t msg_print_ext_body(char *buf, size_t size, -+ char *text, size_t text_len, -+ struct dev_printk_info *dev_info) ++ ++/* Can be called from any context. */ ++static u64 latched_seq_read_nolock(struct latched_seq *ls) +{ -+ ssize_t len; - -- return p - buf; -+ len = msg_add_ext_text(buf, size, text, text_len, '\n'); ++ unsigned int seq; ++ unsigned int idx; ++ u64 val; + -+ if (!dev_info) -+ goto out; ++ do { ++ seq = raw_read_seqcount_latch(&ls->latch); ++ idx = seq & 0x1; ++ val = ls->val[idx]; ++ } while (read_seqcount_latch_retry(&ls->latch, seq)); + -+ len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", -+ dev_info->subsystem); -+ len += msg_add_dict_text(buf + len, size - len, "DEVICE", -+ dev_info->device); -+out: -+ return len; - } ++ return val; ++} ++ + /* Return log buffer address */ + char *log_buf_addr_get(void) + { +@@ -619,7 +607,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { - u64 seq; -- u32 idx; +- u64 seq; ++ atomic64_t seq; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; -+ -+ struct printk_info info; -+ char text_buf[CONSOLE_EXT_LOG_MAX]; -+ struct printk_record record; - }; - - static __printf(3, 4) __cold -@@ -808,7 +588,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...) 
- int r; - - va_start(args, fmt); -- r = vprintk_emit(facility, level, NULL, 0, fmt, args); -+ r = vprintk_emit(facility, level, NULL, fmt, args); - va_end(args); - - return r; -@@ -881,7 +661,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) - { - struct devkmsg_user *user = file->private_data; -- struct printk_log *msg; -+ struct printk_record *r = &user->record; - size_t len; - ssize_t ret; - -@@ -892,41 +672,31 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, +@@ -719,27 +707,22 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; - logbuf_lock_irq(); -- while (user->seq == log_next_seq) { -+ if (!prb_read_valid(prb, user->seq, r)) { +- if (!prb_read_valid(prb, user->seq, r)) { ++ if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - logbuf_unlock_irq(); @@ -12764,42 +11754,34 @@ index 9b75f6bfc333..78a277ea5c35 100644 - logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, -- user->seq != log_next_seq); -+ prb_read_valid(prb, user->seq, r)); +- prb_read_valid(prb, user->seq, r)); ++ prb_read_valid(prb, atomic64_read(&user->seq), r)); if (ret) goto out; - logbuf_lock_irq(); } -- if (user->seq < log_first_seq) { -+ if (user->seq < prb_first_valid_seq(prb)) { +- if (user->seq < prb_first_valid_seq(prb)) { ++ if (r->info->seq != atomic64_read(&user->seq)) { /* our last seen message is gone, return error and reset */ -- user->idx = log_first_idx; -- user->seq = log_first_seq; -+ user->seq = prb_first_valid_seq(prb); +- user->seq = prb_first_valid_seq(prb); ++ atomic64_set(&user->seq, r->info->seq); ret = -EPIPE; - logbuf_unlock_irq(); goto out; } -- msg = log_from_idx(user->idx); -- len = msg_print_ext_header(user->buf, sizeof(user->buf), -- msg, user->seq); -+ len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); - len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, -- log_dict(msg), msg->dict_len, -- log_text(msg), msg->text_len); -+ &r->text_buf[0], r->info->text_len, -+ &r->info->dev_info); - -- user->idx = log_next(user->idx); -- user->seq++; +@@ -748,8 +731,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, + &r->text_buf[0], r->info->text_len, + &r->info->dev_info); + +- user->seq = r->info->seq + 1; - logbuf_unlock_irq(); -+ user->seq = r->info->seq + 1; ++ atomic64_set(&user->seq, r->info->seq + 1); if (len > count) { ret = -EINVAL; -@@ -961,12 +731,10 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) +@@ -784,11 +766,10 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; @@ -12807,25 +11789,22 @@ index 9b75f6bfc333..78a277ea5c35 100644 switch (whence) { case SEEK_SET: /* the first record */ -- user->idx = log_first_idx; -- user->seq = log_first_seq; -+ user->seq = prb_first_valid_seq(prb); +- user->seq = prb_first_valid_seq(prb); ++ atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* -@@ -974,18 +742,15 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) +@@ -796,22 +777,22 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. 
*/ -- user->idx = clear_idx; - user->seq = clear_seq; -+ user->seq = atomic64_read(&clear_seq); ++ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ -- user->idx = log_next_idx; -- user->seq = log_next_seq; -+ user->seq = prb_next_seq(prb); +- user->seq = prb_next_seq(prb); ++ atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; @@ -12834,16 +11813,23 @@ index 9b75f6bfc333..78a277ea5c35 100644 return ret; } -@@ -999,15 +764,13 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) + static __poll_t devkmsg_poll(struct file *file, poll_table *wait) + { + struct devkmsg_user *user = file->private_data; ++ struct printk_info info; + __poll_t ret = 0; + + if (!user) +@@ -819,15 +800,13 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - logbuf_lock_irq(); -- if (user->seq < log_next_seq) { -+ if (prb_read_valid(prb, user->seq, NULL)) { +- if (prb_read_valid(prb, user->seq, NULL)) { ++ if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ -- if (user->seq < log_first_seq) -+ if (user->seq < prb_first_valid_seq(prb)) +- if (user->seq < prb_first_valid_seq(prb)) ++ if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; @@ -12852,98 +11838,28 @@ index 9b75f6bfc333..78a277ea5c35 100644 return ret; } -@@ -1037,10 +800,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) - - mutex_init(&user->lock); +@@ -860,9 +839,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) + prb_rec_init_rd(&user->record, &user->info, + &user->text_buf[0], sizeof(user->text_buf)); - logbuf_lock_irq(); -- user->idx = log_first_idx; -- user->seq = log_first_seq; +- user->seq = prb_first_valid_seq(prb); - logbuf_unlock_irq(); -+ prb_rec_init_rd(&user->record, &user->info, -+ &user->text_buf[0], sizeof(user->text_buf)); -+ -+ user->seq = prb_first_valid_seq(prb); ++ atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; return 0; -@@ -1080,23 +843,61 @@ const struct file_operations kmsg_fops = { - */ - void log_buf_vmcoreinfo_setup(void) - { -- VMCOREINFO_SYMBOL(log_buf); -- VMCOREINFO_SYMBOL(log_buf_len); -- VMCOREINFO_SYMBOL(log_first_idx); -- VMCOREINFO_SYMBOL(clear_idx); -- VMCOREINFO_SYMBOL(log_next_idx); -+ struct dev_printk_info *dev_info = NULL; -+ -+ VMCOREINFO_SYMBOL(prb); -+ VMCOREINFO_SYMBOL(printk_rb_static); -+ VMCOREINFO_SYMBOL(clear_seq); -+ - /* -- * Export struct printk_log size and field offsets. User space tools can -+ * Export struct size and field offsets. User space tools can - * parse it and detect any changes to structure down the line. 
- */ -- VMCOREINFO_STRUCT_SIZE(printk_log); -- VMCOREINFO_OFFSET(printk_log, ts_nsec); -- VMCOREINFO_OFFSET(printk_log, len); -- VMCOREINFO_OFFSET(printk_log, text_len); -- VMCOREINFO_OFFSET(printk_log, dict_len); --#ifdef CONFIG_PRINTK_CALLER -- VMCOREINFO_OFFSET(printk_log, caller_id); --#endif +@@ -954,6 +931,9 @@ void log_buf_vmcoreinfo_setup(void) + + VMCOREINFO_SIZE(atomic_long_t); + VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); + -+ VMCOREINFO_SIZE(atomic64_t); -+ VMCOREINFO_TYPE_OFFSET(atomic64_t, counter); -+ -+ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); -+ VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); -+ VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); -+ VMCOREINFO_OFFSET(printk_ringbuffer, fail); -+ -+ VMCOREINFO_STRUCT_SIZE(prb_desc_ring); -+ VMCOREINFO_OFFSET(prb_desc_ring, count_bits); -+ VMCOREINFO_OFFSET(prb_desc_ring, descs); -+ VMCOREINFO_OFFSET(prb_desc_ring, infos); -+ VMCOREINFO_OFFSET(prb_desc_ring, head_id); -+ VMCOREINFO_OFFSET(prb_desc_ring, tail_id); -+ -+ VMCOREINFO_STRUCT_SIZE(prb_desc); -+ VMCOREINFO_OFFSET(prb_desc, state_var); -+ VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); -+ -+ VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); -+ VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); -+ VMCOREINFO_OFFSET(prb_data_blk_lpos, next); -+ -+ VMCOREINFO_STRUCT_SIZE(printk_info); -+ VMCOREINFO_OFFSET(printk_info, seq); -+ VMCOREINFO_OFFSET(printk_info, ts_nsec); -+ VMCOREINFO_OFFSET(printk_info, text_len); -+ VMCOREINFO_OFFSET(printk_info, caller_id); -+ VMCOREINFO_OFFSET(printk_info, dev_info); -+ -+ VMCOREINFO_STRUCT_SIZE(dev_printk_info); -+ VMCOREINFO_OFFSET(dev_printk_info, subsystem); -+ VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); -+ VMCOREINFO_OFFSET(dev_printk_info, device); -+ VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); -+ -+ VMCOREINFO_STRUCT_SIZE(prb_data_ring); -+ VMCOREINFO_OFFSET(prb_data_ring, size_bits); -+ VMCOREINFO_OFFSET(prb_data_ring, data); -+ VMCOREINFO_OFFSET(prb_data_ring, head_lpos); -+ VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); -+ -+ VMCOREINFO_SIZE(atomic_long_t); -+ VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); ++ VMCOREINFO_STRUCT_SIZE(latched_seq); ++ VMCOREINFO_OFFSET(latched_seq, val); } #endif -@@ -1168,17 +969,48 @@ static inline void log_buf_add_cpu(void) {} +@@ -1025,9 +1005,6 @@ static inline void log_buf_add_cpu(void) {} static void __init set_percpu_data_ready(void) { @@ -12953,448 +11869,158 @@ index 9b75f6bfc333..78a277ea5c35 100644 __printk_percpu_data_ready = true; } -+static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, -+ struct printk_record *r) -+{ -+ struct prb_reserved_entry e; -+ struct printk_record dest_r; -+ -+ prb_rec_init_wr(&dest_r, r->info->text_len); -+ -+ if (!prb_reserve(&e, rb, &dest_r)) -+ return 0; -+ -+ memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); -+ dest_r.info->text_len = r->info->text_len; -+ dest_r.info->facility = r->info->facility; -+ dest_r.info->level = r->info->level; -+ dest_r.info->flags = r->info->flags; -+ dest_r.info->ts_nsec = r->info->ts_nsec; -+ dest_r.info->caller_id = r->info->caller_id; -+ memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); -+ -+ prb_final_commit(&e); -+ -+ return prb_record_text_space(&e); -+} -+ -+static char setup_text_buf[LOG_LINE_MAX] __initdata; -+ - void __init setup_log_buf(int early) - { +@@ -1067,7 +1044,6 @@ void __init setup_log_buf(int early) + struct printk_record r; + size_t new_descs_size; + size_t new_infos_size; - unsigned long flags; -+ 
struct printk_info *new_infos; -+ unsigned int new_descs_count; -+ struct prb_desc *new_descs; -+ struct printk_info info; -+ struct printk_record r; -+ size_t new_descs_size; -+ size_t new_infos_size; char *new_log_buf; unsigned int free; -+ u64 seq; - - /* - * Some archs call setup_log_buf() multiple times - first is very -@@ -1197,24 +1029,71 @@ void __init setup_log_buf(int early) - if (!new_log_buf_len) - return; - -+ new_descs_count = new_log_buf_len >> PRB_AVGBITS; -+ if (new_descs_count == 0) { -+ pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); -+ return; -+ } -+ - new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); - if (unlikely(!new_log_buf)) { -- pr_err("log_buf_len: %lu bytes not available\n", -- new_log_buf_len); -+ pr_err("log_buf_len: %lu text bytes not available\n", -+ new_log_buf_len); - return; - } + u64 seq; +@@ -1125,8 +1101,6 @@ void __init setup_log_buf(int early) + new_descs, ilog2(new_descs_count), + new_infos); -- logbuf_lock_irqsave(flags); -+ new_descs_size = new_descs_count * sizeof(struct prb_desc); -+ new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); -+ if (unlikely(!new_descs)) { -+ pr_err("log_buf_len: %zu desc bytes not available\n", -+ new_descs_size); -+ goto err_free_log_buf; -+ } -+ -+ new_infos_size = new_descs_count * sizeof(struct printk_info); -+ new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); -+ if (unlikely(!new_infos)) { -+ pr_err("log_buf_len: %zu info bytes not available\n", -+ new_infos_size); -+ goto err_free_descs; -+ } -+ -+ prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); -+ -+ prb_init(&printk_rb_dynamic, -+ new_log_buf, ilog2(new_log_buf_len), -+ new_descs, ilog2(new_descs_count), -+ new_infos); -+ +- printk_safe_enter_irqsave(flags); +- log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; -- free = __LOG_BUF_LEN - log_next_idx; -- memcpy(log_buf, __log_buf, __LOG_BUF_LEN); -- logbuf_unlock_irqrestore(flags); -+ -+ free = __LOG_BUF_LEN; -+ prb_for_each_record(0, &printk_rb_static, seq, &r) -+ free -= add_to_rb(&printk_rb_dynamic, &r); -+ -+ /* -+ * This is early enough that everything is still running on the -+ * boot CPU and interrupts are disabled. So no new messages will -+ * appear during the transition to the dynamic buffer. 
-+ */ -+ prb = &printk_rb_dynamic; -+ -+ if (seq != prb_next_seq(&printk_rb_static)) { -+ pr_err("dropped %llu messages\n", -+ prb_next_seq(&printk_rb_static) - seq); -+ } - - pr_info("log_buf_len: %u bytes\n", log_buf_len); - pr_info("early log buf free: %u(%u%%)\n", - free, (free * 100) / __LOG_BUF_LEN); -+ return; -+ -+err_free_descs: -+ memblock_free(__pa(new_descs), new_descs_size); -+err_free_log_buf: -+ memblock_free(__pa(new_log_buf), new_log_buf_len); - } - - static bool __read_mostly ignore_loglevel; -@@ -1321,18 +1200,18 @@ static size_t print_caller(u32 id, char *buf) - #define print_caller(id, buf) 0 - #endif - --static size_t print_prefix(const struct printk_log *msg, bool syslog, -- bool time, char *buf) -+static size_t info_print_prefix(const struct printk_info *info, bool syslog, -+ bool time, char *buf) - { - size_t len = 0; - - if (syslog) -- len = print_syslog((msg->facility << 3) | msg->level, buf); -+ len = print_syslog((info->facility << 3) | info->level, buf); - - if (time) -- len += print_time(msg->ts_nsec, buf + len); -+ len += print_time(info->ts_nsec, buf + len); - -- len += print_caller(msg->caller_id, buf + len); -+ len += print_caller(info->caller_id, buf + len); +@@ -1142,8 +1116,6 @@ void __init setup_log_buf(int early) + */ + prb = &printk_rb_dynamic; - if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { - buf[len++] = ' '; -@@ -1342,72 +1221,150 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, - return len; +- printk_safe_exit_irqrestore(flags); +- + if (seq != prb_next_seq(&printk_rb_static)) { + pr_err("dropped %llu messages\n", + prb_next_seq(&printk_rb_static) - seq); +@@ -1420,6 +1392,50 @@ static size_t get_record_print_text_size(struct printk_info *info, + return ((prefix_len * line_count) + info->text_len + 1); } --static size_t msg_print_text(const struct printk_log *msg, bool syslog, -- bool time, char *buf, size_t size) +/* -+ * Prepare the record for printing. The text is shifted within the given -+ * buffer to avoid a need for another one. The following operations are -+ * done: -+ * -+ * - Add prefix for each line. -+ * - Add the trailing newline that has been removed in vprintk_store(). -+ * - Drop truncated lines that do not longer fit into the buffer. ++ * Beginning with @start_seq, find the first record where it and all following ++ * records up to (but not including) @max_seq fit into @size. + * -+ * Return: The length of the updated/prepared text, including the added -+ * prefixes and the newline. The dropped line(s) are not counted. ++ * @max_seq is simply an upper bound and does not need to exist. If the caller ++ * does not require an upper bound, -1 can be used for @max_seq. + */ -+static size_t record_print_text(struct printk_record *r, bool syslog, -+ bool time) - { -- const char *text = log_text(msg); -- size_t text_size = msg->text_len; -- size_t len = 0; -+ size_t text_len = r->info->text_len; -+ size_t buf_size = r->text_buf_size; -+ char *text = r->text_buf; - char prefix[PREFIX_MAX]; -- const size_t prefix_len = print_prefix(msg, syslog, time, prefix); -+ bool truncated = false; -+ size_t prefix_len; -+ size_t line_len; ++static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, ++ bool syslog, bool time) ++{ ++ struct printk_info info; ++ unsigned int line_count; + size_t len = 0; -+ char *next; - -- do { -- const char *next = memchr(text, '\n', text_size); -- size_t text_len; ++ u64 seq; ++ ++ /* Determine the size of the records up to @max_seq. 
*/ ++ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { ++ if (info.seq >= max_seq) ++ break; ++ len += get_record_print_text_size(&info, line_count, syslog, time); ++ } ++ + /* -+ * If the message was truncated because the buffer was not large -+ * enough, treat the available text as if it were the full text. ++ * Adjust the upper bound for the next loop to avoid subtracting ++ * lengths that were never added. + */ -+ if (text_len > buf_size) -+ text_len = buf_size; - -+ prefix_len = info_print_prefix(r->info, syslog, time, prefix); ++ if (seq < max_seq) ++ max_seq = seq; + + /* -+ * @text_len: bytes of unprocessed text -+ * @line_len: bytes of current line _without_ newline -+ * @text: pointer to beginning of current line -+ * @len: number of bytes prepared in r->text_buf ++ * Move first record forward until length fits into the buffer. Ignore ++ * newest messages that were not counted in the above cycle. Messages ++ * might appear and get lost in the meantime. This is a best effort ++ * that prevents an infinite loop that could occur with a retry. + */ -+ for (;;) { -+ next = memchr(text, '\n', text_len); - if (next) { -- text_len = next - text; -- next++; -- text_size -= next - text; -+ line_len = next - text; - } else { -- text_len = text_size; -+ /* Drop truncated line(s). */ -+ if (truncated) -+ break; -+ line_len = text_len; - } - -- if (buf) { -- if (prefix_len + text_len + 1 >= size - len) -+ /* -+ * Truncate the text if there is not enough space to add the -+ * prefix and a trailing newline. -+ */ -+ if (len + prefix_len + text_len + 1 > buf_size) { -+ /* Drop even the current line if no space. */ -+ if (len + prefix_len + line_len + 1 > buf_size) - break; - -- memcpy(buf + len, prefix, prefix_len); -- len += prefix_len; -- memcpy(buf + len, text, text_len); -- len += text_len; -- buf[len++] = '\n'; -- } else { -- /* SYSLOG_ACTION_* buffer size only calculation */ -- len += prefix_len + text_len + 1; -+ text_len = buf_size - len - prefix_len - 1; -+ truncated = true; - } - -- text = next; -- } while (text); -+ memmove(text + prefix_len, text, text_len); -+ memcpy(text, prefix, prefix_len); -+ -+ len += prefix_len + line_len + 1; -+ -+ if (text_len == line_len) { -+ /* -+ * Add the trailing newline removed in -+ * vprintk_store(). -+ */ -+ text[prefix_len + line_len] = '\n'; ++ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { ++ if (len <= size || info.seq >= max_seq) + break; -+ } -+ -+ /* -+ * Advance beyond the added prefix and the related line with -+ * its newline. -+ */ -+ text += prefix_len + line_len + 1; -+ -+ /* -+ * The remaining text has only decreased by the line with its -+ * newline. -+ * -+ * Note that @text_len can become zero. It happens when @text -+ * ended with a newline (either due to truncation or the -+ * original string ending with "\n\n"). The loop is correctly -+ * repeated and (if not truncated) an empty line with a prefix -+ * will be prepared. -+ */ -+ text_len -= line_len + 1; ++ len -= get_record_print_text_size(&info, line_count, syslog, time); + } - - return len; - } - -+static size_t get_record_print_text_size(struct printk_info *info, -+ unsigned int line_count, -+ bool syslog, bool time) -+{ -+ char prefix[PREFIX_MAX]; -+ size_t prefix_len; + -+ prefix_len = info_print_prefix(info, syslog, time, prefix); -+ -+ /* -+ * Each line will be preceded with a prefix. The intermediate -+ * newlines are already within the text, but a final trailing -+ * newline will be added. 
-+ */ -+ return ((prefix_len * line_count) + info->text_len + 1); ++ return seq; +} + static int syslog_print(char __user *buf, int size) { -+ struct printk_info info; -+ struct printk_record r; + struct printk_info info; +@@ -1427,19 +1443,19 @@ static int syslog_print(char __user *buf, int size) char *text; -- struct printk_log *msg; int len = 0; - text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); +- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); if (!text) return -ENOMEM; -+ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); -+ +- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); ++ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + while (size > 0) { size_t n; size_t skip; - logbuf_lock_irq(); -- if (syslog_seq < log_first_seq) { -- /* messages are gone, move to first one */ -- syslog_seq = log_first_seq; -- syslog_idx = log_first_idx; -- syslog_partial = 0; -- } -- if (syslog_seq == log_next_seq) { ++ spin_lock_irq(&syslog_lock); + if (!prb_read_valid(prb, syslog_seq, &r)) { - logbuf_unlock_irq(); -+ syslog_lock_irq(); -+ if (!prb_read_valid(prb, syslog_seq, &r)) { -+ syslog_unlock_irq(); ++ spin_unlock_irq(&syslog_lock); break; } -+ if (r.info->seq != syslog_seq) { -+ /* message is gone, move to next valid one */ -+ syslog_seq = r.info->seq; -+ syslog_partial = 0; -+ } - - /* - * To keep reading/counting partial line consistent, -@@ -1417,13 +1374,10 @@ static int syslog_print(char __user *buf, int size) - syslog_time = printk_time; - - skip = syslog_partial; -- msg = log_from_idx(syslog_idx); -- n = msg_print_text(msg, true, syslog_time, text, -- LOG_LINE_MAX + PREFIX_MAX); -+ n = record_print_text(&r, true, syslog_time); - if (n - syslog_partial <= size) { - /* message fits into buffer, move forward */ -- syslog_idx = log_next(syslog_idx); -- syslog_seq++; -+ syslog_seq = r.info->seq + 1; - n -= syslog_partial; - syslog_partial = 0; - } else if (!len){ -@@ -1432,7 +1386,7 @@ static int syslog_print(char __user *buf, int size) + if (r.info->seq != syslog_seq) { +@@ -1468,7 +1484,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - logbuf_unlock_irq(); -+ syslog_unlock_irq(); ++ spin_unlock_irq(&syslog_lock); if (!n) break; -@@ -1454,11 +1408,14 @@ static int syslog_print(char __user *buf, int size) - +@@ -1491,34 +1507,25 @@ static int syslog_print(char __user *buf, int size) static int syslog_print_all(char __user *buf, int size, bool clear) { -+ struct printk_info info; -+ unsigned int line_count; -+ struct printk_record r; -+ u64 newest_seq; -+ u64 clr_seq; + struct printk_info info; +- unsigned int line_count; + struct printk_record r; char *text; int len = 0; -- u64 next_seq; u64 seq; -- u32 idx; bool time; - text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); -@@ -1466,63 +1423,58 @@ static int syslog_print_all(char __user *buf, int size, bool clear) +- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + if (!text) return -ENOMEM; time = printk_time; - logbuf_lock_irq(); -+ clr_seq = atomic64_read(&clear_seq); -+ /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. 
*/ -- seq = clear_seq; -- idx = clear_idx; -- while (seq < log_next_seq) { -- struct printk_log *msg = log_from_idx(idx); - -- len += msg_print_text(msg, true, time, NULL, 0); -- idx = log_next(idx); -- seq++; -- } -+ prb_for_each_info(clr_seq, prb, seq, &info, &line_count) -+ len += get_record_print_text_size(&info, line_count, true, time); - +- prb_for_each_info(clear_seq, prb, seq, &info, &line_count) +- len += get_record_print_text_size(&info, line_count, true, time); +- - /* move first record forward until length fits into the buffer */ -- seq = clear_seq; -- idx = clear_idx; -- while (len > size && seq < log_next_seq) { -- struct printk_log *msg = log_from_idx(idx); -+ /* -+ * Keep track of the latest in case new records are coming in fast -+ * and overwriting the older records. -+ */ -+ newest_seq = seq; - -- len -= msg_print_text(msg, true, time, NULL, 0); -- idx = log_next(idx); -- seq++; -+ /* -+ * Move first record forward until length fits into the buffer. This -+ * is a best effort attempt. If @newest_seq is reached because the -+ * ringbuffer is wrapping too fast, just start filling the buffer -+ * from there. -+ */ -+ prb_for_each_info(clr_seq, prb, seq, &info, &line_count) { -+ if (len <= size || info.seq > newest_seq) -+ break; -+ len -= get_record_print_text_size(&info, line_count, true, time); - } +- prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { +- if (len <= size) +- break; +- len -= get_record_print_text_size(&info, line_count, true, time); +- } ++ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, ++ size, true, time); -- /* last message fitting into this dump */ -- next_seq = log_next_seq; -+ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); +- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); ++ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); len = 0; -- while (len >= 0 && seq < next_seq) { -- struct printk_log *msg = log_from_idx(idx); -- int textlen = msg_print_text(msg, true, time, text, -- LOG_LINE_MAX + PREFIX_MAX); -+ prb_for_each_record(seq, prb, seq, &r) { -+ int textlen; - -- idx = log_next(idx); -- seq++; -+ textlen = record_print_text(&r, true, time); -+ -+ if (len + textlen > size) { -+ seq--; -+ break; -+ } + prb_for_each_record(seq, prb, seq, &r) { +@@ -1531,20 +1538,20 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + break; + } - logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) @@ -13403,146 +12029,117 @@ index 9b75f6bfc333..78a277ea5c35 100644 len += textlen; - logbuf_lock_irq(); -- if (seq < log_first_seq) { -- /* messages are gone, move to next one */ -- seq = log_first_seq; -- idx = log_first_idx; -- } -+ if (len < 0) -+ break; + if (len < 0) + break; } -- if (clear) { -- clear_seq = log_next_seq; -- clear_idx = log_next_idx; -- } +- if (clear) +- clear_seq = seq; - logbuf_unlock_irq(); -+ if (clear) -+ atomic64_set(&clear_seq, seq); ++ if (clear) { ++ spin_lock_irq(&syslog_lock); ++ latched_seq_write(&clear_seq, seq); ++ spin_unlock_irq(&syslog_lock); ++ } kfree(text); return len; -@@ -1530,10 +1482,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) +@@ -1552,13 +1559,26 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { - logbuf_lock_irq(); -- clear_seq = log_next_seq; -- clear_idx = log_next_idx; +- clear_seq = prb_next_seq(prb); - logbuf_unlock_irq(); -+ atomic64_set(&clear_seq, prb_next_seq(prb)); ++ spin_lock_irq(&syslog_lock); ++ latched_seq_write(&clear_seq, 
prb_next_seq(prb)); ++ spin_unlock_irq(&syslog_lock); ++} ++ ++/* Return a consistent copy of @syslog_seq. */ ++static u64 read_syslog_seq_irq(void) ++{ ++ u64 seq; ++ ++ spin_lock_irq(&syslog_lock); ++ seq = syslog_seq; ++ spin_unlock_irq(&syslog_lock); ++ ++ return seq; } int do_syslog(int type, char __user *buf, int len, int source) -@@ -1541,6 +1490,7 @@ int do_syslog(int type, char __user *buf, int len, int source) + { ++ struct printk_info info; bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; -+ u64 seq; - - error = check_syslog_permissions(type, source); - if (error) -@@ -1558,8 +1508,11 @@ int do_syslog(int type, char __user *buf, int len, int source) +@@ -1579,8 +1599,9 @@ int do_syslog(int type, char __user *buf, int len, int source) return 0; if (!access_ok(buf, len)) return -EFAULT; -+ syslog_lock_irq(); -+ seq = syslog_seq; -+ syslog_unlock_irq(); ++ error = wait_event_interruptible(log_wait, -- syslog_seq != log_next_seq); -+ prb_read_valid(prb, seq, NULL)); +- prb_read_valid(prb, syslog_seq, NULL)); ++ prb_read_valid(prb, read_syslog_seq_irq(), NULL)); if (error) return error; error = syslog_print(buf, len); -@@ -1567,7 +1520,7 @@ int do_syslog(int type, char __user *buf, int len, int source) - /* Read/clear last kernel messages */ - case SYSLOG_ACTION_READ_CLEAR: - clear = true; -- /* FALL THRU */ -+ fallthrough; - /* Read last kernel messages */ - case SYSLOG_ACTION_READ_ALL: - if (!buf || len < 0) -@@ -1607,11 +1560,10 @@ int do_syslog(int type, char __user *buf, int len, int source) +@@ -1628,10 +1649,15 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - logbuf_lock_irq(); -- if (syslog_seq < log_first_seq) { -+ syslog_lock_irq(); -+ if (syslog_seq < prb_first_valid_seq(prb)) { +- if (syslog_seq < prb_first_valid_seq(prb)) { ++ spin_lock_irq(&syslog_lock); ++ if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { ++ /* No unread messages. */ ++ spin_unlock_irq(&syslog_lock); ++ return 0; ++ } ++ if (info.seq != syslog_seq) { /* messages are gone, move to first one */ -- syslog_seq = log_first_seq; -- syslog_idx = log_first_idx; -+ syslog_seq = prb_first_valid_seq(prb); +- syslog_seq = prb_first_valid_seq(prb); ++ syslog_seq = info.seq; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { -@@ -1620,24 +1572,22 @@ int do_syslog(int type, char __user *buf, int len, int source) - * for pending data, not the size; return the count of - * records, not the length. - */ -- error = log_next_seq - syslog_seq; -+ error = prb_next_seq(prb) - syslog_seq; +@@ -1643,7 +1669,6 @@ int do_syslog(int type, char __user *buf, int len, int source) + error = prb_next_seq(prb) - syslog_seq; } else { -- u64 seq = syslog_seq; -- u32 idx = syslog_idx; bool time = syslog_partial ? 
syslog_time : printk_time; -- -- while (seq < log_next_seq) { -- struct printk_log *msg = log_from_idx(idx); -- -- error += msg_print_text(msg, true, time, NULL, -- 0); -+ struct printk_info info; -+ unsigned int line_count; -+ u64 seq; -+ -+ prb_for_each_info(syslog_seq, prb, seq, &info, -+ &line_count) { -+ error += get_record_print_text_size(&info, line_count, -+ true, time); - time = printk_time; -- idx = log_next(idx); -- seq++; +- struct printk_info info; + unsigned int line_count; + u64 seq; + +@@ -1655,7 +1680,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - logbuf_unlock_irq(); -+ syslog_unlock_irq(); ++ spin_unlock_irq(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: -@@ -1657,178 +1607,134 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) +@@ -1674,202 +1699,172 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) + return do_syslog(type, buf, len, SYSLOG_FROM_READER); } - /* +-/* - * Special console_lock variants that help to reduce the risk of soft-lockups. - * They allow to pass console_lock to another printk() call using a busy wait. -+ * The per-cpu sprint buffers are used with interrupts disabled, so each CPU -+ * only requires 2 buffers: for non-NMI and NMI contexts. Recursive printk() -+ * calls are handled by the global sprint buffers. - */ -+#define SPRINT_CTX_DEPTH 2 - +- */ +- -#ifdef CONFIG_LOCKDEP -static struct lockdep_map console_owner_dep_map = { - .name = "console_owner" -+/* Static sprint buffers for early boot (only 1 CPU) and recursion. */ -+static DECLARE_BITMAP(sprint_global_buffer_map, SPRINT_CTX_DEPTH); -+static char sprint_global_buffer[SPRINT_CTX_DEPTH][PREFIX_MAX + LOG_LINE_MAX]; -+ -+struct sprint_buffers { -+ char buf[SPRINT_CTX_DEPTH][PREFIX_MAX + LOG_LINE_MAX]; -+ atomic_t index; - }; +-}; -#endif - +- -static DEFINE_RAW_SPINLOCK(console_owner_lock); -static struct task_struct *console_owner; -static bool console_waiter; -+static DEFINE_PER_CPU(struct sprint_buffers, percpu_sprint_buffers); ++int printk_delay_msec __read_mostly; -/** - * console_lock_spinning_enable - mark beginning of code where another @@ -13552,31 +12149,25 @@ index 9b75f6bfc333..78a277ea5c35 100644 - * the section where the console_lock owner can not sleep, because - * there may be a waiter spinning (like a spinlock). Also it must be - * ready to hand over the lock at the end of the section. -+/* -+ * Acquire an unused buffer, returning its index. If no buffer is -+ * available, @count is returned. - */ +- */ -static void console_lock_spinning_enable(void) -+static int _get_sprint_buf(unsigned long *map, int count) ++static inline void printk_delay(int level) { - raw_spin_lock(&console_owner_lock); - console_owner = current; - raw_spin_unlock(&console_owner_lock); -+ int index; -+ -+ do { -+ index = find_first_zero_bit(map, count); -+ if (index == count) -+ break; -+ /* -+ * Guarantee map changes are ordered for the other CPUs. -+ * Pairs with clear_bit() in _put_sprint_buf(). -+ */ -+ } while (test_and_set_bit(index, map)); ++ boot_delay_msec(level); - /* The waiter may spin on us after setting console_owner */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); -+ return index; ++ if (unlikely(printk_delay_msec)) { ++ int m = printk_delay_msec; ++ ++ while (m--) { ++ mdelay(1); ++ touch_nmi_watchdog(); ++ } ++ } } -/** @@ -13595,35 +12186,69 @@ index 9b75f6bfc333..78a277ea5c35 100644 - * Return: 1 if the lock rights were passed, 0 otherwise. 
- */ -static int console_lock_spinning_disable_and_check(void) -+/* Mark the buffer @index as unused. */ -+static void _put_sprint_buf(unsigned long *map, unsigned int count, unsigned int index) ++static bool kernel_sync_mode(void) { - int waiter; -- ++ return (oops_in_progress || sync_mode); ++} + - raw_spin_lock(&console_owner_lock); - waiter = READ_ONCE(console_waiter); - console_owner = NULL; - raw_spin_unlock(&console_owner_lock); -- ++static bool console_can_sync(struct console *con) ++{ ++ if (!(con->flags & CON_ENABLED)) ++ return false; ++ if (con->write_atomic && kernel_sync_mode()) ++ return true; ++ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) ++ return true; ++ if (con->write && (con->flags & CON_BOOT) && !con->thread) ++ return true; ++ return false; ++} + - if (!waiter) { - spin_release(&console_owner_dep_map, _THIS_IP_); - return 0; - } -- ++static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len) ++{ ++ if (!(con->flags & CON_ENABLED)) ++ return false; ++ if (con->write_atomic && kernel_sync_mode()) ++ con->write_atomic(con, text, text_len); ++ else if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) ++ con->write_atomic(con, text, text_len); ++ else if (con->write && (con->flags & CON_BOOT) && !con->thread) ++ con->write(con, text, text_len); ++ else ++ return false; + - /* The waiter is now free to continue */ - WRITE_ONCE(console_waiter, false); -- ++ return true; ++} + - spin_release(&console_owner_dep_map, _THIS_IP_); -- - /* ++static bool have_atomic_console(void) ++{ ++ struct console *con; + +- /* - * Hand off console_lock to waiter. The waiter will perform - * the up(). After this, the waiter is the console_lock owner. -+ * Guarantee map changes are ordered for the other CPUs. -+ * Pairs with test_and_set_bit() in _get_sprint_buf(). - */ +- */ - mutex_release(&console_lock_dep_map, _THIS_IP_); - return 1; -+ clear_bit(index, map); ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ if (con->write_atomic) ++ return true; ++ } ++ return false; } -/** @@ -13633,31 +12258,27 @@ index 9b75f6bfc333..78a277ea5c35 100644 - * owner is running in specially marked sections. It means that - * the current owner is running and cannot reschedule until it - * is ready to lose the lock. -+/* -+ * Get a buffer sized PREFIX_MAX+LOG_LINE_MAX for sprinting. On success, @id -+ * is set and interrupts are disabled. @id is used to put back the buffer. - * +- * - * Return: 1 if we got the lock, 0 othrewise -+ * @id is non-negative for per-cpu buffers, negative for global buffers. 
- */ +- */ -static int console_trylock_spinning(void) -+static char *get_sprint_buf(int *id, unsigned long *flags) ++static bool print_sync(struct console *con, u64 *seq) { - struct task_struct *owner = NULL; - bool waiter; - bool spin = false; - unsigned long flags; -+ struct sprint_buffers *bufs; -+ unsigned int index; -+ unsigned int cpu; ++ struct printk_info info; ++ struct printk_record r; ++ size_t text_len; - if (console_trylock()) - return 1; -+ local_irq_save(*flags); -+ cpu = get_cpu(); ++ prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf)); - printk_safe_enter_irqsave(flags); -+ if (printk_percpu_data_ready()) { ++ if (!prb_read_valid(prb, *seq, &r)) ++ return false; - raw_spin_lock(&console_owner_lock); - owner = READ_ONCE(console_owner); @@ -13665,22 +12286,11 @@ index 9b75f6bfc333..78a277ea5c35 100644 - if (!waiter && owner && owner != current) { - WRITE_ONCE(console_waiter, true); - spin = true; -+ /* -+ * First try with per-cpu pool. Note that the last -+ * buffer is reserved for NMI context. -+ */ -+ bufs = per_cpu_ptr(&percpu_sprint_buffers, cpu); -+ index = atomic_read(&bufs->index); -+ if (index < (SPRINT_CTX_DEPTH - 1) || -+ (in_nmi() && index < SPRINT_CTX_DEPTH)) { -+ atomic_set(&bufs->index, index + 1); -+ *id = cpu; -+ return &bufs->buf[index][0]; -+ } - } +- } - raw_spin_unlock(&console_owner_lock); ++ text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); - /* +- /* - * If there is an active printk() writing to the - * consoles, instead of having it write our data too, - * see if we can offload that load from the active @@ -13688,21 +12298,13 @@ index 9b75f6bfc333..78a277ea5c35 100644 - * Go into a spin only if there isn't already a waiter - * spinning, and there is an active printer, and - * that active printer isn't us (recursive printk?). -+ * Fallback to global pool. -+ * -+ * The global pool will only ever be used if per-cpu data is not ready -+ * yet or printk recurses. Recursion will not occur unless printk is -+ * having internal issues. - */ +- */ - if (!spin) { - printk_safe_exit_irqrestore(flags); - return 0; -+ index = _get_sprint_buf(sprint_global_buffer_map, SPRINT_CTX_DEPTH); -+ if (index != SPRINT_CTX_DEPTH) { -+ /* Convert to global buffer representation. */ -+ *id = -index - 1; -+ return &sprint_global_buffer[index][0]; - } +- } ++ if (!call_sync_console_driver(con, &con->sync_buf[0], text_len)) ++ return false; - /* We spin waiting for the owner to release us */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); @@ -13710,7 +12312,8 @@ index 9b75f6bfc333..78a277ea5c35 100644 - while (READ_ONCE(console_waiter)) - cpu_relax(); - spin_release(&console_owner_dep_map, _THIS_IP_); -- ++ *seq = r.info->seq; + - printk_safe_exit_irqrestore(flags); - /* - * The owner passed the console lock to us. @@ -13719,12 +12322,16 @@ index 9b75f6bfc333..78a277ea5c35 100644 - * complain. - */ - mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); -- ++ touch_softlockup_watchdog_sync(); ++ clocksource_touch_watchdog(); ++ rcu_cpu_stall_reset(); ++ touch_nmi_watchdog(); + - return 1; -+ /* Failed to get a buffer. */ -+ put_cpu(); -+ local_irq_restore(*flags); -+ return NULL; ++ if (text_len) ++ printk_delay(r.info->level); ++ ++ return true; } -/* @@ -13734,15 +12341,44 @@ index 9b75f6bfc333..78a277ea5c35 100644 - */ -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) -+/* Put back an sprint buffer and restore interrupts. 
*/ -+static void put_sprint_buf(int id, unsigned long flags) ++static void print_sync_until(struct console *con, u64 seq) { +- static char dropped_text[64]; +- size_t dropped_len = 0; - struct console *con; -- ++ unsigned int flags; ++ u64 printk_seq; + - trace_console_rcuidle(text, len); -+ struct sprint_buffers *bufs; -+ unsigned int index; -+ unsigned int cpu; ++ console_atomic_lock(&flags); ++ for (;;) { ++ printk_seq = atomic64_read(&con->printk_seq); ++ if (printk_seq >= seq) ++ break; ++ if (!print_sync(con, &printk_seq)) ++ break; ++ atomic64_set(&con->printk_seq, printk_seq + 1); ++ } ++ console_atomic_unlock(flags); ++} + +- if (!console_drivers) +- return; ++#ifdef CONFIG_PRINTK_NMI ++#define NUM_RECURSION_CTX 2 ++#else ++#define NUM_RECURSION_CTX 1 ++#endif + +- if (console_dropped) { +- dropped_len = snprintf(dropped_text, sizeof(dropped_text), +- "** %lu printk messages dropped **\n", +- console_dropped); +- console_dropped = 0; +- } ++struct printk_recursion { ++ char count[NUM_RECURSION_CTX]; ++}; - for_each_console(con) { - if (exclusive_console && con != exclusive_console) @@ -13756,350 +12392,181 @@ index 9b75f6bfc333..78a277ea5c35 100644 - continue; - if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); -- else +- else { +- if (dropped_len) +- con->write(con, dropped_text, dropped_len); - con->write(con, text, len); -+ if (id >= 0) { -+ cpu = id; -+ bufs = per_cpu_ptr(&percpu_sprint_buffers, cpu); -+ index = atomic_read(&bufs->index); -+ atomic_set(&bufs->index, index - 1); +- } ++static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion); ++static char printk_recursion_count[NUM_RECURSION_CTX]; ++ ++static char *printk_recursion_counter(void) ++{ ++ struct printk_recursion *rec; ++ char *count; ++ ++ if (!printk_percpu_data_ready()) { ++ count = &printk_recursion_count[0]; + } else { -+ /* Convert from global buffer representation. */ -+ index = -id - 1; -+ _put_sprint_buf(sprint_global_buffer_map, -+ SPRINT_CTX_DEPTH, index); - } ++ rec = this_cpu_ptr(&percpu_printk_recursion); + -+ put_cpu(); -+ local_irq_restore(flags); - } ++ count = &rec->count[0]; + } +-} - int printk_delay_msec __read_mostly; +-int printk_delay_msec __read_mostly; ++#ifdef CONFIG_PRINTK_NMI ++ if (in_nmi()) ++ count++; ++#endif ++ ++ return count; ++} -static inline void printk_delay(void) -+static inline void printk_delay(int level) ++static bool printk_enter_irqsave(unsigned long *flags) { -+ boot_delay_msec(level); -+ - if (unlikely(printk_delay_msec)) { - int m = printk_delay_msec; +- if (unlikely(printk_delay_msec)) { +- int m = printk_delay_msec; ++ char *count; -@@ -1839,115 +1745,155 @@ static inline void printk_delay(void) +- while (m--) { +- mdelay(1); +- touch_nmi_watchdog(); +- } ++ local_irq_save(*flags); ++ count = printk_recursion_counter(); ++ /* Only 1 level of recursion allowed. */ ++ if (*count > 1) { ++ local_irq_restore(*flags); ++ return false; } ++ (*count)++; ++ ++ return true; ++} ++ ++static void printk_exit_irqrestore(unsigned long flags) ++{ ++ char *count; ++ ++ count = printk_recursion_counter(); ++ (*count)--; ++ local_irq_restore(flags); } --static inline u32 printk_caller_id(void) -+static bool kernel_sync_mode(void) - { -- return in_task() ? task_pid_nr(current) : -- 0x80000000 + raw_smp_processor_id(); -+ return (oops_in_progress || sync_mode); - } - --/* -- * Continuation lines are buffered, and not committed to the record buffer -- * until the line is complete, or a race forces it. 
The line fragments -- * though, are printed immediately to the consoles to ensure everything has -- * reached the console in case of a kernel crash. -- */ --static struct cont { -- char buf[LOG_LINE_MAX]; -- size_t len; /* length == 0 means unused buffer */ -- u32 caller_id; /* printk_caller_id() of first print */ -- u64 ts_nsec; /* time of first print */ -- u8 level; /* log level of first message */ -- u8 facility; /* log facility of first message */ -- enum log_flags flags; /* prefix, newline flags */ --} cont; -- --static void cont_flush(void) --{ -- if (cont.len == 0) -- return; -- -- log_store(cont.caller_id, cont.facility, cont.level, cont.flags, -- cont.ts_nsec, NULL, 0, cont.buf, cont.len); -- cont.len = 0; -+static bool console_can_sync(struct console *con) -+{ -+ if (!(con->flags & CON_ENABLED)) -+ return false; -+ if (con->write_atomic && kernel_sync_mode()) -+ return true; -+ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) -+ return true; -+ if (con->write && (con->flags & CON_BOOT) && !con->thread) -+ return true; -+ return false; + static inline u32 printk_caller_id(void) +@@ -1950,20 +1945,24 @@ static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lfl } --static bool cont_add(u32 caller_id, int facility, int level, -- enum log_flags flags, const char *text, size_t len) -+static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len) + __printf(4, 0) +-int vprintk_store(int facility, int level, +- const struct dev_printk_info *dev_info, +- const char *fmt, va_list args) ++static int vprintk_store(int facility, int level, ++ const struct dev_printk_info *dev_info, ++ const char *fmt, va_list args) { -- /* If the line gets too long, split it up in separate records. */ -- if (cont.len + len > sizeof(cont.buf)) { -- cont_flush(); -+ if (!(con->flags & CON_ENABLED)) - return false; -- } -+ if (con->write_atomic && kernel_sync_mode()) -+ con->write_atomic(con, text, text_len); -+ else if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) -+ con->write_atomic(con, text, text_len); -+ else if (con->write && (con->flags & CON_BOOT) && !con->thread) -+ con->write(con, text, text_len); -+ else -+ return false; -+ -+ return true; -+} + const u32 caller_id = printk_caller_id(); + struct prb_reserved_entry e; + enum log_flags lflags = 0; ++ bool final_commit = false; + struct printk_record r; ++ unsigned long irqflags; + u16 trunc_msg_len = 0; + char prefix_buf[8]; + u16 reserve_size; + va_list args2; + u16 text_len; ++ int ret = 0; + u64 ts_nsec; ++ u64 seq; + + /* + * Since the duration of printk() can vary depending on the message +@@ -1973,6 +1972,9 @@ int vprintk_store(int facility, int level, + */ + ts_nsec = local_clock(); + ++ if (!printk_enter_irqsave(&irqflags)) ++ return 0; + -+static bool any_console_can_sync(void) -+{ -+ struct console *con; + /* + * The sprintf needs to come first since the syslog prefix might be + * passed in as a parameter. 
An extra byte must be reserved so that +@@ -1999,6 +2001,7 @@ int vprintk_store(int facility, int level, + if (lflags & LOG_CONT) { + prb_rec_init_wr(&r, reserve_size); + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { ++ seq = r.info->seq; + text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, + facility, &lflags, fmt, args); + r.info->text_len += text_len; +@@ -2006,11 +2009,13 @@ int vprintk_store(int facility, int level, + if (lflags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); ++ final_commit = true; + } else { + prb_commit(&e); + } -- if (!cont.len) { -- cont.facility = facility; -- cont.level = level; -- cont.caller_id = caller_id; -- cont.ts_nsec = local_clock(); -- cont.flags = flags; -+ for_each_console(con) { -+ if (console_can_sync(con)) -+ return true; +- return text_len; ++ ret = text_len; ++ goto out; + } } -+ return false; -+} -- memcpy(cont.buf + cont.len, text, len); -- cont.len += len; -+static bool have_atomic_console(void) -+{ -+ struct console *con; - -- // The original flags come from the first line, -- // but later continuations can add a newline. -- if (flags & LOG_NEWLINE) { -- cont.flags |= LOG_NEWLINE; -- cont_flush(); -+ for_each_console(con) { -+ if (!(con->flags & CON_ENABLED)) -+ continue; -+ if (con->write_atomic) -+ return true; - } -+ return false; -+} -+ -+static bool print_sync(struct console *con, char *buf, size_t buf_size, u64 *seq) -+{ -+ struct printk_info info; -+ struct printk_record r; -+ size_t text_len; -+ -+ prb_rec_init_rd(&r, &info, buf, buf_size); -+ -+ if (!prb_read_valid(prb, *seq, &r)) -+ return false; -+ -+ text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); -+ -+ if (!call_sync_console_driver(con, buf, text_len)) -+ return false; -+ -+ *seq = r.info->seq; -+ -+ touch_softlockup_watchdog_sync(); -+ clocksource_touch_watchdog(); -+ rcu_cpu_stall_reset(); -+ touch_nmi_watchdog(); -+ -+ if (text_len) -+ printk_delay(r.info->level); - - return true; - } - --static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) -+static void print_sync_until(u64 seq, struct console *con, char *buf, size_t buf_size) - { -- const u32 caller_id = printk_caller_id(); -+ unsigned int flags; -+ u64 printk_seq; - -- /* -- * If an earlier line was buffered, and we're a continuation -- * write from the same context, try to add it to the buffer. 
-- */ -- if (cont.len) { -- if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { -- if (cont_add(caller_id, facility, level, lflags, text, text_len)) -- return text_len; -+ if (!con) { -+ for_each_console(con) { -+ if (console_can_sync(con)) -+ print_sync_until(seq, con, buf, buf_size); - } -- /* Otherwise, make sure it's flushed */ -- cont_flush(); -+ return; - } +@@ -2026,9 +2031,11 @@ int vprintk_store(int facility, int level, -- /* Skip empty continuation lines that couldn't be added - they just flush */ -- if (!text_len && (lflags & LOG_CONT)) -- return 0; -- -- /* If it doesn't end in a newline, try to buffer the current line */ -- if (!(lflags & LOG_NEWLINE)) { -- if (cont_add(caller_id, facility, level, lflags, text, text_len)) -- return text_len; -+ console_atomic_lock(&flags); -+ for (;;) { -+ printk_seq = atomic64_read(&con->printk_seq); -+ if (printk_seq >= seq) -+ break; -+ if (!print_sync(con, buf, buf_size, &printk_seq)) -+ break; -+ atomic64_set(&con->printk_seq, printk_seq + 1); + prb_rec_init_wr(&r, reserve_size + trunc_msg_len); + if (!prb_reserve(&e, prb, &r)) +- return 0; ++ goto out; } -+ console_atomic_unlock(flags); -+} - -- /* Store it in the record log */ -- return log_store(caller_id, facility, level, lflags, 0, -- dict, dictlen, text, text_len); -+static inline u32 printk_caller_id(void) -+{ -+ return in_task() ? task_pid_nr(current) : -+ 0x80000000 + raw_smp_processor_id(); - } - --/* Must be called under logbuf_lock. */ --int vprintk_store(int facility, int level, -- const char *dict, size_t dictlen, -- const char *fmt, va_list args) -+__printf(4, 0) -+static int vprintk_store(int facility, int level, -+ const struct dev_printk_info *dev_info, -+ const char *fmt, va_list args) - { -- static char textbuf[LOG_LINE_MAX]; -- char *text = textbuf; -- size_t text_len; -+ const u32 caller_id = printk_caller_id(); -+ struct prb_reserved_entry e; - enum log_flags lflags = 0; -+ bool final_commit = false; -+ unsigned long irqflags; -+ struct printk_record r; -+ u16 trunc_msg_len = 0; -+ int sprint_id; -+ u16 text_len; -+ u64 ts_nsec; -+ int ret = 0; -+ char *text; -+ u64 seq; -+ -+ ts_nsec = local_clock(); -+ -+ /* No buffer is available if printk has recursed too much. */ -+ text = get_sprint_buf(&sprint_id, &irqflags); -+ if (!text) -+ return 0; - /* - * The printf needs to come first; we need the syslog - * prefix which might be passed-in as a parameter. 
- */ -- text_len = vscnprintf(text, sizeof(textbuf), fmt, args); -+ text_len = vscnprintf(text, LOG_LINE_MAX, fmt, args); - - /* mark and strip a trailing newline */ - if (text_len && text[text_len-1] == '\n') { -@@ -1977,76 +1923,115 @@ int vprintk_store(int facility, int level, - if (level == LOGLEVEL_DEFAULT) - level = default_message_loglevel; - -- if (dict) -+ if (dev_info) - lflags |= LOG_NEWLINE; - -- return log_output(facility, level, lflags, -- dict, dictlen, text, text_len); -+ if (lflags & LOG_CONT) { -+ prb_rec_init_wr(&r, text_len); -+ if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { -+ seq = r.info->seq; -+ memcpy(&r.text_buf[r.info->text_len], text, text_len); -+ r.info->text_len += text_len; -+ if (lflags & LOG_NEWLINE) { -+ r.info->flags |= LOG_NEWLINE; -+ prb_final_commit(&e); -+ final_commit = true; -+ } else { -+ prb_commit(&e); -+ } -+ ret = text_len; -+ goto out; -+ } -+ } -+ -+ /* Store it in the record log */ -+ -+ prb_rec_init_wr(&r, text_len); -+ -+ if (!prb_reserve(&e, prb, &r)) { -+ /* truncate the message if it is too long for empty buffer */ -+ truncate_msg(&text_len, &trunc_msg_len); -+ prb_rec_init_wr(&r, text_len + trunc_msg_len); -+ /* survive when the log buffer is too small for trunc_msg */ -+ if (!prb_reserve(&e, prb, &r)) -+ goto out; -+ } -+ + seq = r.info->seq; + -+ /* fill message */ -+ memcpy(&r.text_buf[0], text, text_len); -+ if (trunc_msg_len) -+ memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); -+ r.info->text_len = text_len + trunc_msg_len; -+ r.info->facility = facility; -+ r.info->level = level & 7; -+ r.info->flags = lflags & 0x1f; -+ r.info->ts_nsec = ts_nsec; -+ r.info->caller_id = caller_id; -+ if (dev_info) -+ memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); -+ -+ /* insert message */ -+ if ((lflags & LOG_CONT) || !(lflags & LOG_NEWLINE)) { -+ prb_commit(&e); + /* fill message */ + text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args); + if (trunc_msg_len) +@@ -2043,12 +2050,27 @@ int vprintk_store(int facility, int level, + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); + + /* A message without a trailing newline can be continued. 
*/ +- if (!(lflags & LOG_NEWLINE)) ++ if (!(lflags & LOG_NEWLINE)) { + prb_commit(&e); +- else + } else { -+ prb_final_commit(&e); + prb_final_commit(&e); + final_commit = true; + } + + ret = text_len + trunc_msg_len; +out: + /* only the kernel may perform synchronous printing */ -+ if (facility == 0 && final_commit && any_console_can_sync()) -+ print_sync_until(seq + 1, NULL, text, PREFIX_MAX + LOG_LINE_MAX); ++ if (facility == 0 && final_commit) { ++ struct console *con; + -+ put_sprint_buf(sprint_id, irqflags); ++ for_each_console(con) { ++ if (console_can_sync(con)) ++ print_sync_until(con, seq + 1); ++ } ++ } + +- return (text_len + trunc_msg_len); ++ printk_exit_irqrestore(irqflags); + return ret; } asmlinkage int vprintk_emit(int facility, int level, -- const char *dict, size_t dictlen, -+ const struct dev_printk_info *dev_info, +@@ -2056,59 +2078,43 @@ asmlinkage int vprintk_emit(int facility, int level, const char *fmt, va_list args) { int printed_len; -- bool in_sched = false, pending_output; +- bool in_sched = false; - unsigned long flags; -- u64 curr_log_seq; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -14110,19 +12577,16 @@ index 9b75f6bfc333..78a277ea5c35 100644 level = LOGLEVEL_DEFAULT; - in_sched = true; - } - +- - boot_delay_msec(level); - printk_delay(); -- -- /* This stops the holder of console_sem just where we want him */ -- logbuf_lock_irqsave(flags); -- curr_log_seq = log_next_seq; -- printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); -- pending_output = (curr_log_seq != log_next_seq); -- logbuf_unlock_irqrestore(flags); + +- printk_safe_enter_irqsave(flags); + printed_len = vprintk_store(facility, level, dev_info, fmt, args); +- printk_safe_exit_irqrestore(flags); - - /* If called from the scheduler, we can not call up(). */ -- if (!in_sched && pending_output) { +- if (!in_sched) { - /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to @@ -14138,17 +12602,14 @@ index 9b75f6bfc333..78a277ea5c35 100644 - console_unlock(); - preempt_enable(); - } -+ printed_len = vprintk_store(facility, level, dev_info, fmt, args); -- if (pending_output) -- wake_up_klogd(); -+ wake_up_klogd(); + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); -asmlinkage int vprintk(const char *fmt, va_list args) -+ __printf(1, 0) ++__printf(1, 0) +static int vprintk_default(const char *fmt, va_list args) { - return vprintk_func(fmt, args); @@ -14160,182 +12621,323 @@ index 9b75f6bfc333..78a277ea5c35 100644 +__printf(1, 0) +static int vprintk_func(const char *fmt, va_list args) { -- return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); +- return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); +#ifdef CONFIG_KGDB_KDB + /* Allow to pass printk() to kdb but avoid a recursion. */ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) + return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); +#endif + return vprintk_default(fmt, args); - } --EXPORT_SYMBOL_GPL(vprintk_default); ++} + +asmlinkage int vprintk(const char *fmt, va_list args) +{ + return vprintk_func(fmt, args); -+} + } +-EXPORT_SYMBOL_GPL(vprintk_default); +EXPORT_SYMBOL(vprintk); /** * printk - print a kernel message -@@ -2088,30 +2073,26 @@ EXPORT_SYMBOL(printk); - #define PREFIX_MAX 0 - #define printk_time false +@@ -2144,42 +2150,162 @@ asmlinkage __visible int printk(const char *fmt, ...) 
+ } + EXPORT_SYMBOL(printk); -+#define prb_read_valid(rb, seq, r) false -+#define prb_first_valid_seq(rb) 0 -+ - static u64 syslog_seq; --static u32 syslog_idx; --static u64 console_seq; --static u32 console_idx; --static u64 exclusive_console_stop_seq; --static u64 log_first_seq; --static u32 log_first_idx; --static u64 log_next_seq; --static char *log_text(const struct printk_log *msg) { return NULL; } --static char *log_dict(const struct printk_log *msg) { return NULL; } --static struct printk_log *log_from_idx(u32 idx) { return NULL; } --static u32 log_next(u32 idx) { return 0; } --static ssize_t msg_print_ext_header(char *buf, size_t size, -- struct printk_log *msg, -- u64 seq) { return 0; } -+ -+static size_t record_print_text(const struct printk_record *r, -+ bool syslog, bool time) -+{ -+ return 0; -+} -+static ssize_t info_print_ext_header(char *buf, size_t size, -+ struct printk_info *info) +-#else /* CONFIG_PRINTK */ ++static int printk_kthread_func(void *data) +{ -+ return 0; -+} - static ssize_t msg_print_ext_body(char *buf, size_t size, -- char *dict, size_t dict_len, -- char *text, size_t text_len) { return 0; } --static void console_lock_spinning_enable(void) { } --static int console_lock_spinning_disable_and_check(void) { return 0; } -+ char *text, size_t text_len, -+ struct dev_printk_info *dev_info) { return 0; } - static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) {} --static size_t msg_print_text(const struct printk_log *msg, bool syslog, -- bool time, char *buf, size_t size) { return 0; } - static bool suppress_message_printing(int level) { return false; } ++ struct console *con = data; ++ unsigned long dropped = 0; ++ char *dropped_text = NULL; ++ struct printk_info info; ++ struct printk_record r; ++ char *ext_text = NULL; ++ size_t dropped_len; ++ int ret = -ENOMEM; ++ char *text = NULL; ++ char *write_text; ++ u64 printk_seq; ++ size_t len; ++ int error; ++ u64 seq; - #endif /* CONFIG_PRINTK */ -@@ -2350,34 +2331,6 @@ int is_console_locked(void) - } - EXPORT_SYMBOL(is_console_locked); +-#define LOG_LINE_MAX 0 +-#define PREFIX_MAX 0 +-#define printk_time false ++ if (con->flags & CON_EXTENDED) { ++ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); ++ if (!ext_text) ++ goto out; ++ } ++ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ dropped_text = kmalloc(64, GFP_KERNEL); ++ if (!text || !dropped_text) ++ goto out; --/* -- * Check if we have any console that is capable of printing while cpu is -- * booting or shutting down. Requires console_sem. -- */ --static int have_callable_console(void) +-#define prb_read_valid(rb, seq, r) false +-#define prb_first_valid_seq(rb) 0 ++ if (con->flags & CON_EXTENDED) ++ write_text = ext_text; ++ else ++ write_text = text; + +-static u64 syslog_seq; +-static u64 console_seq; +-static u64 exclusive_console_stop_seq; +-static unsigned long console_dropped; ++ seq = atomic64_read(&con->printk_seq); + +-static size_t record_print_text(const struct printk_record *r, +- bool syslog, bool time) -{ -- struct console *con; -- -- for_each_console(con) -- if ((con->flags & CON_ENABLED) && -- (con->flags & CON_ANYTIME)) -- return 1; -- - return 0; -} -- --/* -- * Can we actually use the console at this time on this cpu? -- * -- * Console drivers may assume that per-cpu resources have been allocated. So -- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't -- * call them until this CPU is officially up. 
-- */ --static inline int can_use_console(void) +-static ssize_t info_print_ext_header(char *buf, size_t size, +- struct printk_info *info) -{ -- return cpu_online(raw_smp_processor_id()) || have_callable_console(); +- return 0; -} -- - /** - * console_unlock - unlock the console system - * -@@ -2394,143 +2347,14 @@ static inline int can_use_console(void) - */ - void console_unlock(void) - { -- static char ext_text[CONSOLE_EXT_LOG_MAX]; -- static char text[LOG_LINE_MAX + PREFIX_MAX]; -- unsigned long flags; -- bool do_cond_resched, retry; -- - if (console_suspended) { - up_console_sem(); - return; - } +-static ssize_t msg_print_ext_body(char *buf, size_t size, +- char *text, size_t text_len, +- struct dev_printk_info *dev_info) { return 0; } +-static void console_lock_spinning_enable(void) { } +-static int console_lock_spinning_disable_and_check(void) { return 0; } +-static void call_console_drivers(const char *ext_text, size_t ext_len, +- const char *text, size_t len) {} +-static bool suppress_message_printing(int level) { return false; } ++ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); -- /* -- * Console drivers are called with interrupts disabled, so -- * @console_may_schedule should be cleared before; however, we may -- * end up dumping a lot of lines, for example, if called from -- * console registration path, and should invoke cond_resched() -- * between lines if allowable. Not doing so can cause a very long -- * scheduling stall on a slow console leading to RCU stall and -- * softlockup warnings which exacerbate the issue with more -- * messages practically incapacitating the system. -- * -- * console_trylock() is not able to detect the preemptive -- * context reliably. Therefore the value must be stored before -- * and cleared after the the "again" goto label. -- */ -- do_cond_resched = console_may_schedule; --again: -- console_may_schedule = 0; -- -- /* -- * We released the console_sem lock, so we need to recheck if -- * cpu is online and (if not) is there at least one CON_ANYTIME -- * console. -- */ -- if (!can_use_console()) { -- console_locked = 0; -- up_console_sem(); -- return; -- } -- -- for (;;) { -- struct printk_log *msg; -- size_t ext_len = 0; -- size_t len; -- -- printk_safe_enter_irqsave(flags); -- raw_spin_lock(&logbuf_lock); -- if (console_seq < log_first_seq) { -- len = snprintf(text, sizeof(text), -- "** %llu printk messages dropped **\n", -- log_first_seq - console_seq); -- -- /* messages are gone, move to first one */ -- console_seq = log_first_seq; -- console_idx = log_first_idx; -- } else { -- len = 0; -- } --skip: -- if (console_seq == log_next_seq) -- break; -- -- msg = log_from_idx(console_idx); -- if (suppress_message_printing(msg->level)) { -- /* -- * Skip record we have buffered and already printed -- * directly to the console when we received it, and -- * record that has level above the console loglevel. 
-- */ -- console_idx = log_next(console_idx); -- console_seq++; -- goto skip; -- } +-#endif /* CONFIG_PRINTK */ ++ for (;;) { ++ error = wait_event_interruptible(log_wait, ++ prb_read_valid(prb, seq, &r) || kthread_should_stop()); + +-#ifdef CONFIG_EARLY_PRINTK ++ if (kthread_should_stop()) ++ break; ++ ++ if (error) ++ continue; ++ ++ if (seq != r.info->seq) { ++ dropped += r.info->seq - seq; ++ seq = r.info->seq; ++ } ++ ++ seq++; ++ ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ ++ if (suppress_message_printing(r.info->level)) ++ continue; ++ ++ if (con->flags & CON_EXTENDED) { ++ len = info_print_ext_header(ext_text, ++ CONSOLE_EXT_LOG_MAX, ++ r.info); ++ len += msg_print_ext_body(ext_text + len, ++ CONSOLE_EXT_LOG_MAX - len, ++ &r.text_buf[0], r.info->text_len, ++ &r.info->dev_info); ++ } else { ++ len = record_print_text(&r, ++ console_msg_format & MSG_FORMAT_SYSLOG, ++ printk_time); ++ } ++ ++ printk_seq = atomic64_read(&con->printk_seq); ++ ++ console_lock(); ++ console_may_schedule = 0; ++ ++ if (kernel_sync_mode() && con->write_atomic) { ++ console_unlock(); ++ break; ++ } ++ ++ if (!(con->flags & CON_EXTENDED) && dropped) { ++ dropped_len = snprintf(dropped_text, 64, ++ "** %lu printk messages dropped **\n", ++ dropped); ++ dropped = 0; ++ ++ con->write(con, dropped_text, dropped_len); ++ printk_delay(r.info->level); ++ } ++ ++ con->write(con, write_text, len); ++ if (len) ++ printk_delay(r.info->level); ++ ++ atomic64_cmpxchg_relaxed(&con->printk_seq, printk_seq, seq); ++ ++ console_unlock(); ++ } ++out: ++ kfree(dropped_text); ++ kfree(text); ++ kfree(ext_text); ++ pr_info("%sconsole [%s%d]: printing thread stopped\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return ret; ++} ++ ++/* Must be called within console_lock(). */ ++static void start_printk_kthread(struct console *con) ++{ ++ con->thread = kthread_run(printk_kthread_func, con, ++ "pr/%s%d", con->name, con->index); ++ if (IS_ERR(con->thread)) { ++ pr_err("%sconsole [%s%d]: unable to start printing thread\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return; ++ } ++ pr_info("%sconsole [%s%d]: printing thread started\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++} ++ ++/* protected by console_lock */ ++static bool kthreads_started; ++ ++/* Must be called within console_lock(). */ ++static void console_try_thread(struct console *con) ++{ ++ if (kthreads_started) { ++ start_printk_kthread(con); ++ return; ++ } ++ ++ /* ++ * The printing threads have not been started yet. If this console ++ * can print synchronously, print all unprinted messages. ++ */ ++ if (console_can_sync(con)) ++ print_sync_until(con, prb_next_seq(prb)); ++} ++ ++#else /* CONFIG_PRINTK */ ++ ++#define prb_first_valid_seq(rb) 0 ++#define prb_next_seq(rb) 0 ++ ++#define console_try_thread(con) ++ ++#endif /* CONFIG_PRINTK */ ++ ++#ifdef CONFIG_EARLY_PRINTK + struct console *early_console; + + asmlinkage __visible void early_printk(const char *fmt, ...) +@@ -2420,34 +2546,6 @@ int is_console_locked(void) + } + EXPORT_SYMBOL(is_console_locked); + +-/* +- * Check if we have any console that is capable of printing while cpu is +- * booting or shutting down. Requires console_sem. +- */ +-static int have_callable_console(void) +-{ +- struct console *con; +- +- for_each_console(con) +- if ((con->flags & CON_ENABLED) && +- (con->flags & CON_ANYTIME)) +- return 1; +- +- return 0; +-} +- +-/* +- * Can we actually use the console at this time on this cpu? 
+- * +- * Console drivers may assume that per-cpu resources have been allocated. So +- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't +- * call them until this CPU is officially up. +- */ +-static inline int can_use_console(void) +-{ +- return cpu_online(raw_smp_processor_id()) || have_callable_console(); +-} +- + /** + * console_unlock - unlock the console system + * +@@ -2464,142 +2562,14 @@ static inline int can_use_console(void) + */ + void console_unlock(void) + { +- static char ext_text[CONSOLE_EXT_LOG_MAX]; +- static char text[LOG_LINE_MAX + PREFIX_MAX]; +- unsigned long flags; +- bool do_cond_resched, retry; +- struct printk_info info; +- struct printk_record r; +- + if (console_suspended) { + up_console_sem(); + return; + } + +- prb_rec_init_rd(&r, &info, text, sizeof(text)); +- +- /* +- * Console drivers are called with interrupts disabled, so +- * @console_may_schedule should be cleared before; however, we may +- * end up dumping a lot of lines, for example, if called from +- * console registration path, and should invoke cond_resched() +- * between lines if allowable. Not doing so can cause a very long +- * scheduling stall on a slow console leading to RCU stall and +- * softlockup warnings which exacerbate the issue with more +- * messages practically incapacitating the system. +- * +- * console_trylock() is not able to detect the preemptive +- * context reliably. Therefore the value must be stored before +- * and cleared after the "again" goto label. +- */ +- do_cond_resched = console_may_schedule; +-again: +- console_may_schedule = 0; +- +- /* +- * We released the console_sem lock, so we need to recheck if +- * cpu is online and (if not) is there at least one CON_ANYTIME +- * console. +- */ +- if (!can_use_console()) { +- console_locked = 0; +- up_console_sem(); +- return; +- } +- +- for (;;) { +- size_t ext_len = 0; +- size_t len; +- +- printk_safe_enter_irqsave(flags); +- raw_spin_lock(&logbuf_lock); +-skip: +- if (!prb_read_valid(prb, console_seq, &r)) +- break; +- +- if (console_seq != r.info->seq) { +- console_dropped += r.info->seq - console_seq; +- console_seq = r.info->seq; +- } +- +- if (suppress_message_printing(r.info->level)) { +- /* +- * Skip record we have buffered and already printed +- * directly to the console when we received it, and +- * record that has level above the console loglevel. +- */ +- console_seq++; +- goto skip; +- } - - /* Output to all consoles once old messages replayed. */ - if (unlikely(exclusive_console && @@ -14343,19 +12945,23 @@ index 9b75f6bfc333..78a277ea5c35 100644 - exclusive_console = NULL; - } - -- len += msg_print_text(msg, -- console_msg_format & MSG_FORMAT_SYSLOG, -- printk_time, text + len, sizeof(text) - len); +- /* +- * Handle extended console text first because later +- * record_print_text() will modify the record buffer in-place. +- */ - if (nr_ext_console_drivers) { -- ext_len = msg_print_ext_header(ext_text, +- ext_len = info_print_ext_header(ext_text, - sizeof(ext_text), -- msg, console_seq); +- r.info); - ext_len += msg_print_ext_body(ext_text + ext_len, - sizeof(ext_text) - ext_len, -- log_dict(msg), msg->dict_len, -- log_text(msg), msg->text_len); +- &r.text_buf[0], +- r.info->text_len, +- &r.info->dev_info); - } -- console_idx = log_next(console_idx); +- len = record_print_text(&r, +- console_msg_format & MSG_FORMAT_SYSLOG, +- printk_time); - console_seq++; - raw_spin_unlock(&logbuf_lock); - @@ -14395,7 +13001,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 - * flush, no worries. 
- */ - raw_spin_lock(&logbuf_lock); -- retry = console_seq != log_next_seq; +- retry = prb_read_valid(prb, console_seq, NULL); - raw_spin_unlock(&logbuf_lock); - printk_safe_exit_irqrestore(flags); - @@ -14404,7 +13010,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 } EXPORT_SYMBOL(console_unlock); -@@ -2580,24 +2404,20 @@ void console_unblank(void) +@@ -2649,23 +2619,20 @@ void console_unblank(void) */ void console_flush_on_panic(enum con_flush_mode mode) { @@ -14428,8 +13034,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 - unsigned long flags; - - logbuf_lock_irqsave(flags); -- console_seq = log_first_seq; -- console_idx = log_first_idx; +- console_seq = prb_first_valid_seq(prb); - logbuf_unlock_irqrestore(flags); + seq = prb_first_valid_seq(prb); + for_each_console(c) @@ -14439,16 +13044,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 console_unlock(); } -@@ -2711,6 +2531,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) - return -ENOENT; - } - -+static void console_try_thread(struct console *con); -+ - /* - * The console driver calls this routine during kernel initialization - * to register the console printing procedure with printk() and to -@@ -2732,7 +2554,6 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) +@@ -2800,7 +2767,6 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) */ void register_console(struct console *newcon) { @@ -14456,7 +13052,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 struct console *bcon = NULL; int err; -@@ -2756,6 +2577,8 @@ void register_console(struct console *newcon) +@@ -2824,6 +2790,8 @@ void register_console(struct console *newcon) } } @@ -14465,7 +13061,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; -@@ -2797,8 +2620,10 @@ void register_console(struct console *newcon) +@@ -2865,8 +2833,10 @@ void register_console(struct console *newcon) * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ @@ -14477,7 +13073,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 /* * Put this console in the list - keep the -@@ -2820,27 +2645,12 @@ void register_console(struct console *newcon) +@@ -2888,26 +2858,12 @@ void register_console(struct console *newcon) if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; @@ -14499,7 +13095,6 @@ index 9b75f6bfc333..78a277ea5c35 100644 - exclusive_console = newcon; - exclusive_console_stop_seq = console_seq; - console_seq = syslog_seq; -- console_idx = syslog_idx; - logbuf_unlock_irqrestore(flags); - } + if (newcon->flags & CON_PRINTBUFFER) @@ -14511,7 +13106,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 console_unlock(); console_sysfs_notify(); -@@ -2914,6 +2724,9 @@ int unregister_console(struct console *console) +@@ -2981,6 +2937,9 @@ int unregister_console(struct console *console) console_unlock(); console_sysfs_notify(); @@ -14521,176 +13116,23 @@ index 9b75f6bfc333..78a277ea5c35 100644 if (console->exit) res = console->exit(console); -@@ -2957,6 +2770,154 @@ void __init console_init(void) +@@ -3063,6 +3022,15 @@ static int __init printk_late_init(void) + unregister_console(con); + } } - } - -+static int printk_kthread_func(void *data) -+{ -+ struct console *con = data; -+ unsigned long dropped = 0; -+ struct printk_info info; -+ struct printk_record r; -+ char *ext_text = NULL; -+ size_t dropped_len; -+ char *dropped_text; -+ int ret = -ENOMEM; -+ char *write_text; -+ u64 printk_seq; -+ size_t len; -+ 
char *text; -+ int error; -+ u64 seq; -+ -+ if (con->flags & CON_EXTENDED) { -+ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); -+ if (!ext_text) -+ return ret; -+ } -+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); -+ dropped_text = kmalloc(64, GFP_KERNEL); -+ if (!text || !dropped_text) -+ goto out; -+ -+ if (con->flags & CON_EXTENDED) -+ write_text = ext_text; -+ else -+ write_text = text; -+ -+ seq = atomic64_read(&con->printk_seq); + -+ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); -+ -+ for (;;) { -+ error = wait_event_interruptible(log_wait, -+ prb_read_valid(prb, seq, &r) || kthread_should_stop()); -+ -+ if (kthread_should_stop()) -+ break; -+ -+ if (error) -+ continue; -+ -+ if (seq != r.info->seq) { -+ dropped += r.info->seq - seq; -+ seq = r.info->seq; -+ } -+ -+ seq++; -+ -+ if (!(con->flags & CON_ENABLED)) -+ continue; -+ -+ if (suppress_message_printing(r.info->level)) -+ continue; -+ -+ if (con->flags & CON_EXTENDED) { -+ len = info_print_ext_header(ext_text, -+ CONSOLE_EXT_LOG_MAX, -+ r.info); -+ len += msg_print_ext_body(ext_text + len, -+ CONSOLE_EXT_LOG_MAX - len, -+ &r.text_buf[0], r.info->text_len, -+ &r.info->dev_info); -+ } else { -+ len = record_print_text(&r, -+ console_msg_format & MSG_FORMAT_SYSLOG, -+ printk_time); -+ } -+ -+ printk_seq = atomic64_read(&con->printk_seq); -+ -+ console_lock(); -+ console_may_schedule = 0; -+ -+ if (kernel_sync_mode() && con->write_atomic) { -+ console_unlock(); -+ break; -+ } -+ -+ if (!(con->flags & CON_EXTENDED) && dropped) { -+ dropped_len = snprintf(dropped_text, 64, -+ "** %lu printk messages dropped **\n", -+ dropped); -+ dropped = 0; -+ -+ con->write(con, dropped_text, dropped_len); -+ printk_delay(r.info->level); -+ } -+ -+ con->write(con, write_text, len); -+ if (len) -+ printk_delay(r.info->level); -+ -+ atomic64_cmpxchg_relaxed(&con->printk_seq, printk_seq, seq); -+ -+ console_unlock(); -+ } -+out: -+ kfree(dropped_text); -+ kfree(text); -+ kfree(ext_text); -+ pr_info("%sconsole [%s%d]: printing thread stopped\n", -+ (con->flags & CON_BOOT) ? "boot" : "" , -+ con->name, con->index); -+ return ret; -+} -+ -+static void start_printk_kthread(struct console *con) -+{ -+ con->thread = kthread_run(printk_kthread_func, con, -+ "pr/%s%d", con->name, con->index); -+ if (IS_ERR(con->thread)) { -+ pr_err("%sconsole [%s%d]: unable to start printing thread\n", -+ (con->flags & CON_BOOT) ? "boot" : "" , -+ con->name, con->index); -+ return; -+ } -+ pr_info("%sconsole [%s%d]: printing thread started\n", -+ (con->flags & CON_BOOT) ? "boot" : "" , -+ con->name, con->index); -+} -+ -+static bool kthreads_started; -+ -+static void console_try_thread(struct console *con) -+{ -+ unsigned long irqflags; -+ int sprint_id; -+ char *buf; -+ -+ if (kthreads_started) { -+ start_printk_kthread(con); -+ return; -+ } -+ -+ buf = get_sprint_buf(&sprint_id, &irqflags); -+ if (!buf) -+ return; -+ -+ print_sync_until(prb_next_seq(prb), con, buf, PREFIX_MAX + LOG_LINE_MAX); -+ -+ put_sprint_buf(sprint_id, irqflags); -+} -+ - /* - * Some boot consoles access data that is in the init section and which will - * be discarded after the initcalls have been run. 
To make sure that no code -@@ -2996,6 +2957,13 @@ static int __init printk_late_init(void) - unregister_console(con); - } - } -+ -+ console_lock(); -+ for_each_console(con) -+ start_printk_kthread(con); -+ kthreads_started = true; -+ console_unlock(); ++#ifdef CONFIG_PRINTK ++ console_lock(); ++ for_each_console(con) ++ start_printk_kthread(con); ++ kthreads_started = true; ++ console_unlock(); ++#endif + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); -@@ -3011,7 +2979,6 @@ late_initcall(printk_late_init); +@@ -3078,7 +3046,6 @@ late_initcall(printk_late_init); * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_PENDING_WAKEUP 0x01 @@ -14698,7 +13140,7 @@ index 9b75f6bfc333..78a277ea5c35 100644 static DEFINE_PER_CPU(int, printk_pending); -@@ -3019,12 +2986,6 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) +@@ -3086,14 +3053,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = __this_cpu_xchg(printk_pending, 0); @@ -14709,16 +13151,17 @@ index 9b75f6bfc333..78a277ea5c35 100644 - } - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); +- wake_up_interruptible(&log_wait); ++ wake_up_interruptible_all(&log_wait); } -@@ -3047,25 +3008,10 @@ void wake_up_klogd(void) + + static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = +@@ -3112,25 +3073,10 @@ void wake_up_klogd(void) preempt_enable(); } -void defer_console_output(void) -+__printf(1, 0) -+static int vprintk_deferred(const char *fmt, va_list args) - { +-{ - if (!printk_percpu_data_ready()) - return; - @@ -14729,10 +13172,12 @@ index 9b75f6bfc333..78a277ea5c35 100644 -} - -int vprintk_deferred(const char *fmt, va_list args) --{ ++__printf(1, 0) ++static int vprintk_deferred(const char *fmt, va_list args) + { - int r; - -- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); +- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); - defer_console_output(); - - return r; @@ -14740,76 +13185,11 @@ index 9b75f6bfc333..78a277ea5c35 100644 } int printk_deferred(const char *fmt, ...) -@@ -3194,6 +3140,66 @@ const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) - } - EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); - -+/** -+ * pr_flush() - Wait for printing threads to catch up. -+ * -+ * @timeout_ms: The maximum time (in ms) to wait. -+ * @reset_on_progress: Reset the timeout if forward progress is seen. -+ * -+ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 -+ * represents infinite waiting. -+ * -+ * If @reset_on_progress is true, the timeout will be reset whenever any -+ * printer has been seen to make some forward progress. -+ * -+ * Context: Any context if @timeout_ms is 0. Otherwise process context and -+ * may sleep if a printer is not caught up. -+ * Return: true if all enabled printers are caught up. 
-+ */ -+static bool pr_flush(int timeout_ms, bool reset_on_progress) -+{ -+ int remaining = timeout_ms; -+ struct console *con; -+ u64 last_diff = 0; -+ u64 printk_seq; -+ u64 diff; -+ u64 seq; -+ -+ seq = prb_next_seq(prb); -+ -+ for (;;) { -+ diff = 0; -+ -+ for_each_console(con) { -+ if (!(con->flags & CON_ENABLED)) -+ continue; -+ printk_seq = atomic64_read(&con->printk_seq); -+ if (printk_seq < seq) -+ diff += seq - printk_seq; -+ } -+ -+ if (diff != last_diff && reset_on_progress) -+ remaining = timeout_ms; -+ -+ if (!diff || remaining == 0) -+ break; -+ -+ if (remaining < 0) { -+ msleep(100); -+ } else if (remaining < 100) { -+ msleep(remaining); -+ remaining = 0; -+ } else { -+ msleep(100); -+ remaining -= 100; -+ } -+ -+ last_diff = diff; -+ } -+ -+ return (diff == 0); -+} -+ - /** - * kmsg_dump - dump kernel log to kernel message dumpers. - * @reason: the reason (oops, panic etc) for dumping -@@ -3205,7 +3211,24 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); +@@ -3269,8 +3215,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); + */ void kmsg_dump(enum kmsg_dump_reason reason) { ++ struct kmsg_dumper_iter iter; struct kmsg_dumper *dumper; - unsigned long flags; + @@ -14825,80 +13205,112 @@ index 9b75f6bfc333..78a277ea5c35 100644 + } + + /* -+ * Give the printing threads time to flush, allowing up to 1 -+ * second of no printing forward progress before giving up. ++ * Give the printing threads time to flush, allowing up to ++ * 1s of no printing forward progress before giving up. + */ + pr_flush(1000, true); + } rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { -@@ -3225,12 +3248,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) - /* initialize iterator with data about the stored records */ - dumper->active = true; +@@ -3288,25 +3252,18 @@ void kmsg_dump(enum kmsg_dump_reason reason) + continue; + /* initialize iterator with data about the stored records */ +- dumper->active = true; +- - logbuf_lock_irqsave(flags); - dumper->cur_seq = clear_seq; -- dumper->cur_idx = clear_idx; -- dumper->next_seq = log_next_seq; -- dumper->next_idx = log_next_idx; +- dumper->next_seq = prb_next_seq(prb); - logbuf_unlock_irqrestore(flags); -+ kmsg_dump_rewind_nolock(dumper); ++ iter.active = true; ++ kmsg_dump_rewind(&iter); /* invoke dumper which will iterate over records */ - dumper->dump(dumper, reason); -@@ -3263,28 +3281,33 @@ void kmsg_dump(enum kmsg_dump_reason reason) - bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) +- dumper->dump(dumper, reason); +- +- /* reset iterator */ +- dumper->active = false; ++ dumper->dump(dumper, reason, &iter); + } + rcu_read_unlock(); + } + + /** +- * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) +- * @dumper: registered kmsg dumper ++ * kmsg_dump_get_line - retrieve one kmsg log line ++ * @iter: kmsg dumper iterator + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer +@@ -3320,11 +3277,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) + * + * A return value of FALSE indicates that there are no more records to + * read. +- * +- * The function is similar to kmsg_dump_get_line(), but grabs no locks. 
+ */ +-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, +- char *line, size_t size, size_t *len) ++bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, ++ char *line, size_t size, size_t *len) { -- struct printk_log *msg; -+ struct printk_info info; -+ unsigned int line_count; -+ struct printk_record r; - size_t l = 0; - bool ret = false; + struct printk_info info; + unsigned int line_count; +@@ -3334,16 +3289,16 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, -+ prb_rec_init_rd(&r, &info, line, size); -+ - if (!dumper->active) + prb_rec_init_rd(&r, &info, line, size); + +- if (!dumper->active) ++ if (!iter->active) goto out; -- if (dumper->cur_seq < log_first_seq) { -- /* messages are gone, move to first available one */ -- dumper->cur_seq = log_first_seq; -- dumper->cur_idx = log_first_idx; -- } -- -- /* last entry */ -- if (dumper->cur_seq >= log_next_seq) -- goto out; -+ /* Read text or count text lines? */ -+ if (line) { -+ if (!prb_read_valid(prb, dumper->cur_seq, &r)) -+ goto out; -+ l = record_print_text(&r, syslog, printk_time); -+ } else { -+ if (!prb_read_valid_info(prb, dumper->cur_seq, -+ &info, &line_count)) { -+ goto out; -+ } -+ l = get_record_print_text_size(&info, line_count, syslog, -+ printk_time); + /* Read text or count text lines? */ + if (line) { +- if (!prb_read_valid(prb, dumper->cur_seq, &r)) ++ if (!prb_read_valid(prb, iter->cur_seq, &r)) + goto out; + l = record_print_text(&r, syslog, printk_time); + } else { +- if (!prb_read_valid_info(prb, dumper->cur_seq, ++ if (!prb_read_valid_info(prb, iter->cur_seq, + &info, &line_count)) { + goto out; + } +@@ -3352,48 +3307,18 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, -- msg = log_from_idx(dumper->cur_idx); -- l = msg_print_text(msg, syslog, printk_time, line, size); -+ } + } -- dumper->cur_idx = log_next(dumper->cur_idx); -- dumper->cur_seq++; -+ dumper->cur_seq = r.info->seq + 1; +- dumper->cur_seq = r.info->seq + 1; ++ iter->cur_seq = r.info->seq + 1; ret = true; out: if (len) -@@ -3312,14 +3335,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, - bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) - { + *len = l; + return ret; + } +- +-/** +- * kmsg_dump_get_line - retrieve one kmsg log line +- * @dumper: registered kmsg dumper +- * @syslog: include the "<4>" prefixes +- * @line: buffer to copy the line to +- * @size: maximum size of the buffer +- * @len: length of line placed into buffer +- * +- * Start at the beginning of the kmsg buffer, with the oldest kmsg +- * record, and copy one record into the provided buffer. +- * +- * Consecutive calls will return the next available record moving +- * towards the end of the buffer with the youngest messages. +- * +- * A return value of FALSE indicates that there are no more records to +- * read. 
+- */ +-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, +- char *line, size_t size, size_t *len) +-{ - unsigned long flags; - bool ret; - @@ -14907,125 +13319,118 @@ index 9b75f6bfc333..78a277ea5c35 100644 - logbuf_unlock_irqrestore(flags); - - return ret; -+ return kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); - } +-} EXPORT_SYMBOL_GPL(kmsg_dump_get_line); -@@ -3329,7 +3345,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + /** + * kmsg_dump_get_buffer - copy kmsg log lines +- * @dumper: registered kmsg dumper ++ * @iter: kmsg dumper iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer -- * @len: length of line placed into buffer -+ * @len_out: length of line placed into buffer - * - * Start at the end of the kmsg buffer and fill the provided buffer - * with as many of the the *youngest* kmsg records that fit into it. -@@ -3343,75 +3359,73 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); +@@ -3410,114 +3335,254 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + * A return value of FALSE indicates that there are no more records to * read. */ - bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, +-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, - char *buf, size_t size, size_t *len) ++bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + char *buf, size_t size, size_t *len_out) { + struct printk_info info; +- unsigned int line_count; + struct printk_record r; - unsigned long flags; -+ struct printk_info info; -+ unsigned int line_count; -+ struct printk_record r; u64 seq; -- u32 idx; u64 next_seq; -- u32 next_idx; - size_t l = 0; + size_t len = 0; bool ret = false; bool time = printk_time; -- if (!dumper->active) -+ if (!dumper->active || !buf || !size) +- prb_rec_init_rd(&r, &info, buf, size); +- +- if (!dumper->active || !buf || !size) ++ if (!iter->active || !buf || !size) goto out; - logbuf_lock_irqsave(flags); -- if (dumper->cur_seq < log_first_seq) { -+ if (dumper->cur_seq < prb_first_valid_seq(prb)) { - /* messages are gone, move to first available one */ -- dumper->cur_seq = log_first_seq; -- dumper->cur_idx = log_first_idx; -+ dumper->cur_seq = prb_first_valid_seq(prb); +- if (dumper->cur_seq < prb_first_valid_seq(prb)) { +- /* messages are gone, move to first available one */ +- dumper->cur_seq = prb_first_valid_seq(prb); ++ if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { ++ if (info.seq != iter->cur_seq) { ++ /* messages are gone, move to first available one */ ++ iter->cur_seq = info.seq; ++ } } /* last entry */ - if (dumper->cur_seq >= dumper->next_seq) { - logbuf_unlock_irqrestore(flags); -+ if (dumper->cur_seq >= dumper->next_seq) ++ if (iter->cur_seq >= iter->next_seq) goto out; - } - /* calculate length of entire buffer */ - seq = dumper->cur_seq; -- idx = dumper->cur_idx; -- while (seq < dumper->next_seq) { -- struct printk_log *msg = log_from_idx(idx); -+ /* -+ * Find first record that fits, including all following records, -+ * into the user-provided buffer for this dump. 
-+ */ - -- l += msg_print_text(msg, true, time, NULL, 0); -- idx = log_next(idx); -- seq++; -+ prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) { -+ if (info.seq >= dumper->next_seq) -+ break; -+ len += get_record_print_text_size(&info, line_count, true, time); - } - +- while (prb_read_valid_info(prb, seq, &info, &line_count)) { +- if (r.info->seq >= dumper->next_seq) +- break; +- l += get_record_print_text_size(&info, line_count, syslog, time); +- seq = r.info->seq + 1; +- } +- - /* move first record forward until length fits into the buffer */ - seq = dumper->cur_seq; -- idx = dumper->cur_idx; -- while (l >= size && seq < dumper->next_seq) { -- struct printk_log *msg = log_from_idx(idx); -- -- l -= msg_print_text(msg, true, time, NULL, 0); -- idx = log_next(idx); -- seq++; +- while (l >= size && prb_read_valid_info(prb, seq, +- &info, &line_count)) { +- if (r.info->seq >= dumper->next_seq) +- break; +- l -= get_record_print_text_size(&info, line_count, syslog, time); +- seq = r.info->seq + 1; +- } + /* -+ * Move first record forward until length fits into the buffer. This -+ * is a best effort attempt. If @dumper->next_seq is reached because -+ * the ringbuffer is wrapping too fast, just start filling the buffer -+ * from there. ++ * Find first record that fits, including all following records, ++ * into the user-provided buffer for this dump. Pass in size-1 ++ * because this function (by way of record_print_text()) will ++ * not write more than size-1 bytes of text into @buf. + */ -+ prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) { -+ if (len <= size || info.seq >= dumper->next_seq) -+ break; -+ len -= get_record_print_text_size(&info, line_count, true, time); - } ++ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, ++ size - 1, syslog, time); - /* last message in next interation */ -+ /* Keep track of the last message for the next interation. */ ++ /* ++ * Next kmsg_dump_get_buffer() invocation will dump block of ++ * older records stored right before this one. ++ */ next_seq = seq; -- next_idx = idx; +- /* actually read text into the buffer now */ - l = 0; -- while (seq < dumper->next_seq) { -- struct printk_log *msg = log_from_idx(idx); +- while (prb_read_valid(prb, seq, &r)) { +- if (r.info->seq >= dumper->next_seq) +- break; + prb_rec_init_rd(&r, &info, buf, size); -- l += msg_print_text(msg, syslog, time, buf + l, size - l); -- idx = log_next(idx); -- seq++; +- l += record_print_text(&r, syslog, time); + len = 0; + prb_for_each_record(seq, prb, seq, &r) { -+ if (r.info->seq >= dumper->next_seq) ++ if (r.info->seq >= iter->next_seq) + break; -+ + +- /* adjust record to store to remaining buffer space */ +- prb_rec_init_rd(&r, &info, buf + l, size - l); + len += record_print_text(&r, syslog, time); -+ + +- seq = r.info->seq + 1; + /* Adjust record to store to remaining buffer space. 
*/ + prb_rec_init_rd(&r, &info, buf + len, size - len); } - dumper->next_seq = next_seq; -- dumper->next_idx = next_idx; +- dumper->next_seq = next_seq; ++ iter->next_seq = next_seq; ret = true; - logbuf_unlock_irqrestore(flags); out: @@ -15036,37 +13441,25 @@ index 9b75f6bfc333..78a277ea5c35 100644 return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); -@@ -3423,15 +3437,11 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); + + /** +- * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) +- * @dumper: registered kmsg dumper ++ * kmsg_dump_rewind - reset the iterator ++ * @iter: kmsg dumper iterator + * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. -- * -- * The function is similar to kmsg_dump_rewind(), but grabs no locks. - */ - void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) - { -- dumper->cur_seq = clear_seq; -- dumper->cur_idx = clear_idx; -- dumper->next_seq = log_next_seq; -- dumper->next_idx = log_next_idx; -+ dumper->cur_seq = atomic64_read(&clear_seq); -+ dumper->next_seq = prb_next_seq(prb); - } - - /** -@@ -3444,12 +3454,108 @@ void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) - */ - void kmsg_dump_rewind(struct kmsg_dumper *dumper) - { -- unsigned long flags; -- -- logbuf_lock_irqsave(flags); - kmsg_dump_rewind_nolock(dumper); -- logbuf_unlock_irqrestore(flags); - } - EXPORT_SYMBOL_GPL(kmsg_dump_rewind); - - #endif ++ */ ++void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) ++{ ++ iter->cur_seq = latched_seq_read_nolock(&clear_seq); ++ iter->next_seq = prb_next_seq(prb); ++} ++EXPORT_SYMBOL_GPL(kmsg_dump_rewind); ++ ++#endif + +struct prb_cpulock { + atomic_t owner; @@ -15111,7 +13504,8 @@ index 9b75f6bfc333..78a277ea5c35 100644 + * prb_lock: Perform a processor-reentrant spin lock. + * @cpu_lock: A pointer to the lock object. + * @cpu_store: A "flags" pointer to store lock status information. -+ * + * +- * The function is similar to kmsg_dump_rewind(), but grabs no locks. + * If no processor has the lock, the calling processor takes the lock and + * becomes the owner. If the calling processor is already the owner of the + * lock, this function succeeds immediately. If lock is locked by another @@ -15119,30 +13513,44 @@ index 9b75f6bfc333..78a277ea5c35 100644 + * owner. + * + * It is safe to call this function from any context and state. -+ */ + */ +-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) +static void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store) -+{ + { +- dumper->cur_seq = clear_seq; +- dumper->next_seq = prb_next_seq(prb); + for (;;) { + if (__prb_trylock(cpu_lock, cpu_store)) + break; + cpu_relax(); + } -+} -+ + } + +-/** +- * kmsg_dump_rewind - reset the iterator +- * @dumper: registered kmsg dumper +/* + * prb_unlock: Perform a processor-reentrant spin unlock. + * @cpu_lock: A pointer to the lock object. + * @cpu_store: A "flags" object storing lock status information. -+ * + * +- * Reset the dumper's iterator so that kmsg_dump_get_line() and +- * kmsg_dump_get_buffer() can be called again and used multiple +- * times within the same dumper.dump() callback. + * Release the lock. The calling processor must be the owner of the lock. + * + * It is safe to call this function from any context and state. 
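
A hedged sketch of the buffer-oriented path, again assuming the iterator-based ->dump() prototype shown earlier; example_oops_buf and example_commit_to_storage() are illustrative names only. kmsg_dump_get_buffer() copies as many of the youngest records as fit into the buffer, and kmsg_dump_rewind() lets the same callback walk the records again within one invocation.

    #include <linux/kmsg_dump.h>

    #define EXAMPLE_OOPS_BUF_SZ 4096
    static char example_oops_buf[EXAMPLE_OOPS_BUF_SZ];

    /* Hypothetical: write the snapshot to an oops partition. */
    static void example_commit_to_storage(const char *buf, size_t len)
    {
    }

    static void example_oops_dump(struct kmsg_dumper *dumper,
                                  enum kmsg_dump_reason reason,
                                  struct kmsg_dumper_iter *iter)
    {
            size_t len;

            /* Youngest records that fit; ordered oldest-to-newest inside the buffer. */
            if (kmsg_dump_get_buffer(iter, true, example_oops_buf,
                                     EXAMPLE_OOPS_BUF_SZ, &len))
                    example_commit_to_storage(example_oops_buf, len);

            /* The iterator may be rewound for a second pass in the same call. */
            kmsg_dump_rewind(iter);
    }
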
-+ */ + */ +-void kmsg_dump_rewind(struct kmsg_dumper *dumper) +static void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store) -+{ + { +- unsigned long flags; + unsigned long *flags; + unsigned int cpu; -+ + +- logbuf_lock_irqsave(flags); +- kmsg_dump_rewind_nolock(dumper); +- logbuf_unlock_irqrestore(flags); + cpu = atomic_read(&cpu_lock->owner); + atomic_set_release(&cpu_lock->owner, cpu_store); + @@ -15152,8 +13560,10 @@ index 9b75f6bfc333..78a277ea5c35 100644 + } + + put_cpu(); -+} -+ + } +-EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +-#endif +DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock); + +void console_atomic_lock(unsigned int *flags) @@ -15167,2489 +13577,82 @@ index 9b75f6bfc333..78a277ea5c35 100644 + prb_unlock(&printk_cpulock, flags); +} +EXPORT_SYMBOL(console_atomic_unlock); -diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c -new file mode 100644 -index 000000000000..24a960a89aa8 ---- /dev/null -+++ b/kernel/printk/printk_ringbuffer.c -@@ -0,0 +1,2086 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include <linux/kernel.h> -+#include <linux/irqflags.h> -+#include <linux/string.h> -+#include <linux/errno.h> -+#include <linux/bug.h> -+#include "printk_ringbuffer.h" ++ ++static void pr_msleep(bool may_sleep, int ms) ++{ ++ if (may_sleep) { ++ msleep(ms); ++ } else { ++ while (ms--) ++ udelay(1000); ++ } ++} + +/** -+ * DOC: printk_ringbuffer overview -+ * -+ * Data Structure -+ * -------------- -+ * The printk_ringbuffer is made up of 3 internal ringbuffers: -+ * -+ * desc_ring -+ * A ring of descriptors and their meta data (such as sequence number, -+ * timestamp, loglevel, etc.) as well as internal state information about -+ * the record and logical positions specifying where in the other -+ * ringbuffer the text strings are located. -+ * -+ * text_data_ring -+ * A ring of data blocks. A data block consists of an unsigned long -+ * integer (ID) that maps to a desc_ring index followed by the text -+ * string of the record. -+ * -+ * The internal state information of a descriptor is the key element to allow -+ * readers and writers to locklessly synchronize access to the data. -+ * -+ * Implementation -+ * -------------- -+ * -+ * Descriptor Ring -+ * ~~~~~~~~~~~~~~~ -+ * The descriptor ring is an array of descriptors. A descriptor contains -+ * essential meta data to track the data of a printk record using -+ * blk_lpos structs pointing to associated text data blocks (see -+ * "Data Rings" below). Each descriptor is assigned an ID that maps -+ * directly to index values of the descriptor array and has a state. The ID -+ * and the state are bitwise combined into a single descriptor field named -+ * @state_var, allowing ID and state to be synchronously and atomically -+ * updated. -+ * -+ * Descriptors have four states: -+ * -+ * reserved -+ * A writer is modifying the record. -+ * -+ * committed -+ * The record and all its data are written. A writer can reopen the -+ * descriptor (transitioning it back to reserved), but in the committed -+ * state the data is consistent. -+ * -+ * finalized -+ * The record and all its data are complete and available for reading. A -+ * writer cannot reopen the descriptor. -+ * -+ * reusable -+ * The record exists, but its text and/or meta data may no longer be -+ * available. -+ * -+ * Querying the @state_var of a record requires providing the ID of the -+ * descriptor to query. This can yield a possible fifth (pseudo) state: -+ * -+ * miss -+ * The descriptor being queried has an unexpected ID. 
-+ * -+ * The descriptor ring has a @tail_id that contains the ID of the oldest -+ * descriptor and @head_id that contains the ID of the newest descriptor. -+ * -+ * When a new descriptor should be created (and the ring is full), the tail -+ * descriptor is invalidated by first transitioning to the reusable state and -+ * then invalidating all tail data blocks up to and including the data blocks -+ * associated with the tail descriptor (for the text ring). Then -+ * @tail_id is advanced, followed by advancing @head_id. And finally the -+ * @state_var of the new descriptor is initialized to the new ID and reserved -+ * state. -+ * -+ * The @tail_id can only be advanced if the new @tail_id would be in the -+ * committed or reusable queried state. This makes it possible that a valid -+ * sequence number of the tail is always available. -+ * -+ * Descriptor Finalization -+ * ~~~~~~~~~~~~~~~~~~~~~~~ -+ * When a writer calls the commit function prb_commit(), record data is -+ * fully stored and is consistent within the ringbuffer. However, a writer can -+ * reopen that record, claiming exclusive access (as with prb_reserve()), and -+ * modify that record. When finished, the writer must again commit the record. -+ * -+ * In order for a record to be made available to readers (and also become -+ * recyclable for writers), it must be finalized. A finalized record cannot be -+ * reopened and can never become "unfinalized". Record finalization can occur -+ * in three different scenarios: -+ * -+ * 1) A writer can simultaneously commit and finalize its record by calling -+ * prb_final_commit() instead of prb_commit(). -+ * -+ * 2) When a new record is reserved and the previous record has been -+ * committed via prb_commit(), that previous record is automatically -+ * finalized. -+ * -+ * 3) When a record is committed via prb_commit() and a newer record -+ * already exists, the record being committed is automatically finalized. -+ * -+ * Data Ring -+ * ~~~~~~~~~ -+ * The text data ring is a byte array composed of data blocks. Data blocks are -+ * referenced by blk_lpos structs that point to the logical position of the -+ * beginning of a data block and the beginning of the next adjacent data -+ * block. Logical positions are mapped directly to index values of the byte -+ * array ringbuffer. -+ * -+ * Each data block consists of an ID followed by the writer data. The ID is -+ * the identifier of a descriptor that is associated with the data block. A -+ * given data block is considered valid if all of the following conditions -+ * are met: -+ * -+ * 1) The descriptor associated with the data block is in the committed -+ * or finalized queried state. -+ * -+ * 2) The blk_lpos struct within the descriptor associated with the data -+ * block references back to the same data block. -+ * -+ * 3) The data block is within the head/tail logical position range. -+ * -+ * If the writer data of a data block would extend beyond the end of the -+ * byte array, only the ID of the data block is stored at the logical -+ * position and the full data block (ID and writer data) is stored at the -+ * beginning of the byte array. The referencing blk_lpos will point to the -+ * ID before the wrap and the next data block will be at the logical -+ * position adjacent the full data block after the wrap. -+ * -+ * Data rings have a @tail_lpos that points to the beginning of the oldest -+ * data block and a @head_lpos that points to the logical position of the -+ * next (not yet existing) data block. 
-+ * -+ * When a new data block should be created (and the ring is full), tail data -+ * blocks will first be invalidated by putting their associated descriptors -+ * into the reusable state and then pushing the @tail_lpos forward beyond -+ * them. Then the @head_lpos is pushed forward and is associated with a new -+ * descriptor. If a data block is not valid, the @tail_lpos cannot be -+ * advanced beyond it. -+ * -+ * Info Array -+ * ~~~~~~~~~~ -+ * The general meta data of printk records are stored in printk_info structs, -+ * stored in an array with the same number of elements as the descriptor ring. -+ * Each info corresponds to the descriptor of the same index in the -+ * descriptor ring. Info validity is confirmed by evaluating the corresponding -+ * descriptor before and after loading the info. -+ * -+ * Usage -+ * ----- -+ * Here are some simple examples demonstrating writers and readers. For the -+ * examples a global ringbuffer (test_rb) is available (which is not the -+ * actual ringbuffer used by printk):: -+ * -+ * DEFINE_PRINTKRB(test_rb, 15, 5); -+ * -+ * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of -+ * 1 MiB (2 ^ (15 + 5)) for text data. -+ * -+ * Sample writer code:: -+ * -+ * const char *textstr = "message text"; -+ * struct prb_reserved_entry e; -+ * struct printk_record r; -+ * -+ * // specify how much to allocate -+ * prb_rec_init_wr(&r, strlen(textstr) + 1); -+ * -+ * if (prb_reserve(&e, &test_rb, &r)) { -+ * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); -+ * -+ * r.info->text_len = strlen(textstr); -+ * r.info->ts_nsec = local_clock(); -+ * r.info->caller_id = printk_caller_id(); -+ * -+ * // commit and finalize the record -+ * prb_final_commit(&e); -+ * } -+ * -+ * Note that additional writer functions are available to extend a record -+ * after it has been committed but not yet finalized. This can be done as -+ * long as no new records have been reserved and the caller is the same. -+ * -+ * Sample writer code (record extending):: -+ * -+ * // alternate rest of previous example -+ * -+ * r.info->text_len = strlen(textstr); -+ * r.info->ts_nsec = local_clock(); -+ * r.info->caller_id = printk_caller_id(); -+ * -+ * // commit the record (but do not finalize yet) -+ * prb_commit(&e); -+ * } -+ * -+ * ... -+ * -+ * // specify additional 5 bytes text space to extend -+ * prb_rec_init_wr(&r, 5); -+ * -+ * // try to extend, but only if it does not exceed 32 bytes -+ * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id()), 32) { -+ * snprintf(&r.text_buf[r.info->text_len], -+ * r.text_buf_size - r.info->text_len, "hello"); -+ * -+ * r.info->text_len += 5; -+ * -+ * // commit and finalize the record -+ * prb_final_commit(&e); -+ * } -+ * -+ * Sample reader code:: -+ * -+ * struct printk_info info; -+ * struct printk_record r; -+ * char text_buf[32]; -+ * u64 seq; -+ * -+ * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); -+ * -+ * prb_for_each_record(0, &test_rb, &seq, &r) { -+ * if (info.seq != seq) -+ * pr_warn("lost %llu records\n", info.seq - seq); -+ * -+ * if (info.text_len > r.text_buf_size) { -+ * pr_warn("record %llu text truncated\n", info.seq); -+ * text_buf[r.text_buf_size - 1] = 0; -+ * } -+ * -+ * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec, -+ * &text_buf[0]); -+ * } -+ * -+ * Note that additional less convenient reader functions are available to -+ * allow complex record access. 
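
To complement the convenience samples above, here is a sketch of the lower-level reader calls used elsewhere in this patch (prb_read_valid_info() to inspect meta data, prb_read_valid() to copy text), run against the overview's test_rb; the sequence number is an arbitrary example value and example_read_one() is a hypothetical helper.

    static void example_read_one(void)
    {
            struct printk_info info;
            struct printk_record r;
            unsigned int line_count;
            char buf[128];
            u64 seq = 42;   /* arbitrary sequence number to look up */

            /* Meta data only: no text is copied for this query. */
            if (prb_read_valid_info(&test_rb, seq, &info, &line_count))
                    pr_info("record %llu: %u line(s), %hu text bytes\n",
                            info.seq, line_count, info.text_len);

            /* Copy the text; terminate it if truncated, as in the sample above. */
            prb_rec_init_rd(&r, &info, &buf[0], sizeof(buf));
            if (prb_read_valid(&test_rb, seq, &r)) {
                    if (info.text_len > r.text_buf_size)
                            buf[r.text_buf_size - 1] = 0;
                    pr_info("%llu: %s\n", info.seq, buf);
            }
    }
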
-+ * -+ * ABA Issues -+ * ~~~~~~~~~~ -+ * To help avoid ABA issues, descriptors are referenced by IDs (array index -+ * values combined with tagged bits counting array wraps) and data blocks are -+ * referenced by logical positions (array index values combined with tagged -+ * bits counting array wraps). However, on 32-bit systems the number of -+ * tagged bits is relatively small such that an ABA incident is (at least -+ * theoretically) possible. For example, if 4 million maximally sized (1KiB) -+ * printk messages were to occur in NMI context on a 32-bit system, the -+ * interrupted context would not be able to recognize that the 32-bit integer -+ * completely wrapped and thus represents a different data block than the one -+ * the interrupted context expects. -+ * -+ * To help combat this possibility, additional state checking is performed -+ * (such as using cmpxchg() even though set() would suffice). These extra -+ * checks are commented as such and will hopefully catch any ABA issue that -+ * a 32-bit system might experience. -+ * -+ * Memory Barriers -+ * ~~~~~~~~~~~~~~~ -+ * Multiple memory barriers are used. To simplify proving correctness and -+ * generating litmus tests, lines of code related to memory barriers -+ * (loads, stores, and the associated memory barriers) are labeled:: -+ * -+ * LMM(function:letter) -+ * -+ * Comments reference the labels using only the "function:letter" part. -+ * -+ * The memory barrier pairs and their ordering are: -+ * -+ * desc_reserve:D / desc_reserve:B -+ * push descriptor tail (id), then push descriptor head (id) -+ * -+ * desc_reserve:D / data_push_tail:B -+ * push data tail (lpos), then set new descriptor reserved (state) -+ * -+ * desc_reserve:D / desc_push_tail:C -+ * push descriptor tail (id), then set new descriptor reserved (state) -+ * -+ * desc_reserve:D / prb_first_seq:C -+ * push descriptor tail (id), then set new descriptor reserved (state) -+ * -+ * desc_reserve:F / desc_read:D -+ * set new descriptor id and reserved (state), then allow writer changes -+ * -+ * data_alloc:A (or data_realloc:A) / desc_read:D -+ * set old descriptor reusable (state), then modify new data block area -+ * -+ * data_alloc:A (or data_realloc:A) / data_push_tail:B -+ * push data tail (lpos), then modify new data block area -+ * -+ * _prb_commit:B / desc_read:B -+ * store writer changes, then set new descriptor committed (state) -+ * -+ * desc_reopen_last:A / _prb_commit:B -+ * set descriptor reserved (state), then read descriptor data -+ * -+ * _prb_commit:B / desc_reserve:D -+ * set new descriptor committed (state), then check descriptor head (id) ++ * pr_flush() - Wait for printing threads to catch up. + * -+ * data_push_tail:D / data_push_tail:A -+ * set descriptor reusable (state), then push data tail (lpos) ++ * @timeout_ms: The maximum time (in ms) to wait. ++ * @reset_on_progress: Reset the timeout if forward progress is seen. + * -+ * desc_push_tail:B / desc_reserve:D -+ * set descriptor reusable (state), then push descriptor tail (id) -+ */ -+ -+#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) -+#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) -+ -+#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) -+#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) -+ -+/* Determine the data array index from a logical position. */ -+#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) -+ -+/* Determine the desc array index from an ID or sequence number. 
*/ -+#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) -+ -+/* Determine how many times the data array has wrapped. */ -+#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) -+ -+/* Determine if a logical position refers to a data-less block. */ -+#define LPOS_DATALESS(lpos) ((lpos) & 1UL) -+#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ -+ LPOS_DATALESS((blk)->next)) -+ -+/* Get the logical position at index 0 of the current wrap. */ -+#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ -+((lpos) & ~DATA_SIZE_MASK(data_ring)) -+ -+/* Get the ID for the same index of the previous wrap as the given ID. */ -+#define DESC_ID_PREV_WRAP(desc_ring, id) \ -+DESC_ID((id) - DESCS_COUNT(desc_ring)) -+ -+/* -+ * A data block: mapped directly to the beginning of the data block area -+ * specified as a logical position within the data ring. ++ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 ++ * represents infinite waiting. + * -+ * @id: the ID of the associated descriptor -+ * @data: the writer data ++ * If @reset_on_progress is true, the timeout will be reset whenever any ++ * printer has been seen to make some forward progress. + * -+ * Note that the size of a data block is only known by its associated -+ * descriptor. -+ */ -+struct prb_data_block { -+ unsigned long id; -+ char data[0]; -+}; -+ -+/* -+ * Return the descriptor associated with @n. @n can be either a -+ * descriptor ID or a sequence number. -+ */ -+static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) -+{ -+ return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; -+} -+ -+/* -+ * Return the printk_info associated with @n. @n can be either a -+ * descriptor ID or a sequence number. -+ */ -+static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) -+{ -+ return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; -+} -+ -+static struct prb_data_block *to_block(struct prb_data_ring *data_ring, -+ unsigned long begin_lpos) -+{ -+ return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; -+} -+ -+/* -+ * Increase the data size to account for data block meta data plus any -+ * padding so that the adjacent data block is aligned on the ID size. ++ * Context: Any context. ++ * Return: true if all enabled printers are caught up. + */ -+static unsigned int to_blk_size(unsigned int size) ++bool pr_flush(int timeout_ms, bool reset_on_progress) +{ -+ struct prb_data_block *db = NULL; ++ int remaining = timeout_ms; ++ struct console *con; ++ u64 last_diff = 0; ++ bool may_sleep; ++ u64 printk_seq; ++ u64 diff; ++ u64 seq; + -+ size += sizeof(*db); -+ size = ALIGN(size, sizeof(db->id)); -+ return size; -+} ++ may_sleep = (preemptible() && !in_softirq()); + -+/* -+ * Sanity checker for reserve size. The ringbuffer code assumes that a data -+ * block does not exceed the maximum possible size that could fit within the -+ * ringbuffer. This function provides that basic size check so that the -+ * assumption is safe. -+ */ -+static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) -+{ -+ struct prb_data_block *db = NULL; ++ seq = prb_next_seq(prb); + -+ if (size == 0) -+ return true; ++ for (;;) { ++ diff = 0; + -+ /* -+ * Ensure the alignment padded size could possibly fit in the data -+ * array. The largest possible data block must still leave room for -+ * at least the ID of the next block. 
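
A worked example of this index arithmetic may help; it uses the test_rb geometry from the overview (15 descriptor-count bits plus 5 extra data bits) and arbitrary example values for the logical position and descriptor ID.

    /*
     * With DEFINE_PRINTKRB(test_rb, 15, 5):
     *
     *   DESCS_COUNT = 1 << 15        = 32768 descriptors
     *   DATA_SIZE   = 1 << (15 + 5)  = 1 MiB of text data
     *
     * For an example logical position lpos = 0x00300123:
     *
     *   DATA_INDEX(data_ring, lpos) = lpos & (DATA_SIZE - 1) = 0x00123
     *   DATA_WRAPS(data_ring, lpos) = lpos >> 20             = 3
     *
     * For an example descriptor id = 0x12345:
     *
     *   DESC_INDEX(desc_ring, id)   = id & (DESCS_COUNT - 1) = 0x2345
     */
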
-+ */ -+ size = to_blk_size(size); -+ if (size > DATA_SIZE(data_ring) - sizeof(db->id)) -+ return false; ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ printk_seq = atomic64_read(&con->printk_seq); ++ if (printk_seq < seq) ++ diff += seq - printk_seq; ++ } + -+ return true; -+} ++ if (diff != last_diff && reset_on_progress) ++ remaining = timeout_ms; + -+/* Query the state of a descriptor. */ -+static enum desc_state get_desc_state(unsigned long id, -+ unsigned long state_val) -+{ -+ if (id != DESC_ID(state_val)) -+ return desc_miss; ++ if (!diff || remaining == 0) ++ break; + -+ return DESC_STATE(state_val); -+} ++ if (remaining < 0) { ++ pr_msleep(may_sleep, 100); ++ } else if (remaining < 100) { ++ pr_msleep(may_sleep, remaining); ++ remaining = 0; ++ } else { ++ pr_msleep(may_sleep, 100); ++ remaining -= 100; ++ } + -+/* -+ * Get a copy of a specified descriptor and return its queried state. If the -+ * descriptor is in an inconsistent state (miss or reserved), the caller can -+ * only expect the descriptor's @state_var field to be valid. -+ * -+ * The sequence number and caller_id can be optionally retrieved. Like all -+ * non-state_var data, they are only valid if the descriptor is in a -+ * consistent state. -+ */ -+static enum desc_state desc_read(struct prb_desc_ring *desc_ring, -+ unsigned long id, struct prb_desc *desc_out, -+ u64 *seq_out, u32 *caller_id_out) -+{ -+ struct printk_info *info = to_info(desc_ring, id); -+ struct prb_desc *desc = to_desc(desc_ring, id); -+ atomic_long_t *state_var = &desc->state_var; -+ enum desc_state d_state; -+ unsigned long state_val; -+ -+ /* Check the descriptor state. */ -+ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ -+ d_state = get_desc_state(id, state_val); -+ if (d_state == desc_miss || d_state == desc_reserved) { -+ /* -+ * The descriptor is in an inconsistent state. Set at least -+ * @state_var so that the caller can see the details of -+ * the inconsistent state. -+ */ -+ goto out; ++ last_diff = diff; + } + -+ /* -+ * Guarantee the state is loaded before copying the descriptor -+ * content. This avoids copying obsolete descriptor content that might -+ * not apply to the descriptor state. This pairs with _prb_commit:B. -+ * -+ * Memory barrier involvement: -+ * -+ * If desc_read:A reads from _prb_commit:B, then desc_read:C reads -+ * from _prb_commit:A. -+ * -+ * Relies on: -+ * -+ * WMB from _prb_commit:A to _prb_commit:B -+ * matching -+ * RMB from desc_read:A to desc_read:C -+ */ -+ smp_rmb(); /* LMM(desc_read:B) */ -+ -+ /* -+ * Copy the descriptor data. The data is not valid until the -+ * state has been re-checked. A memcpy() for all of @desc -+ * cannot be used because of the atomic_t @state_var field. -+ */ -+ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, -+ sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ -+ if (seq_out) -+ *seq_out = info->seq; /* also part of desc_read:C */ -+ if (caller_id_out) -+ *caller_id_out = info->caller_id; /* also part of desc_read:C */ -+ -+ /* -+ * 1. Guarantee the descriptor content is loaded before re-checking -+ * the state. This avoids reading an obsolete descriptor state -+ * that may not apply to the copied content. This pairs with -+ * desc_reserve:F. -+ * -+ * Memory barrier involvement: -+ * -+ * If desc_read:C reads from desc_reserve:G, then desc_read:E -+ * reads from desc_reserve:F. 
-+ * -+ * Relies on: -+ * -+ * WMB from desc_reserve:F to desc_reserve:G -+ * matching -+ * RMB from desc_read:C to desc_read:E -+ * -+ * 2. Guarantee the record data is loaded before re-checking the -+ * state. This avoids reading an obsolete descriptor state that may -+ * not apply to the copied data. This pairs with data_alloc:A and -+ * data_realloc:A. -+ * -+ * Memory barrier involvement: -+ * -+ * If copy_data:A reads from data_alloc:B, then desc_read:E -+ * reads from desc_make_reusable:A. -+ * -+ * Relies on: -+ * -+ * MB from desc_make_reusable:A to data_alloc:B -+ * matching -+ * RMB from desc_read:C to desc_read:E -+ * -+ * Note: desc_make_reusable:A and data_alloc:B can be different -+ * CPUs. However, the data_alloc:B CPU (which performs the -+ * full memory barrier) must have previously seen -+ * desc_make_reusable:A. -+ */ -+ smp_rmb(); /* LMM(desc_read:D) */ -+ -+ /* -+ * The data has been copied. Return the current descriptor state, -+ * which may have changed since the load above. -+ */ -+ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ -+ d_state = get_desc_state(id, state_val); -+out: -+ atomic_long_set(&desc_out->state_var, state_val); -+ return d_state; -+} -+ -+/* -+ * Take a specified descriptor out of the finalized state by attempting -+ * the transition from finalized to reusable. Either this context or some -+ * other context will have been successful. -+ */ -+static void desc_make_reusable(struct prb_desc_ring *desc_ring, -+ unsigned long id) -+{ -+ unsigned long val_finalized = DESC_SV(id, desc_finalized); -+ unsigned long val_reusable = DESC_SV(id, desc_reusable); -+ struct prb_desc *desc = to_desc(desc_ring, id); -+ atomic_long_t *state_var = &desc->state_var; -+ -+ atomic_long_cmpxchg_relaxed(state_var, val_finalized, -+ val_reusable); /* LMM(desc_make_reusable:A) */ ++ return (diff == 0); +} -+ -+/* -+ * Given the text data ring, put the associated descriptor of each -+ * data block from @lpos_begin until @lpos_end into the reusable state. -+ * -+ * If there is any problem making the associated descriptor reusable, either -+ * the descriptor has not yet been finalized or another writer context has -+ * already pushed the tail lpos past the problematic data block. Regardless, -+ * on error the caller can re-load the tail lpos to determine the situation. -+ */ -+static bool data_make_reusable(struct printk_ringbuffer *rb, -+ struct prb_data_ring *data_ring, -+ unsigned long lpos_begin, -+ unsigned long lpos_end, -+ unsigned long *lpos_out) -+{ -+ struct prb_desc_ring *desc_ring = &rb->desc_ring; -+ struct prb_data_block *blk; -+ enum desc_state d_state; -+ struct prb_desc desc; -+ struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; -+ unsigned long id; -+ -+ /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ -+ while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { -+ blk = to_block(data_ring, lpos_begin); -+ -+ /* -+ * Load the block ID from the data block. This is a data race -+ * against a writer that may have newly reserved this data -+ * area. If the loaded value matches a valid descriptor ID, -+ * the blk_lpos of that descriptor will be checked to make -+ * sure it points back to this data block. If the check fails, -+ * the data area has been recycled by another writer. 
-+ */ -+ id = blk->id; /* LMM(data_make_reusable:A) */ -+ -+ d_state = desc_read(desc_ring, id, &desc, -+ NULL, NULL); /* LMM(data_make_reusable:B) */ -+ -+ switch (d_state) { -+ case desc_miss: -+ case desc_reserved: -+ case desc_committed: -+ return false; -+ case desc_finalized: -+ /* -+ * This data block is invalid if the descriptor -+ * does not point back to it. -+ */ -+ if (blk_lpos->begin != lpos_begin) -+ return false; -+ desc_make_reusable(desc_ring, id); -+ break; -+ case desc_reusable: -+ /* -+ * This data block is invalid if the descriptor -+ * does not point back to it. -+ */ -+ if (blk_lpos->begin != lpos_begin) -+ return false; -+ break; -+ } -+ -+ /* Advance @lpos_begin to the next data block. */ -+ lpos_begin = blk_lpos->next; -+ } -+ -+ *lpos_out = lpos_begin; -+ return true; -+} -+ -+/* -+ * Advance the data ring tail to at least @lpos. This function puts -+ * descriptors into the reusable state if the tail is pushed beyond -+ * their associated data block. -+ */ -+static bool data_push_tail(struct printk_ringbuffer *rb, -+ struct prb_data_ring *data_ring, -+ unsigned long lpos) -+{ -+ unsigned long tail_lpos_new; -+ unsigned long tail_lpos; -+ unsigned long next_lpos; -+ -+ /* If @lpos is from a data-less block, there is nothing to do. */ -+ if (LPOS_DATALESS(lpos)) -+ return true; -+ -+ /* -+ * Any descriptor states that have transitioned to reusable due to the -+ * data tail being pushed to this loaded value will be visible to this -+ * CPU. This pairs with data_push_tail:D. -+ * -+ * Memory barrier involvement: -+ * -+ * If data_push_tail:A reads from data_push_tail:D, then this CPU can -+ * see desc_make_reusable:A. -+ * -+ * Relies on: -+ * -+ * MB from desc_make_reusable:A to data_push_tail:D -+ * matches -+ * READFROM from data_push_tail:D to data_push_tail:A -+ * thus -+ * READFROM from desc_make_reusable:A to this CPU -+ */ -+ tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ -+ -+ /* -+ * Loop until the tail lpos is at or beyond @lpos. This condition -+ * may already be satisfied, resulting in no full memory barrier -+ * from data_push_tail:D being performed. However, since this CPU -+ * sees the new tail lpos, any descriptor states that transitioned to -+ * the reusable state must already be visible. -+ */ -+ while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { -+ /* -+ * Make all descriptors reusable that are associated with -+ * data blocks before @lpos. -+ */ -+ if (!data_make_reusable(rb, data_ring, tail_lpos, lpos, -+ &next_lpos)) { -+ /* -+ * 1. Guarantee the block ID loaded in -+ * data_make_reusable() is performed before -+ * reloading the tail lpos. The failed -+ * data_make_reusable() may be due to a newly -+ * recycled data area causing the tail lpos to -+ * have been previously pushed. This pairs with -+ * data_alloc:A and data_realloc:A. -+ * -+ * Memory barrier involvement: -+ * -+ * If data_make_reusable:A reads from data_alloc:B, -+ * then data_push_tail:C reads from -+ * data_push_tail:D. -+ * -+ * Relies on: -+ * -+ * MB from data_push_tail:D to data_alloc:B -+ * matching -+ * RMB from data_make_reusable:A to -+ * data_push_tail:C -+ * -+ * Note: data_push_tail:D and data_alloc:B can be -+ * different CPUs. However, the data_alloc:B -+ * CPU (which performs the full memory -+ * barrier) must have previously seen -+ * data_push_tail:D. -+ * -+ * 2. Guarantee the descriptor state loaded in -+ * data_make_reusable() is performed before -+ * reloading the tail lpos. 
The failed -+ * data_make_reusable() may be due to a newly -+ * recycled descriptor causing the tail lpos to -+ * have been previously pushed. This pairs with -+ * desc_reserve:D. -+ * -+ * Memory barrier involvement: -+ * -+ * If data_make_reusable:B reads from -+ * desc_reserve:F, then data_push_tail:C reads -+ * from data_push_tail:D. -+ * -+ * Relies on: -+ * -+ * MB from data_push_tail:D to desc_reserve:F -+ * matching -+ * RMB from data_make_reusable:B to -+ * data_push_tail:C -+ * -+ * Note: data_push_tail:D and desc_reserve:F can -+ * be different CPUs. However, the -+ * desc_reserve:F CPU (which performs the -+ * full memory barrier) must have previously -+ * seen data_push_tail:D. -+ */ -+ smp_rmb(); /* LMM(data_push_tail:B) */ -+ -+ tail_lpos_new = atomic_long_read(&data_ring->tail_lpos -+ ); /* LMM(data_push_tail:C) */ -+ if (tail_lpos_new == tail_lpos) -+ return false; -+ -+ /* Another CPU pushed the tail. Try again. */ -+ tail_lpos = tail_lpos_new; -+ continue; -+ } -+ -+ /* -+ * Guarantee any descriptor states that have transitioned to -+ * reusable are stored before pushing the tail lpos. A full -+ * memory barrier is needed since other CPUs may have made -+ * the descriptor states reusable. This pairs with -+ * data_push_tail:A. -+ */ -+ if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, -+ next_lpos)) { /* LMM(data_push_tail:D) */ -+ break; -+ } -+ } -+ -+ return true; -+} -+ -+/* -+ * Advance the desc ring tail. This function advances the tail by one -+ * descriptor, thus invalidating the oldest descriptor. Before advancing -+ * the tail, the tail descriptor is made reusable and all data blocks up to -+ * and including the descriptor's data block are invalidated (i.e. the data -+ * ring tail is pushed past the data block of the descriptor being made -+ * reusable). -+ */ -+static bool desc_push_tail(struct printk_ringbuffer *rb, -+ unsigned long tail_id) -+{ -+ struct prb_desc_ring *desc_ring = &rb->desc_ring; -+ enum desc_state d_state; -+ struct prb_desc desc; -+ -+ d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); -+ -+ switch (d_state) { -+ case desc_miss: -+ /* -+ * If the ID is exactly 1 wrap behind the expected, it is -+ * in the process of being reserved by another writer and -+ * must be considered reserved. -+ */ -+ if (DESC_ID(atomic_long_read(&desc.state_var)) == -+ DESC_ID_PREV_WRAP(desc_ring, tail_id)) { -+ return false; -+ } -+ -+ /* -+ * The ID has changed. Another writer must have pushed the -+ * tail and recycled the descriptor already. Success is -+ * returned because the caller is only interested in the -+ * specified tail being pushed, which it was. -+ */ -+ return true; -+ case desc_reserved: -+ case desc_committed: -+ return false; -+ case desc_finalized: -+ desc_make_reusable(desc_ring, tail_id); -+ break; -+ case desc_reusable: -+ break; -+ } -+ -+ /* -+ * Data blocks must be invalidated before their associated -+ * descriptor can be made available for recycling. Invalidating -+ * them later is not possible because there is no way to trust -+ * data blocks once their associated descriptor is gone. -+ */ -+ -+ if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next)) -+ return false; -+ -+ /* -+ * Check the next descriptor after @tail_id before pushing the tail -+ * to it because the tail must always be in a finalized or reusable -+ * state. The implementation of prb_first_seq() relies on this. 
-+ * -+ * A successful read implies that the next descriptor is less than or -+ * equal to @head_id so there is no risk of pushing the tail past the -+ * head. -+ */ -+ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, -+ NULL, NULL); /* LMM(desc_push_tail:A) */ -+ -+ if (d_state == desc_finalized || d_state == desc_reusable) { -+ /* -+ * Guarantee any descriptor states that have transitioned to -+ * reusable are stored before pushing the tail ID. This allows -+ * verifying the recycled descriptor state. A full memory -+ * barrier is needed since other CPUs may have made the -+ * descriptor states reusable. This pairs with desc_reserve:D. -+ */ -+ atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, -+ DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ -+ } else { -+ /* -+ * Guarantee the last state load from desc_read() is before -+ * reloading @tail_id in order to see a new tail ID in the -+ * case that the descriptor has been recycled. This pairs -+ * with desc_reserve:D. -+ * -+ * Memory barrier involvement: -+ * -+ * If desc_push_tail:A reads from desc_reserve:F, then -+ * desc_push_tail:D reads from desc_push_tail:B. -+ * -+ * Relies on: -+ * -+ * MB from desc_push_tail:B to desc_reserve:F -+ * matching -+ * RMB from desc_push_tail:A to desc_push_tail:D -+ * -+ * Note: desc_push_tail:B and desc_reserve:F can be different -+ * CPUs. However, the desc_reserve:F CPU (which performs -+ * the full memory barrier) must have previously seen -+ * desc_push_tail:B. -+ */ -+ smp_rmb(); /* LMM(desc_push_tail:C) */ -+ -+ /* -+ * Re-check the tail ID. The descriptor following @tail_id is -+ * not in an allowed tail state. But if the tail has since -+ * been moved by another CPU, then it does not matter. -+ */ -+ if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ -+ return false; -+ } -+ -+ return true; -+} -+ -+/* Reserve a new descriptor, invalidating the oldest if necessary. */ -+static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) -+{ -+ struct prb_desc_ring *desc_ring = &rb->desc_ring; -+ unsigned long prev_state_val; -+ unsigned long id_prev_wrap; -+ struct prb_desc *desc; -+ unsigned long head_id; -+ unsigned long id; -+ -+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ -+ -+ do { -+ desc = to_desc(desc_ring, head_id); -+ -+ id = DESC_ID(head_id + 1); -+ id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); -+ -+ /* -+ * Guarantee the head ID is read before reading the tail ID. -+ * Since the tail ID is updated before the head ID, this -+ * guarantees that @id_prev_wrap is never ahead of the tail -+ * ID. This pairs with desc_reserve:D. -+ * -+ * Memory barrier involvement: -+ * -+ * If desc_reserve:A reads from desc_reserve:D, then -+ * desc_reserve:C reads from desc_push_tail:B. -+ * -+ * Relies on: -+ * -+ * MB from desc_push_tail:B to desc_reserve:D -+ * matching -+ * RMB from desc_reserve:A to desc_reserve:C -+ * -+ * Note: desc_push_tail:B and desc_reserve:D can be different -+ * CPUs. However, the desc_reserve:D CPU (which performs -+ * the full memory barrier) must have previously seen -+ * desc_push_tail:B. -+ */ -+ smp_rmb(); /* LMM(desc_reserve:B) */ -+ -+ if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id -+ )) { /* LMM(desc_reserve:C) */ -+ /* -+ * Make space for the new descriptor by -+ * advancing the tail. -+ */ -+ if (!desc_push_tail(rb, id_prev_wrap)) -+ return false; -+ } -+ -+ /* -+ * 1. Guarantee the tail ID is read before validating the -+ * recycled descriptor state. 
A read memory barrier is -+ * sufficient for this. This pairs with desc_push_tail:B. -+ * -+ * Memory barrier involvement: -+ * -+ * If desc_reserve:C reads from desc_push_tail:B, then -+ * desc_reserve:E reads from desc_make_reusable:A. -+ * -+ * Relies on: -+ * -+ * MB from desc_make_reusable:A to desc_push_tail:B -+ * matching -+ * RMB from desc_reserve:C to desc_reserve:E -+ * -+ * Note: desc_make_reusable:A and desc_push_tail:B can be -+ * different CPUs. However, the desc_push_tail:B CPU -+ * (which performs the full memory barrier) must have -+ * previously seen desc_make_reusable:A. -+ * -+ * 2. Guarantee the tail ID is stored before storing the head -+ * ID. This pairs with desc_reserve:B. -+ * -+ * 3. Guarantee any data ring tail changes are stored before -+ * recycling the descriptor. Data ring tail changes can -+ * happen via desc_push_tail()->data_push_tail(). A full -+ * memory barrier is needed since another CPU may have -+ * pushed the data ring tails. This pairs with -+ * data_push_tail:B. -+ * -+ * 4. Guarantee a new tail ID is stored before recycling the -+ * descriptor. A full memory barrier is needed since -+ * another CPU may have pushed the tail ID. This pairs -+ * with desc_push_tail:C and this also pairs with -+ * prb_first_seq:C. -+ * -+ * 5. Guarantee the head ID is stored before trying to -+ * finalize the previous descriptor. This pairs with -+ * _prb_commit:B. -+ */ -+ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, -+ id)); /* LMM(desc_reserve:D) */ -+ -+ desc = to_desc(desc_ring, id); -+ -+ /* -+ * If the descriptor has been recycled, verify the old state val. -+ * See "ABA Issues" about why this verification is performed. -+ */ -+ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ -+ if (prev_state_val && -+ get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { -+ WARN_ON_ONCE(1); -+ return false; -+ } -+ -+ /* -+ * Assign the descriptor a new ID and set its state to reserved. -+ * See "ABA Issues" about why cmpxchg() instead of set() is used. -+ * -+ * Guarantee the new descriptor ID and state is stored before making -+ * any other changes. A write memory barrier is sufficient for this. -+ * This pairs with desc_read:D. -+ */ -+ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, -+ DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ -+ WARN_ON_ONCE(1); -+ return false; -+ } -+ -+ /* Now data in @desc can be modified: LMM(desc_reserve:G) */ -+ -+ *id_out = id; -+ return true; -+} -+ -+/* Determine the end of a data block. */ -+static unsigned long get_next_lpos(struct prb_data_ring *data_ring, -+ unsigned long lpos, unsigned int size) -+{ -+ unsigned long begin_lpos; -+ unsigned long next_lpos; -+ -+ begin_lpos = lpos; -+ next_lpos = lpos + size; -+ -+ /* First check if the data block does not wrap. */ -+ if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) -+ return next_lpos; -+ -+ /* Wrapping data blocks store their data at the beginning. */ -+ return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); -+} -+ -+/* -+ * Allocate a new data block, invalidating the oldest data block(s) -+ * if necessary. This function also associates the data block with -+ * a specified descriptor. 
-+ */ -+static char *data_alloc(struct printk_ringbuffer *rb, -+ struct prb_data_ring *data_ring, unsigned int size, -+ struct prb_data_blk_lpos *blk_lpos, unsigned long id) -+{ -+ struct prb_data_block *blk; -+ unsigned long begin_lpos; -+ unsigned long next_lpos; -+ -+ if (size == 0) { -+ /* Specify a data-less block. */ -+ blk_lpos->begin = NO_LPOS; -+ blk_lpos->next = NO_LPOS; -+ return NULL; -+ } -+ -+ size = to_blk_size(size); -+ -+ begin_lpos = atomic_long_read(&data_ring->head_lpos); -+ -+ do { -+ next_lpos = get_next_lpos(data_ring, begin_lpos, size); -+ -+ if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) { -+ /* Failed to allocate, specify a data-less block. */ -+ blk_lpos->begin = FAILED_LPOS; -+ blk_lpos->next = FAILED_LPOS; -+ return NULL; -+ } -+ -+ /* -+ * 1. Guarantee any descriptor states that have transitioned -+ * to reusable are stored before modifying the newly -+ * allocated data area. A full memory barrier is needed -+ * since other CPUs may have made the descriptor states -+ * reusable. See data_push_tail:A about why the reusable -+ * states are visible. This pairs with desc_read:D. -+ * -+ * 2. Guarantee any updated tail lpos is stored before -+ * modifying the newly allocated data area. Another CPU may -+ * be in data_make_reusable() and is reading a block ID -+ * from this area. data_make_reusable() can handle reading -+ * a garbage block ID value, but then it must be able to -+ * load a new tail lpos. A full memory barrier is needed -+ * since other CPUs may have updated the tail lpos. This -+ * pairs with data_push_tail:B. -+ */ -+ } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos, -+ next_lpos)); /* LMM(data_alloc:A) */ -+ -+ blk = to_block(data_ring, begin_lpos); -+ blk->id = id; /* LMM(data_alloc:B) */ -+ -+ if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { -+ /* Wrapping data blocks store their data at the beginning. */ -+ blk = to_block(data_ring, 0); -+ -+ /* -+ * Store the ID on the wrapped block for consistency. -+ * The printk_ringbuffer does not actually use it. -+ */ -+ blk->id = id; -+ } -+ -+ blk_lpos->begin = begin_lpos; -+ blk_lpos->next = next_lpos; -+ -+ return &blk->data[0]; -+} -+ -+/* -+ * Try to resize an existing data block associated with the descriptor -+ * specified by @id. If the resized data block should become wrapped, it -+ * copies the old data to the new data block. If @size yields a data block -+ * with the same or less size, the data block is left as is. -+ * -+ * Fail if this is not the last allocated data block or if there is not -+ * enough space or it is not possible make enough space. -+ * -+ * Return a pointer to the beginning of the entire data buffer or NULL on -+ * failure. -+ */ -+static char *data_realloc(struct printk_ringbuffer *rb, -+ struct prb_data_ring *data_ring, unsigned int size, -+ struct prb_data_blk_lpos *blk_lpos, unsigned long id) -+{ -+ struct prb_data_block *blk; -+ unsigned long head_lpos; -+ unsigned long next_lpos; -+ bool wrapped; -+ -+ /* Reallocation only works if @blk_lpos is the newest data block. */ -+ head_lpos = atomic_long_read(&data_ring->head_lpos); -+ if (head_lpos != blk_lpos->next) -+ return NULL; -+ -+ /* Keep track if @blk_lpos was a wrapping data block. */ -+ wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); -+ -+ size = to_blk_size(size); -+ -+ next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); -+ -+ /* If the data block does not increase, there is nothing to do. 
*/ -+ if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { -+ if (wrapped) -+ blk = to_block(data_ring, 0); -+ else -+ blk = to_block(data_ring, blk_lpos->begin); -+ return &blk->data[0]; -+ } -+ -+ if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) -+ return NULL; -+ -+ /* The memory barrier involvement is the same as data_alloc:A. */ -+ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, -+ next_lpos)) { /* LMM(data_realloc:A) */ -+ return NULL; -+ } -+ -+ blk = to_block(data_ring, blk_lpos->begin); -+ -+ if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { -+ struct prb_data_block *old_blk = blk; -+ -+ /* Wrapping data blocks store their data at the beginning. */ -+ blk = to_block(data_ring, 0); -+ -+ /* -+ * Store the ID on the wrapped block for consistency. -+ * The printk_ringbuffer does not actually use it. -+ */ -+ blk->id = id; -+ -+ if (!wrapped) { -+ /* -+ * Since the allocated space is now in the newly -+ * created wrapping data block, copy the content -+ * from the old data block. -+ */ -+ memcpy(&blk->data[0], &old_blk->data[0], -+ (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id)); -+ } -+ } -+ -+ blk_lpos->next = next_lpos; -+ -+ return &blk->data[0]; -+} -+ -+/* Return the number of bytes used by a data block. */ -+static unsigned int space_used(struct prb_data_ring *data_ring, -+ struct prb_data_blk_lpos *blk_lpos) -+{ -+ /* Data-less blocks take no space. */ -+ if (BLK_DATALESS(blk_lpos)) -+ return 0; -+ -+ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { -+ /* Data block does not wrap. */ -+ return (DATA_INDEX(data_ring, blk_lpos->next) - -+ DATA_INDEX(data_ring, blk_lpos->begin)); -+ } -+ -+ /* -+ * For wrapping data blocks, the trailing (wasted) space is -+ * also counted. -+ */ -+ return (DATA_INDEX(data_ring, blk_lpos->next) + -+ DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); -+} -+ -+/* -+ * Given @blk_lpos, return a pointer to the writer data from the data block -+ * and calculate the size of the data part. A NULL pointer is returned if -+ * @blk_lpos specifies values that could never be legal. -+ * -+ * This function (used by readers) performs strict validation on the lpos -+ * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is -+ * triggered if an internal error is detected. -+ */ -+static const char *get_data(struct prb_data_ring *data_ring, -+ struct prb_data_blk_lpos *blk_lpos, -+ unsigned int *data_size) -+{ -+ struct prb_data_block *db; -+ -+ /* Data-less data block description. */ -+ if (BLK_DATALESS(blk_lpos)) { -+ if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { -+ *data_size = 0; -+ return ""; -+ } -+ return NULL; -+ } -+ -+ /* Regular data block: @begin less than @next and in same wrap. */ -+ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && -+ blk_lpos->begin < blk_lpos->next) { -+ db = to_block(data_ring, blk_lpos->begin); -+ *data_size = blk_lpos->next - blk_lpos->begin; -+ -+ /* Wrapping data block: @begin is one wrap behind @next. */ -+ } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == -+ DATA_WRAPS(data_ring, blk_lpos->next)) { -+ db = to_block(data_ring, 0); -+ *data_size = DATA_INDEX(data_ring, blk_lpos->next); -+ -+ /* Illegal block description. */ -+ } else { -+ WARN_ON_ONCE(1); -+ return NULL; -+ } -+ -+ /* A valid data block will always be aligned to the ID size. 
*/ -+ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || -+ WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { -+ return NULL; -+ } -+ -+ /* A valid data block will always have at least an ID. */ -+ if (WARN_ON_ONCE(*data_size < sizeof(db->id))) -+ return NULL; -+ -+ /* Subtract block ID space from size to reflect data size. */ -+ *data_size -= sizeof(db->id); -+ -+ return &db->data[0]; -+} -+ -+/* -+ * Attempt to transition the newest descriptor from committed back to reserved -+ * so that the record can be modified by a writer again. This is only possible -+ * if the descriptor is not yet finalized and the provided @caller_id matches. -+ */ -+static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, -+ u32 caller_id, unsigned long *id_out) -+{ -+ unsigned long prev_state_val; -+ enum desc_state d_state; -+ struct prb_desc desc; -+ struct prb_desc *d; -+ unsigned long id; -+ u32 cid; -+ -+ id = atomic_long_read(&desc_ring->head_id); -+ -+ /* -+ * To reduce unnecessarily reopening, first check if the descriptor -+ * state and caller ID are correct. -+ */ -+ d_state = desc_read(desc_ring, id, &desc, NULL, &cid); -+ if (d_state != desc_committed || cid != caller_id) -+ return NULL; -+ -+ d = to_desc(desc_ring, id); -+ -+ prev_state_val = DESC_SV(id, desc_committed); -+ -+ /* -+ * Guarantee the reserved state is stored before reading any -+ * record data. A full memory barrier is needed because @state_var -+ * modification is followed by reading. This pairs with _prb_commit:B. -+ * -+ * Memory barrier involvement: -+ * -+ * If desc_reopen_last:A reads from _prb_commit:B, then -+ * prb_reserve_in_last:A reads from _prb_commit:A. -+ * -+ * Relies on: -+ * -+ * WMB from _prb_commit:A to _prb_commit:B -+ * matching -+ * MB If desc_reopen_last:A to prb_reserve_in_last:A -+ */ -+ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, -+ DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ -+ return NULL; -+ } -+ -+ *id_out = id; -+ return d; -+} -+ -+/** -+ * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer -+ * used by the newest record. -+ * -+ * @e: The entry structure to setup. -+ * @rb: The ringbuffer to re-reserve and extend data in. -+ * @r: The record structure to allocate buffers for. -+ * @caller_id: The caller ID of the caller (reserving writer). -+ * @max_size: Fail if the extended size would be greater than this. -+ * -+ * This is the public function available to writers to re-reserve and extend -+ * data. -+ * -+ * The writer specifies the text size to extend (not the new total size) by -+ * setting the @text_buf_size field of @r. To ensure proper initialization -+ * of @r, prb_rec_init_wr() should be used. -+ * -+ * This function will fail if @caller_id does not match the caller ID of the -+ * newest record. In that case the caller must reserve new data using -+ * prb_reserve(). -+ * -+ * Context: Any context. Disables local interrupts on success. -+ * Return: true if text data could be extended, otherwise false. -+ * -+ * On success: -+ * -+ * - @r->text_buf points to the beginning of the entire text buffer. -+ * -+ * - @r->text_buf_size is set to the new total size of the buffer. -+ * -+ * - @r->info is not touched so that @r->info->text_len could be used -+ * to append the text. -+ * -+ * - prb_record_text_space() can be used on @e to query the new -+ * actually used space. -+ * -+ * Important: All @r->info fields will already be set with the current values -+ * for the record. 
I.e. @r->info->text_len will be less than -+ * @text_buf_size. Writers can use @r->info->text_len to know -+ * where concatenation begins and writers should update -+ * @r->info->text_len after concatenating. -+ */ -+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, -+ struct printk_record *r, u32 caller_id, unsigned int max_size) -+{ -+ struct prb_desc_ring *desc_ring = &rb->desc_ring; -+ struct printk_info *info; -+ unsigned int data_size; -+ struct prb_desc *d; -+ unsigned long id; -+ -+ local_irq_save(e->irqflags); -+ -+ /* Transition the newest descriptor back to the reserved state. */ -+ d = desc_reopen_last(desc_ring, caller_id, &id); -+ if (!d) { -+ local_irq_restore(e->irqflags); -+ goto fail_reopen; -+ } -+ -+ /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ -+ -+ info = to_info(desc_ring, id); -+ -+ /* -+ * Set the @e fields here so that prb_commit() can be used if -+ * anything fails from now on. -+ */ -+ e->rb = rb; -+ e->id = id; -+ -+ /* -+ * desc_reopen_last() checked the caller_id, but there was no -+ * exclusive access at that point. The descriptor may have -+ * changed since then. -+ */ -+ if (caller_id != info->caller_id) -+ goto fail; -+ -+ if (BLK_DATALESS(&d->text_blk_lpos)) { -+ if (WARN_ON_ONCE(info->text_len != 0)) { -+ pr_warn_once("wrong text_len value (%hu, expecting 0)\n", -+ info->text_len); -+ info->text_len = 0; -+ } -+ -+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) -+ goto fail; -+ -+ if (r->text_buf_size > max_size) -+ goto fail; -+ -+ r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, -+ &d->text_blk_lpos, id); -+ } else { -+ if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) -+ goto fail; -+ -+ /* -+ * Increase the buffer size to include the original size. If -+ * the meta data (@text_len) is not sane, use the full data -+ * block size. -+ */ -+ if (WARN_ON_ONCE(info->text_len > data_size)) { -+ pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", -+ info->text_len, data_size); -+ info->text_len = data_size; -+ } -+ r->text_buf_size += info->text_len; -+ -+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) -+ goto fail; -+ -+ if (r->text_buf_size > max_size) -+ goto fail; -+ -+ r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size, -+ &d->text_blk_lpos, id); -+ } -+ if (r->text_buf_size && !r->text_buf) -+ goto fail; -+ -+ r->info = info; -+ -+ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); -+ -+ return true; -+fail: -+ prb_commit(e); -+ /* prb_commit() re-enabled interrupts. */ -+fail_reopen: -+ /* Make it clear to the caller that the re-reserve failed. */ -+ memset(r, 0, sizeof(*r)); -+ return false; -+} -+ -+/* -+ * Attempt to finalize a specified descriptor. If this fails, the descriptor -+ * is either already final or it will finalize itself when the writer commits. -+ */ -+static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) -+{ -+ unsigned long prev_state_val = DESC_SV(id, desc_committed); -+ struct prb_desc *d = to_desc(desc_ring, id); -+ -+ atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, -+ DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ -+} -+ -+/** -+ * prb_reserve() - Reserve space in the ringbuffer. -+ * -+ * @e: The entry structure to setup. -+ * @rb: The ringbuffer to reserve data in. -+ * @r: The record structure to allocate buffers for. -+ * -+ * This is the public function available to writers to reserve data. 
-+ * -+ * The writer specifies the text size to reserve by setting the -+ * @text_buf_size field of @r. To ensure proper initialization of @r, -+ * prb_rec_init_wr() should be used. -+ * -+ * Context: Any context. Disables local interrupts on success. -+ * Return: true if at least text data could be allocated, otherwise false. -+ * -+ * On success, the fields @info and @text_buf of @r will be set by this -+ * function and should be filled in by the writer before committing. Also -+ * on success, prb_record_text_space() can be used on @e to query the actual -+ * space used for the text data block. -+ * -+ * Important: @info->text_len needs to be set correctly by the writer in -+ * order for data to be readable and/or extended. Its value -+ * is initialized to 0. -+ */ -+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, -+ struct printk_record *r) -+{ -+ struct prb_desc_ring *desc_ring = &rb->desc_ring; -+ struct printk_info *info; -+ struct prb_desc *d; -+ unsigned long id; -+ u64 seq; -+ -+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) -+ goto fail; -+ -+ /* -+ * Descriptors in the reserved state act as blockers to all further -+ * reservations once the desc_ring has fully wrapped. Disable -+ * interrupts during the reserve/commit window in order to minimize -+ * the likelihood of this happening. -+ */ -+ local_irq_save(e->irqflags); -+ -+ if (!desc_reserve(rb, &id)) { -+ /* Descriptor reservation failures are tracked. */ -+ atomic_long_inc(&rb->fail); -+ local_irq_restore(e->irqflags); -+ goto fail; -+ } -+ -+ d = to_desc(desc_ring, id); -+ info = to_info(desc_ring, id); -+ -+ /* -+ * All @info fields (except @seq) are cleared and must be filled in -+ * by the writer. Save @seq before clearing because it is used to -+ * determine the new sequence number. -+ */ -+ seq = info->seq; -+ memset(info, 0, sizeof(*info)); -+ -+ /* -+ * Set the @e fields here so that prb_commit() can be used if -+ * text data allocation fails. -+ */ -+ e->rb = rb; -+ e->id = id; -+ -+ /* -+ * Initialize the sequence number if it has "never been set". -+ * Otherwise just increment it by a full wrap. -+ * -+ * @seq is considered "never been set" if it has a value of 0, -+ * _except_ for @infos[0], which was specially setup by the ringbuffer -+ * initializer and therefore is always considered as set. -+ * -+ * See the "Bootstrap" comment block in printk_ringbuffer.h for -+ * details about how the initializer bootstraps the descriptors. -+ */ -+ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) -+ info->seq = DESC_INDEX(desc_ring, id); -+ else -+ info->seq = seq + DESCS_COUNT(desc_ring); -+ -+ /* -+ * New data is about to be reserved. Once that happens, previous -+ * descriptors are no longer able to be extended. Finalize the -+ * previous descriptor now so that it can be made available to -+ * readers. (For seq==0 there is no previous descriptor.) -+ */ -+ if (info->seq > 0) -+ desc_make_final(desc_ring, DESC_ID(id - 1)); -+ -+ r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, -+ &d->text_blk_lpos, id); -+ /* If text data allocation fails, a data-less record is committed. */ -+ if (r->text_buf_size && !r->text_buf) { -+ prb_commit(e); -+ /* prb_commit() re-enabled interrupts. */ -+ goto fail; -+ } -+ -+ r->info = info; -+ -+ /* Record full text space used by record. */ -+ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); -+ -+ return true; -+fail: -+ /* Make it clear to the caller that the reserve failed. 
*/ -+ memset(r, 0, sizeof(*r)); -+ return false; -+} -+ -+/* Commit the data (possibly finalizing it) and restore interrupts. */ -+static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) -+{ -+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring; -+ struct prb_desc *d = to_desc(desc_ring, e->id); -+ unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); -+ -+ /* Now the writer has finished all writing: LMM(_prb_commit:A) */ -+ -+ /* -+ * Set the descriptor as committed. See "ABA Issues" about why -+ * cmpxchg() instead of set() is used. -+ * -+ * 1 Guarantee all record data is stored before the descriptor state -+ * is stored as committed. A write memory barrier is sufficient -+ * for this. This pairs with desc_read:B and desc_reopen_last:A. -+ * -+ * 2. Guarantee the descriptor state is stored as committed before -+ * re-checking the head ID in order to possibly finalize this -+ * descriptor. This pairs with desc_reserve:D. -+ * -+ * Memory barrier involvement: -+ * -+ * If prb_commit:A reads from desc_reserve:D, then -+ * desc_make_final:A reads from _prb_commit:B. -+ * -+ * Relies on: -+ * -+ * MB _prb_commit:B to prb_commit:A -+ * matching -+ * MB desc_reserve:D to desc_make_final:A -+ */ -+ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, -+ DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ -+ WARN_ON_ONCE(1); -+ } -+ -+ /* Restore interrupts, the reserve/commit window is finished. */ -+ local_irq_restore(e->irqflags); -+} -+ -+/** -+ * prb_commit() - Commit (previously reserved) data to the ringbuffer. -+ * -+ * @e: The entry containing the reserved data information. -+ * -+ * This is the public function available to writers to commit data. -+ * -+ * Note that the data is not yet available to readers until it is finalized. -+ * Finalizing happens automatically when space for the next record is -+ * reserved. -+ * -+ * See prb_final_commit() for a version of this function that finalizes -+ * immediately. -+ * -+ * Context: Any context. Enables local interrupts. -+ */ -+void prb_commit(struct prb_reserved_entry *e) -+{ -+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring; -+ unsigned long head_id; -+ -+ _prb_commit(e, desc_committed); -+ -+ /* -+ * If this descriptor is no longer the head (i.e. a new record has -+ * been allocated), extending the data for this record is no longer -+ * allowed and therefore it must be finalized. -+ */ -+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ -+ if (head_id != e->id) -+ desc_make_final(desc_ring, e->id); -+} -+ -+/** -+ * prb_final_commit() - Commit and finalize (previously reserved) data to -+ * the ringbuffer. -+ * -+ * @e: The entry containing the reserved data information. -+ * -+ * This is the public function available to writers to commit+finalize data. -+ * -+ * By finalizing, the data is made immediately available to readers. -+ * -+ * This function should only be used if there are no intentions of extending -+ * this data using prb_reserve_in_last(). -+ * -+ * Context: Any context. Enables local interrupts. -+ */ -+void prb_final_commit(struct prb_reserved_entry *e) -+{ -+ _prb_commit(e, desc_finalized); -+} -+ -+/* -+ * Count the number of lines in provided text. All text has at least 1 line -+ * (even if @text_size is 0). Each '\n' processed is counted as an additional -+ * line. 
-+ */ -+static unsigned int count_lines(const char *text, unsigned int text_size) -+{ -+ unsigned int next_size = text_size; -+ unsigned int line_count = 1; -+ const char *next = text; -+ -+ while (next_size) { -+ next = memchr(next, '\n', next_size); -+ if (!next) -+ break; -+ line_count++; -+ next++; -+ next_size = text_size - (next - text); -+ } -+ -+ return line_count; -+} -+ -+/* -+ * Given @blk_lpos, copy an expected @len of data into the provided buffer. -+ * If @line_count is provided, count the number of lines in the data. -+ * -+ * This function (used by readers) performs strict validation on the data -+ * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is -+ * triggered if an internal error is detected. -+ */ -+static bool copy_data(struct prb_data_ring *data_ring, -+ struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, -+ unsigned int buf_size, unsigned int *line_count) -+{ -+ unsigned int data_size; -+ const char *data; -+ -+ /* Caller might not want any data. */ -+ if ((!buf || !buf_size) && !line_count) -+ return true; -+ -+ data = get_data(data_ring, blk_lpos, &data_size); -+ if (!data) -+ return false; -+ -+ /* -+ * Actual cannot be less than expected. It can be more than expected -+ * because of the trailing alignment padding. -+ * -+ * Note that invalid @len values can occur because the caller loads -+ * the value during an allowed data race. -+ */ -+ if (data_size < (unsigned int)len) -+ return false; -+ -+ /* Caller interested in the line count? */ -+ if (line_count) -+ *line_count = count_lines(data, data_size); -+ -+ /* Caller interested in the data content? */ -+ if (!buf || !buf_size) -+ return true; -+ -+ data_size = min_t(u16, buf_size, len); -+ -+ memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ -+ return true; -+} -+ -+/* -+ * This is an extended version of desc_read(). It gets a copy of a specified -+ * descriptor. However, it also verifies that the record is finalized and has -+ * the sequence number @seq. On success, 0 is returned. -+ * -+ * Error return values: -+ * -EINVAL: A finalized record with sequence number @seq does not exist. -+ * -ENOENT: A finalized record with sequence number @seq exists, but its data -+ * is not available. This is a valid record, so readers should -+ * continue with the next record. -+ */ -+static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, -+ unsigned long id, u64 seq, -+ struct prb_desc *desc_out) -+{ -+ struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; -+ enum desc_state d_state; -+ u64 s; -+ -+ d_state = desc_read(desc_ring, id, desc_out, &s, NULL); -+ -+ /* -+ * An unexpected @id (desc_miss) or @seq mismatch means the record -+ * does not exist. A descriptor in the reserved or committed state -+ * means the record does not yet exist for the reader. -+ */ -+ if (d_state == desc_miss || -+ d_state == desc_reserved || -+ d_state == desc_committed || -+ s != seq) { -+ return -EINVAL; -+ } -+ -+ /* -+ * A descriptor in the reusable state may no longer have its data -+ * available; report it as existing but with lost data. Or the record -+ * may actually be a record with lost data. -+ */ -+ if (d_state == desc_reusable || -+ (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) { -+ return -ENOENT; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Copy the ringbuffer data from the record with @seq to the provided -+ * @r buffer. On success, 0 is returned. -+ * -+ * See desc_read_finalized_seq() for error return values. 
-+ */ -+static int prb_read(struct printk_ringbuffer *rb, u64 seq, -+ struct printk_record *r, unsigned int *line_count) -+{ -+ struct prb_desc_ring *desc_ring = &rb->desc_ring; -+ struct printk_info *info = to_info(desc_ring, seq); -+ struct prb_desc *rdesc = to_desc(desc_ring, seq); -+ atomic_long_t *state_var = &rdesc->state_var; -+ struct prb_desc desc; -+ unsigned long id; -+ int err; -+ -+ /* Extract the ID, used to specify the descriptor to read. */ -+ id = DESC_ID(atomic_long_read(state_var)); -+ -+ /* Get a local copy of the correct descriptor (if available). */ -+ err = desc_read_finalized_seq(desc_ring, id, seq, &desc); -+ -+ /* -+ * If @r is NULL, the caller is only interested in the availability -+ * of the record. -+ */ -+ if (err || !r) -+ return err; -+ -+ /* If requested, copy meta data. */ -+ if (r->info) -+ memcpy(r->info, info, sizeof(*(r->info))); -+ -+ /* Copy text data. If it fails, this is a data-less record. */ -+ if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len, -+ r->text_buf, r->text_buf_size, line_count)) { -+ return -ENOENT; -+ } -+ -+ /* Ensure the record is still finalized and has the same @seq. */ -+ return desc_read_finalized_seq(desc_ring, id, seq, &desc); -+} -+ -+/* Get the sequence number of the tail descriptor. */ -+static u64 prb_first_seq(struct printk_ringbuffer *rb) -+{ -+ struct prb_desc_ring *desc_ring = &rb->desc_ring; -+ enum desc_state d_state; -+ struct prb_desc desc; -+ unsigned long id; -+ u64 seq; -+ -+ for (;;) { -+ id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ -+ -+ d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */ -+ -+ /* -+ * This loop will not be infinite because the tail is -+ * _always_ in the finalized or reusable state. -+ */ -+ if (d_state == desc_finalized || d_state == desc_reusable) -+ break; -+ -+ /* -+ * Guarantee the last state load from desc_read() is before -+ * reloading @tail_id in order to see a new tail in the case -+ * that the descriptor has been recycled. This pairs with -+ * desc_reserve:D. -+ * -+ * Memory barrier involvement: -+ * -+ * If prb_first_seq:B reads from desc_reserve:F, then -+ * prb_first_seq:A reads from desc_push_tail:B. -+ * -+ * Relies on: -+ * -+ * MB from desc_push_tail:B to desc_reserve:F -+ * matching -+ * RMB prb_first_seq:B to prb_first_seq:A -+ */ -+ smp_rmb(); /* LMM(prb_first_seq:C) */ -+ } -+ -+ return seq; -+} -+ -+/* -+ * Non-blocking read of a record. Updates @seq to the last finalized record -+ * (which may have no data available). -+ * -+ * See the description of prb_read_valid() and prb_read_valid_info() -+ * for details. -+ */ -+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, -+ struct printk_record *r, unsigned int *line_count) -+{ -+ u64 tail_seq; -+ int err; -+ -+ while ((err = prb_read(rb, *seq, r, line_count))) { -+ tail_seq = prb_first_seq(rb); -+ -+ if (*seq < tail_seq) { -+ /* -+ * Behind the tail. Catch up and try again. This -+ * can happen for -ENOENT and -EINVAL cases. -+ */ -+ *seq = tail_seq; -+ -+ } else if (err == -ENOENT) { -+ /* Record exists, but no data available. Skip. */ -+ (*seq)++; -+ -+ } else { -+ /* Non-existent/non-finalized record. Must stop. */ -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+/** -+ * prb_read_valid() - Non-blocking read of a requested record or (if gone) -+ * the next available record. -+ * -+ * @rb: The ringbuffer to read from. -+ * @seq: The sequence number of the record to read. 
-+ * @r: A record data buffer to store the read record to. -+ * -+ * This is the public function available to readers to read a record. -+ * -+ * The reader provides the @info and @text_buf buffers of @r to be -+ * filled in. Any of the buffer pointers can be set to NULL if the reader -+ * is not interested in that data. To ensure proper initialization of @r, -+ * prb_rec_init_rd() should be used. -+ * -+ * Context: Any context. -+ * Return: true if a record was read, otherwise false. -+ * -+ * On success, the reader must check r->info.seq to see which record was -+ * actually read. This allows the reader to detect dropped records. -+ * -+ * Failure means @seq refers to a not yet written record. -+ */ -+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, -+ struct printk_record *r) -+{ -+ return _prb_read_valid(rb, &seq, r, NULL); -+} -+ -+/** -+ * prb_read_valid_info() - Non-blocking read of meta data for a requested -+ * record or (if gone) the next available record. -+ * -+ * @rb: The ringbuffer to read from. -+ * @seq: The sequence number of the record to read. -+ * @info: A buffer to store the read record meta data to. -+ * @line_count: A buffer to store the number of lines in the record text. -+ * -+ * This is the public function available to readers to read only the -+ * meta data of a record. -+ * -+ * The reader provides the @info, @line_count buffers to be filled in. -+ * Either of the buffer pointers can be set to NULL if the reader is not -+ * interested in that data. -+ * -+ * Context: Any context. -+ * Return: true if a record's meta data was read, otherwise false. -+ * -+ * On success, the reader must check info->seq to see which record meta data -+ * was actually read. This allows the reader to detect dropped records. -+ * -+ * Failure means @seq refers to a not yet written record. -+ */ -+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, -+ struct printk_info *info, unsigned int *line_count) -+{ -+ struct printk_record r; -+ -+ prb_rec_init_rd(&r, info, NULL, 0); -+ -+ return _prb_read_valid(rb, &seq, &r, line_count); -+} -+ -+/** -+ * prb_first_valid_seq() - Get the sequence number of the oldest available -+ * record. -+ * -+ * @rb: The ringbuffer to get the sequence number from. -+ * -+ * This is the public function available to readers to see what the -+ * first/oldest valid sequence number is. -+ * -+ * This provides readers a starting point to begin iterating the ringbuffer. -+ * -+ * Context: Any context. -+ * Return: The sequence number of the first/oldest record or, if the -+ * ringbuffer is empty, 0 is returned. -+ */ -+u64 prb_first_valid_seq(struct printk_ringbuffer *rb) -+{ -+ u64 seq = 0; -+ -+ if (!_prb_read_valid(rb, &seq, NULL, NULL)) -+ return 0; -+ -+ return seq; -+} -+ -+/** -+ * prb_next_seq() - Get the sequence number after the last available record. -+ * -+ * @rb: The ringbuffer to get the sequence number from. -+ * -+ * This is the public function available to readers to see what the next -+ * newest sequence number available to readers will be. -+ * -+ * This provides readers a sequence number to jump to if all currently -+ * available records should be skipped. -+ * -+ * Context: Any context. -+ * Return: The sequence number of the next newest (not yet available) record -+ * for readers. -+ */ -+u64 prb_next_seq(struct printk_ringbuffer *rb) -+{ -+ u64 seq = 0; -+ -+ /* Search forward from the oldest descriptor. 
*/ -+ while (_prb_read_valid(rb, &seq, NULL, NULL)) -+ seq++; -+ -+ return seq; -+} -+ -+/** -+ * prb_init() - Initialize a ringbuffer to use provided external buffers. -+ * -+ * @rb: The ringbuffer to initialize. -+ * @text_buf: The data buffer for text data. -+ * @textbits: The size of @text_buf as a power-of-2 value. -+ * @descs: The descriptor buffer for ringbuffer records. -+ * @descbits: The count of @descs items as a power-of-2 value. -+ * @infos: The printk_info buffer for ringbuffer records. -+ * -+ * This is the public function available to writers to setup a ringbuffer -+ * during runtime using provided buffers. -+ * -+ * This must match the initialization of DEFINE_PRINTKRB(). -+ * -+ * Context: Any context. -+ */ -+void prb_init(struct printk_ringbuffer *rb, -+ char *text_buf, unsigned int textbits, -+ struct prb_desc *descs, unsigned int descbits, -+ struct printk_info *infos) -+{ -+ memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); -+ memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0])); -+ -+ rb->desc_ring.count_bits = descbits; -+ rb->desc_ring.descs = descs; -+ rb->desc_ring.infos = infos; -+ atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); -+ atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); -+ -+ rb->text_data_ring.size_bits = textbits; -+ rb->text_data_ring.data = text_buf; -+ atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); -+ atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); -+ -+ atomic_long_set(&rb->fail, 0); -+ -+ atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); -+ descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; -+ descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; -+ -+ infos[0].seq = -(u64)_DESCS_COUNT(descbits); -+ infos[_DESCS_COUNT(descbits) - 1].seq = 0; -+} -+ -+/** -+ * prb_record_text_space() - Query the full actual used ringbuffer space for -+ * the text data of a reserved entry. -+ * -+ * @e: The successfully reserved entry to query. -+ * -+ * This is the public function available to writers to see how much actual -+ * space is used in the ringbuffer to store the text data of the specified -+ * entry. -+ * -+ * This function is only valid if @e has been successfully reserved using -+ * prb_reserve(). -+ * -+ * Context: Any context. -+ * Return: The size in bytes used by the text data of the associated record. -+ */ -+unsigned int prb_record_text_space(struct prb_reserved_entry *e) -+{ -+ return e->text_space; -+} -diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h -new file mode 100644 -index 000000000000..5dc9d022db07 ---- /dev/null -+++ b/kernel/printk/printk_ringbuffer.h -@@ -0,0 +1,382 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef _KERNEL_PRINTK_RINGBUFFER_H -+#define _KERNEL_PRINTK_RINGBUFFER_H -+ -+#include <linux/atomic.h> -+#include <linux/dev_printk.h> -+ -+/* -+ * Meta information about each stored message. -+ * -+ * All fields are set by the printk code except for @seq, which is -+ * set by the ringbuffer code. -+ */ -+struct printk_info { -+ u64 seq; /* sequence number */ -+ u64 ts_nsec; /* timestamp in nanoseconds */ -+ u16 text_len; /* length of text message */ -+ u8 facility; /* syslog facility */ -+ u8 flags:5; /* internal record flags */ -+ u8 level:3; /* syslog level */ -+ u32 caller_id; /* thread id or processor id */ -+ -+ struct dev_printk_info dev_info; -+}; -+ -+/* -+ * A structure providing the buffers, used by writers and readers. 
-+ * -+ * Writers: -+ * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling -+ * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to -+ * buffers reserved for that writer. -+ * -+ * Readers: -+ * Using prb_rec_init_rd(), a reader sets all fields before calling -+ * prb_read_valid(). Note that the reader provides the @info and @text_buf, -+ * buffers. On success, the struct pointed to by @info will be filled and -+ * the char array pointed to by @text_buf will be filled with text data. -+ */ -+struct printk_record { -+ struct printk_info *info; -+ char *text_buf; -+ unsigned int text_buf_size; -+}; -+ -+/* Specifies the logical position and span of a data block. */ -+struct prb_data_blk_lpos { -+ unsigned long begin; -+ unsigned long next; -+}; -+ -+/* -+ * A descriptor: the complete meta-data for a record. -+ * -+ * @state_var: A bitwise combination of descriptor ID and descriptor state. -+ */ -+struct prb_desc { -+ atomic_long_t state_var; -+ struct prb_data_blk_lpos text_blk_lpos; -+}; -+ -+/* A ringbuffer of "ID + data" elements. */ -+struct prb_data_ring { -+ unsigned int size_bits; -+ char *data; -+ atomic_long_t head_lpos; -+ atomic_long_t tail_lpos; -+}; -+ -+/* A ringbuffer of "struct prb_desc" elements. */ -+struct prb_desc_ring { -+ unsigned int count_bits; -+ struct prb_desc *descs; -+ struct printk_info *infos; -+ atomic_long_t head_id; -+ atomic_long_t tail_id; -+}; -+ -+/* -+ * The high level structure representing the printk ringbuffer. -+ * -+ * @fail: Count of failed prb_reserve() calls where not even a data-less -+ * record was created. -+ */ -+struct printk_ringbuffer { -+ struct prb_desc_ring desc_ring; -+ struct prb_data_ring text_data_ring; -+ atomic_long_t fail; -+}; -+ -+/* -+ * Used by writers as a reserve/commit handle. -+ * -+ * @rb: Ringbuffer where the entry is reserved. -+ * @irqflags: Saved irq flags to restore on entry commit. -+ * @id: ID of the reserved descriptor. -+ * @text_space: Total occupied buffer space in the text data ring, including -+ * ID, alignment padding, and wrapping data blocks. -+ * -+ * This structure is an opaque handle for writers. Its contents are only -+ * to be used by the ringbuffer implementation. -+ */ -+struct prb_reserved_entry { -+ struct printk_ringbuffer *rb; -+ unsigned long irqflags; -+ unsigned long id; -+ unsigned int text_space; -+}; -+ -+/* The possible responses of a descriptor state-query. */ -+enum desc_state { -+ desc_miss = -1, /* ID mismatch (pseudo state) */ -+ desc_reserved = 0x0, /* reserved, in use by writer */ -+ desc_committed = 0x1, /* committed by writer, could get reopened */ -+ desc_finalized = 0x2, /* committed, no further modification allowed */ -+ desc_reusable = 0x3, /* free, not yet used by any writer */ -+}; -+ -+#define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) -+#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) -+#define DESC_SV_BITS (sizeof(unsigned long) * 8) -+#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) -+#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) -+#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT)) -+#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) -+#define DESC_ID_MASK (~DESC_FLAGS_MASK) -+#define DESC_ID(sv) ((sv) & DESC_ID_MASK) -+#define FAILED_LPOS 0x1 -+#define NO_LPOS 0x3 -+ -+#define FAILED_BLK_LPOS \ -+{ \ -+ .begin = FAILED_LPOS, \ -+ .next = FAILED_LPOS, \ -+} -+ -+/* -+ * Descriptor Bootstrap -+ * -+ * The descriptor array is minimally initialized to allow immediate usage -+ * by readers and writers. 
The requirements that the descriptor array -+ * initialization must satisfy: -+ * -+ * Req1 -+ * The tail must point to an existing (committed or reusable) descriptor. -+ * This is required by the implementation of prb_first_seq(). -+ * -+ * Req2 -+ * Readers must see that the ringbuffer is initially empty. -+ * -+ * Req3 -+ * The first record reserved by a writer is assigned sequence number 0. -+ * -+ * To satisfy Req1, the tail initially points to a descriptor that is -+ * minimally initialized (having no data block, i.e. data-less with the -+ * data block's lpos @begin and @next values set to FAILED_LPOS). -+ * -+ * To satisfy Req2, the initial tail descriptor is initialized to the -+ * reusable state. Readers recognize reusable descriptors as existing -+ * records, but skip over them. -+ * -+ * To satisfy Req3, the last descriptor in the array is used as the initial -+ * head (and tail) descriptor. This allows the first record reserved by a -+ * writer (head + 1) to be the first descriptor in the array. (Only the first -+ * descriptor in the array could have a valid sequence number of 0.) -+ * -+ * The first time a descriptor is reserved, it is assigned a sequence number -+ * with the value of the array index. A "first time reserved" descriptor can -+ * be recognized because it has a sequence number of 0 but does not have an -+ * index of 0. (Only the first descriptor in the array could have a valid -+ * sequence number of 0.) After the first reservation, all future reservations -+ * (recycling) simply involve incrementing the sequence number by the array -+ * count. -+ * -+ * Hack #1 -+ * Only the first descriptor in the array is allowed to have the sequence -+ * number 0. In this case it is not possible to recognize if it is being -+ * reserved the first time (set to index value) or has been reserved -+ * previously (increment by the array count). This is handled by _always_ -+ * incrementing the sequence number by the array count when reserving the -+ * first descriptor in the array. In order to satisfy Req3, the sequence -+ * number of the first descriptor in the array is initialized to minus -+ * the array count. Then, upon the first reservation, it is incremented -+ * to 0, thus satisfying Req3. -+ * -+ * Hack #2 -+ * prb_first_seq() can be called at any time by readers to retrieve the -+ * sequence number of the tail descriptor. However, due to Req2 and Req3, -+ * initially there are no records to report the sequence number of -+ * (sequence numbers are u64 and there is nothing less than 0). To handle -+ * this, the sequence number of the initial tail descriptor is initialized -+ * to 0. Technically this is incorrect, because there is no record with -+ * sequence number 0 (yet) and the tail descriptor is not the first -+ * descriptor in the array. But it allows prb_read_valid() to correctly -+ * report the existence of a record for _any_ given sequence number at all -+ * times. Bootstrapping is complete when the tail is pushed the first -+ * time, thus finally pointing to the first descriptor reserved by a -+ * writer, which has the assigned sequence number 0. -+ */ -+ -+/* -+ * Initiating Logical Value Overflows -+ * -+ * Both logical position (lpos) and ID values can be mapped to array indexes -+ * but may experience overflows during the lifetime of the system. To ensure -+ * that printk_ringbuffer can handle the overflows for these types, initial -+ * values are chosen that map to the correct initial array indexes, but will -+ * result in overflows soon. 
-+ * -+ * BLK0_LPOS -+ * The initial @head_lpos and @tail_lpos for data rings. It is at index -+ * 0 and the lpos value is such that it will overflow on the first wrap. -+ * -+ * DESC0_ID -+ * The initial @head_id and @tail_id for the desc ring. It is at the last -+ * index of the descriptor array (see Req3 above) and the ID value is such -+ * that it will overflow on the second wrap. -+ */ -+#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits))) -+#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1)) -+#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable) -+ -+/* -+ * Define a ringbuffer with an external text data buffer. The same as -+ * DEFINE_PRINTKRB() but requires specifying an external buffer for the -+ * text data. -+ * -+ * Note: The specified external buffer must be of the size: -+ * 2 ^ (descbits + avgtextbits) -+ */ -+#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \ -+static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ -+ /* the initial head and tail */ \ -+ [_DESCS_COUNT(descbits) - 1] = { \ -+ /* reusable */ \ -+ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ -+ /* no associated data block */ \ -+ .text_blk_lpos = FAILED_BLK_LPOS, \ -+ }, \ -+}; \ -+static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \ -+ /* this will be the first record reserved by a writer */ \ -+ [0] = { \ -+ /* will be incremented to 0 on the first reservation */ \ -+ .seq = -(u64)_DESCS_COUNT(descbits), \ -+ }, \ -+ /* the initial head and tail */ \ -+ [_DESCS_COUNT(descbits) - 1] = { \ -+ /* reports the first seq value during the bootstrap phase */ \ -+ .seq = 0, \ -+ }, \ -+}; \ -+static struct printk_ringbuffer name = { \ -+ .desc_ring = { \ -+ .count_bits = descbits, \ -+ .descs = &_##name##_descs[0], \ -+ .infos = &_##name##_infos[0], \ -+ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ -+ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ -+ }, \ -+ .text_data_ring = { \ -+ .size_bits = (avgtextbits) + (descbits), \ -+ .data = text_buf, \ -+ .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ -+ .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ -+ }, \ -+ .fail = ATOMIC_LONG_INIT(0), \ -+} -+ -+/** -+ * DEFINE_PRINTKRB() - Define a ringbuffer. -+ * -+ * @name: The name of the ringbuffer variable. -+ * @descbits: The number of descriptors as a power-of-2 value. -+ * @avgtextbits: The average text data size per record as a power-of-2 value. -+ * -+ * This is a macro for defining a ringbuffer and all internal structures -+ * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a -+ * variant where the text data buffer can be specified externally. -+ */ -+#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \ -+static char _##name##_text[1U << ((avgtextbits) + (descbits))] \ -+ __aligned(__alignof__(unsigned long)); \ -+_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0]) -+ -+/* Writer Interface */ -+ -+/** -+ * prb_rec_init_wd() - Initialize a buffer for writing records. -+ * -+ * @r: The record to initialize. -+ * @text_buf_size: The needed text buffer size. 
-+ */ -+static inline void prb_rec_init_wr(struct printk_record *r, -+ unsigned int text_buf_size) -+{ -+ r->info = NULL; -+ r->text_buf = NULL; -+ r->text_buf_size = text_buf_size; -+} -+ -+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, -+ struct printk_record *r); -+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, -+ struct printk_record *r, u32 caller_id, unsigned int max_size); -+void prb_commit(struct prb_reserved_entry *e); -+void prb_final_commit(struct prb_reserved_entry *e); -+ -+void prb_init(struct printk_ringbuffer *rb, -+ char *text_buf, unsigned int text_buf_size, -+ struct prb_desc *descs, unsigned int descs_count_bits, -+ struct printk_info *infos); -+unsigned int prb_record_text_space(struct prb_reserved_entry *e); -+ -+/* Reader Interface */ -+ -+/** -+ * prb_rec_init_rd() - Initialize a buffer for reading records. -+ * -+ * @r: The record to initialize. -+ * @info: A buffer to store record meta-data. -+ * @text_buf: A buffer to store text data. -+ * @text_buf_size: The size of @text_buf. -+ * -+ * Initialize all the fields that a reader is interested in. All arguments -+ * (except @r) are optional. Only record data for arguments that are -+ * non-NULL or non-zero will be read. -+ */ -+static inline void prb_rec_init_rd(struct printk_record *r, -+ struct printk_info *info, -+ char *text_buf, unsigned int text_buf_size) -+{ -+ r->info = info; -+ r->text_buf = text_buf; -+ r->text_buf_size = text_buf_size; -+} -+ -+/** -+ * prb_for_each_record() - Iterate over the records of a ringbuffer. -+ * -+ * @from: The sequence number to begin with. -+ * @rb: The ringbuffer to iterate over. -+ * @s: A u64 to store the sequence number on each iteration. -+ * @r: A printk_record to store the record on each iteration. -+ * -+ * This is a macro for conveniently iterating over a ringbuffer. -+ * Note that @s may not be the sequence number of the record on each -+ * iteration. For the sequence number, @r->info->seq should be checked. -+ * -+ * Context: Any context. -+ */ -+#define prb_for_each_record(from, rb, s, r) \ -+for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1) -+ -+/** -+ * prb_for_each_info() - Iterate over the meta data of a ringbuffer. -+ * -+ * @from: The sequence number to begin with. -+ * @rb: The ringbuffer to iterate over. -+ * @s: A u64 to store the sequence number on each iteration. -+ * @i: A printk_info to store the record meta data on each iteration. -+ * @lc: An unsigned int to store the text line count of each record. -+ * -+ * This is a macro for conveniently iterating over a ringbuffer. -+ * Note that @s may not be the sequence number of the record on each -+ * iteration. For the sequence number, @r->info->seq should be checked. -+ * -+ * Context: Any context. 
-+ */ -+#define prb_for_each_info(from, rb, s, i, lc) \ -+for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1) -+ -+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, -+ struct printk_record *r); -+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, -+ struct printk_info *info, unsigned int *line_count); -+ -+u64 prb_first_valid_seq(struct printk_ringbuffer *rb); -+u64 prb_next_seq(struct printk_ringbuffer *rb); -+ -+#endif /* _KERNEL_PRINTK_RINGBUFFER_H */ ++EXPORT_SYMBOL(pr_flush); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c deleted file mode 100644 -index 50aeae770434..000000000000 +index a0e6f746de6c..000000000000 --- a/kernel/printk/printk_safe.c +++ /dev/null @@ -1,414 +0,0 @@ @@ -17677,7 +13680,7 @@ index 50aeae770434..000000000000 - * is later flushed into the main ring buffer via IRQ work. - * - * The alternative implementation is chosen transparently -- * by examinig current printk() context mask stored in @printk_context +- * by examining current printk() context mask stored in @printk_context - * per-CPU variable. - * - * The implementation allows to flush the strings also from another CPU. @@ -18030,7 +14033,7 @@ index 50aeae770434..000000000000 - raw_spin_trylock(&logbuf_lock)) { - int len; - -- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); +- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - raw_spin_unlock(&logbuf_lock); - defer_console_output(); - return len; @@ -18068,7 +14071,7 @@ index 50aeae770434..000000000000 - printk_safe_flush(); -} diff --git a/kernel/ptrace.c b/kernel/ptrace.c -index 43d6179508d6..3075006d720e 100644 +index 61db50f7ca86..db33b5240e34 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -180,7 +180,14 @@ static bool ptrace_freeze_traced(struct task_struct *task) @@ -18087,11 +14090,48 @@ index 43d6179508d6..3075006d720e 100644 ret = true; } spin_unlock_irq(&task->sighand->siglock); +@@ -190,8 +197,8 @@ static bool ptrace_freeze_traced(struct task_struct *task) + + static void ptrace_unfreeze_traced(struct task_struct *task) + { +- if (task->state != __TASK_TRACED) +- return; ++ unsigned long flags; ++ bool frozen = true; + + WARN_ON(!task->ptrace || task->parent != current); + +@@ -200,12 +207,19 @@ static void ptrace_unfreeze_traced(struct task_struct *task) + * Recheck state under the lock to close this race. + */ + spin_lock_irq(&task->sighand->siglock); +- if (task->state == __TASK_TRACED) { +- if (__fatal_signal_pending(task)) +- wake_up_state(task, __TASK_TRACED); +- else +- task->state = TASK_TRACED; +- } ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (task->state == __TASK_TRACED) ++ task->state = TASK_TRACED; ++ else if (task->saved_state == __TASK_TRACED) ++ task->saved_state = TASK_TRACED; ++ else ++ frozen = false; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++ ++ if (frozen && __fatal_signal_pending(task)) ++ wake_up_state(task, __TASK_TRACED); ++ + spin_unlock_irq(&task->sighand->siglock); + } + diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig -index 0ebe15a84985..046974b22b8f 100644 +index cdc57b4f6d48..aa8cc8c977e7 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig -@@ -186,8 +186,8 @@ config RCU_FAST_NO_HZ +@@ -188,8 +188,8 @@ config RCU_FAST_NO_HZ config RCU_BOOST bool "Enable RCU priority boosting" @@ -18103,10 +14143,10 @@ index 0ebe15a84985..046974b22b8f 100644 This option boosts the priority of preempted RCU readers that block the current preemptible RCU grace period for too long. 
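For orientation, a minimal usage sketch of the prb_* writer/reader API whose kernel-doc appears in the printk_ringbuffer hunks above (carried here only because the RT patch no longer needs to add that code itself). The sketch is illustrative and is not code contained in this patch: the ringbuffer name, the sizes and the example_* helpers are invented, and error handling is trimmed to the essentials.

/* 2^5 = 32 descriptors, 2^(5+6) = 2 KiB of text data. */
DEFINE_PRINTKRB(example_rb, 5, 6);

static void example_write(const char *text, u16 len)
{
        struct prb_reserved_entry e;
        struct printk_record r;

        prb_rec_init_wr(&r, len);
        if (!prb_reserve(&e, &example_rb, &r))
                return;                 /* no space (a data-less record may have been committed) */

        memcpy(&r.text_buf[0], text, len);
        r.info->text_len = len;         /* readers rely on this value being set */
        prb_final_commit(&e);           /* finalize: immediately visible to readers */
}

static void example_dump(void)
{
        struct printk_info info;
        struct printk_record r;
        char text[64];
        u64 seq;

        prb_rec_init_rd(&r, &info, &text[0], sizeof(text));

        prb_for_each_record(0, &example_rb, seq, &r)
                pr_info("seq=%llu, %hu bytes of text\n", info.seq, info.text_len);
}

Using prb_commit() instead of prb_final_commit() would leave the record open so that a later prb_reserve_in_last() call with the same caller ID can append to it, which is the mechanism printk's continuation-line handling builds on.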
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c -index f453bf8d2f1e..a046e0c84db9 100644 +index 528ed10b78fd..e035508dd0f6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c -@@ -74,10 +74,13 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@ +@@ -61,10 +61,13 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@ #define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ #define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ #define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */ @@ -18122,7 +14162,7 @@ index f453bf8d2f1e..a046e0c84db9 100644 #define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */ /* Must be power of two minus one. */ #define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3) -@@ -1246,31 +1249,53 @@ static void rcutorture_one_extend(int *readstate, int newstate, +@@ -1250,31 +1253,53 @@ static void rcutorture_one_extend(int *readstate, int newstate, WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); rtrsp->rt_readstate = newstate; @@ -18183,7 +14223,7 @@ index f453bf8d2f1e..a046e0c84db9 100644 if (statesold & RCUTORTURE_RDR_RCU) { bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); -@@ -1313,6 +1338,12 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) +@@ -1317,6 +1342,12 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) int mask = rcutorture_extend_mask_max(); unsigned long randmask1 = torture_random(trsp) >> 8; unsigned long randmask2 = randmask1 >> 3; @@ -18196,7 +14236,7 @@ index f453bf8d2f1e..a046e0c84db9 100644 WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); /* Mostly only one bit (need preemption!), sometimes lots of bits. */ -@@ -1320,11 +1351,49 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) +@@ -1324,11 +1355,49 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) mask = mask & randmask2; else mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); @@ -18252,10 +14292,10 @@ index f453bf8d2f1e..a046e0c84db9 100644 } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c -index f78ee759af9c..367165074e5f 100644 +index 40e5e3dd253e..d60903581300 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c -@@ -113,8 +113,10 @@ static struct rcu_state rcu_state = { +@@ -100,8 +100,10 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ @@ -18268,10 +14308,10 @@ index f78ee759af9c..367165074e5f 100644 static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c -index 2de49b5d8dd2..294f7021a459 100644 +index 39334d2d2b37..b95ae86c40a7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c -@@ -69,8 +69,10 @@ +@@ -56,8 +56,10 @@ #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); @@ -18284,10 +14324,10 @@ index 2de49b5d8dd2..294f7021a459 100644 #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 2d95dc3f4644..2d54f1e7ef86 100644 +index ff74fca39ed2..2c678a0c77ad 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -63,7 +63,11 @@ const_debug unsigned int sysctl_sched_features = +@@ -64,7 +64,11 @@ const_debug unsigned int sysctl_sched_features = * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. 
*/ @@ -18299,7 +14339,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 /* * period over which we measure -rt task CPU usage in us. -@@ -511,9 +515,15 @@ static bool set_nr_if_polling(struct task_struct *p) +@@ -504,9 +508,15 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif @@ -18317,7 +14357,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 /* * Atomically grab the task, if ->wake_q is !nil already it means -@@ -549,7 +559,13 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) +@@ -542,7 +552,13 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) */ void wake_q_add(struct wake_q_head *head, struct task_struct *task) { @@ -18332,7 +14372,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 get_task_struct(task); } -@@ -572,28 +588,39 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) +@@ -565,28 +581,39 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) */ void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) { @@ -18377,606 +14417,72 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 put_task_struct(task); } } -@@ -629,6 +656,48 @@ void resched_curr(struct rq *rq) - trace_sched_wake_idle_without_ipi(cpu); - } - -+#ifdef CONFIG_PREEMPT_LAZY -+ -+static int tsk_is_polling(struct task_struct *p) -+{ -+#ifdef TIF_POLLING_NRFLAG -+ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); -+#else -+ return 0; -+#endif -+} -+ -+void resched_curr_lazy(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ if (!sched_feat(PREEMPT_LAZY)) { -+ resched_curr(rq); -+ return; -+ } -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ if (test_tsk_need_resched_lazy(curr)) -+ return; -+ -+ set_tsk_need_resched_lazy(curr); -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) -+ return; -+ -+ /* NEED_RESCHED_LAZY must be visible before we test polling */ -+ smp_mb(); -+ if (!tsk_is_polling(curr)) -+ smp_send_reschedule(cpu); -+} -+#endif -+ - void resched_cpu(int cpu) - { - struct rq *rq = cpu_rq(cpu); -@@ -1700,6 +1769,86 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) - - #ifdef CONFIG_SMP - -+#ifdef CONFIG_PREEMPT_RT -+ -+static void -+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); -+ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, -+ u32 flags); -+ -+static void migrate_disable_switch(struct rq *rq, struct task_struct *p) -+{ -+ if (likely(!p->migration_disabled)) -+ return; -+ -+ if (p->cpus_ptr != &p->cpus_mask) -+ return; -+ -+ /* -+ * Violates locking rules! see comment in __do_set_cpus_allowed(). -+ */ -+ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); -+} -+ -+void migrate_disable(void) -+{ -+ struct task_struct *p = current; -+ -+ if (p->migration_disabled) { -+ p->migration_disabled++; -+ return; -+ } -+ -+ trace_sched_migrate_disable_tp(p); -+ -+ preempt_disable(); -+ this_rq()->nr_pinned++; -+ p->migration_disabled = 1; -+ preempt_lazy_disable(); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(migrate_disable); -+ -+void migrate_enable(void) -+{ -+ struct task_struct *p = current; -+ -+ if (p->migration_disabled > 1) { -+ p->migration_disabled--; -+ return; -+ } -+ -+ /* -+ * Ensure stop_task runs either before or after this, and that -+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). 
-+ */ -+ preempt_disable(); -+ if (p->cpus_ptr != &p->cpus_mask) -+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); -+ /* -+ * Mustn't clear migration_disabled() until cpus_ptr points back at the -+ * regular cpus_mask, otherwise things that race (eg. -+ * select_fallback_rq) get confused. -+ */ -+ barrier(); -+ p->migration_disabled = 0; -+ this_rq()->nr_pinned--; -+ preempt_lazy_enable(); -+ preempt_enable(); -+ -+ trace_sched_migrate_enable_tp(p); -+} -+EXPORT_SYMBOL_GPL(migrate_enable); -+ -+static inline bool rq_has_pinned_tasks(struct rq *rq) -+{ -+ return rq->nr_pinned; -+} -+ -+#endif -+ - /* - * Per-CPU kthreads are allowed to run on !active && online CPUs, see - * __set_cpus_allowed_ptr() and select_fallback_rq(). -@@ -1709,7 +1858,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) - return false; - -- if (is_per_cpu_kthread(p)) -+ if (is_per_cpu_kthread(p) || is_migration_disabled(p)) - return cpu_online(cpu); - - return cpu_active(cpu); -@@ -1756,6 +1905,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, - struct migration_arg { - struct task_struct *task; - int dest_cpu; -+ struct completion *done; - }; - - /* -@@ -1790,6 +1940,7 @@ static int migration_cpu_stop(void *data) - struct migration_arg *arg = data; - struct task_struct *p = arg->task; - struct rq *rq = this_rq(); -+ bool complete = false; - struct rq_flags rf; - - /* -@@ -1812,15 +1963,70 @@ static int migration_cpu_stop(void *data) - * we're holding p->pi_lock. - */ - if (task_rq(p) == rq) { -+ if (is_migration_disabled(p)) -+ goto out; -+ - if (task_on_rq_queued(p)) - rq = __migrate_task(rq, &rf, p, arg->dest_cpu); - else - p->wake_cpu = arg->dest_cpu; -+ -+ if (arg->done) { -+ p->migration_pending = NULL; -+ complete = true; -+ } - } -+out: - rq_unlock(rq, &rf); - raw_spin_unlock(&p->pi_lock); -- - local_irq_enable(); -+ -+ if (complete) -+ complete_all(arg->done); -+ -+ return 0; -+} -+ -+int push_cpu_stop(void *arg) -+{ -+ struct rq *lowest_rq = NULL, *rq = this_rq(); -+ struct task_struct *p = arg; -+ -+ raw_spin_lock_irq(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ if (task_rq(p) != rq) -+ goto out_unlock; -+ -+ if (is_migration_disabled(p)) { -+ p->migration_flags |= MDF_PUSH; -+ goto out_unlock; -+ } -+ -+ p->migration_flags &= ~MDF_PUSH; -+ -+ if (p->sched_class->find_lock_rq) -+ lowest_rq = p->sched_class->find_lock_rq(p, rq); -+ -+ if (!lowest_rq) -+ goto out_unlock; -+ -+ // XXX validate p is still the highest prio task -+ if (task_rq(p) == rq) { -+ deactivate_task(rq, p, 0); -+ set_task_cpu(p, lowest_rq->cpu); -+ activate_task(lowest_rq, p, 0); -+ resched_curr(lowest_rq); -+ } -+ -+ double_unlock_balance(rq, lowest_rq); -+ -+out_unlock: -+ rq->push_busy = false; -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irq(&p->pi_lock); -+ -+ put_task_struct(p); - return 0; - } - -@@ -1828,18 +2034,39 @@ static int migration_cpu_stop(void *data) - * sched_class::set_cpus_allowed must do the below, but is not required to - * actually call this function. 
- */ --void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) - { -+ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { -+ p->cpus_ptr = new_mask; -+ return; -+ } -+ - cpumask_copy(&p->cpus_mask, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); - } - --void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+static void -+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) - { - struct rq *rq = task_rq(p); - bool queued, running; - -- lockdep_assert_held(&p->pi_lock); -+ /* -+ * This here violates the locking rules for affinity, since we're only -+ * supposed to change these variables while holding both rq->lock and -+ * p->pi_lock. -+ * -+ * HOWEVER, it magically works, because ttwu() is the only code that -+ * accesses these variables under p->pi_lock and only does so after -+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() -+ * before finish_task(). -+ * -+ * XXX do further audits, this smells like something putrid. -+ */ -+ if (flags & SCA_MIGRATE_DISABLE) -+ SCHED_WARN_ON(!p->on_cpu); -+ else -+ lockdep_assert_held(&p->pi_lock); - - queued = task_on_rq_queued(p); - running = task_current(rq, p); -@@ -1855,7 +2082,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) - if (running) - put_prev_task(rq, p); - -- p->sched_class->set_cpus_allowed(p, new_mask); -+ p->sched_class->set_cpus_allowed(p, new_mask, flags); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); -@@ -1863,6 +2090,208 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) - set_next_task(rq, p); - } +@@ -622,6 +649,48 @@ void resched_curr(struct rq *rq) + trace_sched_wake_idle_without_ipi(cpu); + } -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++#ifdef CONFIG_PREEMPT_LAZY ++ ++static int tsk_is_polling(struct task_struct *p) +{ -+ __do_set_cpus_allowed(p, new_mask, 0); ++#ifdef TIF_POLLING_NRFLAG ++ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); ++#else ++ return 0; ++#endif +} + -+struct set_affinity_pending { -+ refcount_t refs; -+ struct completion done; -+ struct cpu_stop_work stop_work; -+ struct migration_arg arg; -+}; -+ -+/* -+ * This function is wildly self concurrent; here be dragons. -+ * -+ * -+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the -+ * designated task is enqueued on an allowed CPU. If that task is currently -+ * running, we have to kick it out using the CPU stopper. -+ * -+ * Migrate-Disable comes along and tramples all over our nice sandcastle. -+ * Consider: -+ * -+ * Initial conditions: P0->cpus_mask = [0, 1] -+ * -+ * P0@CPU0 P1 -+ * -+ * migrate_disable(); -+ * <preempted> -+ * set_cpus_allowed_ptr(P0, [1]); -+ * -+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes -+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). -+ * This means we need the following scheme: -+ * -+ * P0@CPU0 P1 -+ * -+ * migrate_disable(); -+ * <preempted> -+ * set_cpus_allowed_ptr(P0, [1]); -+ * <blocks> -+ * <resumes> -+ * migrate_enable(); -+ * __set_cpus_allowed_ptr(); -+ * <wakes local stopper> -+ * `--> <woken on migration completion> -+ * -+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple -+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. 
CPU affinity changes of any -+ * task p are serialized by p->pi_lock, which we can leverage: the one that -+ * should come into effect at the end of the Migrate-Disable region is the last -+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), -+ * but we still need to properly signal those waiting tasks at the appropriate -+ * moment. -+ * -+ * This is implemented using struct set_affinity_pending. The first -+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will -+ * setup an instance of that struct and install it on the targeted task_struct. -+ * Any and all further callers will reuse that instance. Those then wait for -+ * a completion signaled at the tail of the CPU stopper callback (1), triggered -+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). -+ * -+ * -+ * (1) In the cases covered above. There is one more where the completion is -+ * signaled within affine_move_task() itself: when a subsequent affinity request -+ * cancels the need for an active migration. Consider: -+ * -+ * Initial conditions: P0->cpus_mask = [0, 1] -+ * -+ * P0@CPU0 P1 P2 -+ * -+ * migrate_disable(); -+ * <preempted> -+ * set_cpus_allowed_ptr(P0, [1]); -+ * <blocks> -+ * set_cpus_allowed_ptr(P0, [0, 1]); -+ * <signal completion> -+ * <awakes> -+ * -+ * Note that the above is safe vs a concurrent migrate_enable(), as any -+ * pending affinity completion is preceded an uninstallion of -+ * p->migration_pending done with p->pi_lock held. -+ */ -+static int affine_move_task(struct rq *rq, struct rq_flags *rf, -+ struct task_struct *p, int dest_cpu, unsigned int flags) ++void resched_curr_lazy(struct rq *rq) +{ -+ struct set_affinity_pending my_pending = { }, *pending = NULL; -+ struct migration_arg arg = { -+ .task = p, -+ .dest_cpu = dest_cpu, -+ }; -+ bool complete = false; -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { -+ struct task_struct *push_task = NULL; -+ -+ if ((flags & SCA_MIGRATE_ENABLE) && -+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) { -+ rq->push_busy = true; -+ push_task = get_task_struct(p); -+ } -+ -+ pending = p->migration_pending; -+ if (pending) { -+ refcount_inc(&pending->refs); -+ p->migration_pending = NULL; -+ complete = true; -+ } -+ task_rq_unlock(rq, p, rf); -+ -+ if (push_task) { -+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop, -+ p, &rq->push_work); -+ } -+ -+ if (complete) -+ goto do_complete; -+ -+ return 0; -+ } -+ -+ if (!(flags & SCA_MIGRATE_ENABLE)) { -+ /* serialized by p->pi_lock */ -+ if (!p->migration_pending) { -+ /* Install the request */ -+ refcount_set(&my_pending.refs, 1); -+ init_completion(&my_pending.done); -+ p->migration_pending = &my_pending; -+ } else { -+ pending = p->migration_pending; -+ refcount_inc(&pending->refs); -+ } -+ } -+ pending = p->migration_pending; -+ /* -+ * - !MIGRATE_ENABLE: -+ * we'll have installed a pending if there wasn't one already. -+ * -+ * - MIGRATE_ENABLE: -+ * we're here because the current CPU isn't matching anymore, -+ * the only way that can happen is because of a concurrent -+ * set_cpus_allowed_ptr() call, which should then still be -+ * pending completion. -+ * -+ * Either way, we really should have a @pending here. 
-+ */ -+ if (WARN_ON_ONCE(!pending)) -+ return -EINVAL; -+ -+ arg.done = &pending->done; -+ -+ if (flags & SCA_MIGRATE_ENABLE) { -+ -+ p->migration_flags &= ~MDF_PUSH; -+ task_rq_unlock(rq, p, rf); -+ pending->arg = arg; -+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, -+ &pending->arg, &pending->stop_work); ++ struct task_struct *curr = rq->curr; ++ int cpu; + -+ return 0; ++ if (!sched_feat(PREEMPT_LAZY)) { ++ resched_curr(rq); ++ return; + } + -+ if (task_running(rq, p) || p->state == TASK_WAKING) { -+ /* -+ * Lessen races (and headaches) by delegating -+ * is_migration_disabled(p) checks to the stopper, which will -+ * run on the same CPU as said p. -+ */ -+ task_rq_unlock(rq, p, rf); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ -+ } else { -+ -+ if (!is_migration_disabled(p)) { -+ if (task_on_rq_queued(p)) -+ rq = move_queued_task(rq, rf, p, dest_cpu); -+ -+ p->migration_pending = NULL; -+ complete = true; -+ } -+ task_rq_unlock(rq, p, rf); ++ lockdep_assert_held(&rq->lock); + -+do_complete: -+ if (complete) -+ complete_all(&pending->done); -+ } ++ if (test_tsk_need_resched(curr)) ++ return; + -+ wait_for_completion(&pending->done); ++ if (test_tsk_need_resched_lazy(curr)) ++ return; + -+ if (refcount_dec_and_test(&pending->refs)) -+ wake_up_var(&pending->refs); ++ set_tsk_need_resched_lazy(curr); + -+ /* -+ * Block the original owner of &pending until all subsequent callers -+ * have seen the completion and decremented the refcount -+ */ -+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) ++ return; + -+ return 0; ++ /* NEED_RESCHED_LAZY must be visible before we test polling */ ++ smp_mb(); ++ if (!tsk_is_polling(curr)) ++ smp_send_reschedule(cpu); +} ++#endif + - /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on -@@ -1873,7 +2302,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) - * call is not atomic; no spinlocks may be held. - */ - static int __set_cpus_allowed_ptr(struct task_struct *p, -- const struct cpumask *new_mask, bool check) -+ const struct cpumask *new_mask, -+ u32 flags) - { - const struct cpumask *cpu_valid_mask = cpu_active_mask; - unsigned int dest_cpu; -@@ -1884,9 +2314,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, - rq = task_rq_lock(p, &rf); - update_rq_clock(rq); - -- if (p->flags & PF_KTHREAD) { -+ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { - /* -- * Kernel threads are allowed on online && !active CPUs -+ * Kernel threads are allowed on online && !active CPUs. -+ * -+ * Specifically, migration_disabled() tasks must not fail the -+ * cpumask_any_and_distribute() pick below, esp. so on -+ * SCA_MIGRATE_ENABLE, otherwise we'll not call -+ * set_cpus_allowed_common() and actually reset p->cpus_ptr. - */ - cpu_valid_mask = cpu_online_mask; - } -@@ -1895,13 +2330,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, - * Must re-check here, to close a race against __kthread_bind(), - * sched_setaffinity() is not guaranteed to observe the flag. 
- */ -- if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { - ret = -EINVAL; - goto out; - } - -- if (cpumask_equal(&p->cpus_mask, new_mask)) -- goto out; -+ if (!(flags & SCA_MIGRATE_ENABLE)) { -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ if (WARN_ON_ONCE(p == current && -+ is_migration_disabled(p) && -+ !cpumask_test_cpu(task_cpu(p), new_mask))) { -+ ret = -EBUSY; -+ goto out; -+ } -+ } - - /* - * Picking a ~random cpu helps in cases where we are changing affinity -@@ -1914,7 +2358,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, - goto out; - } - -- do_set_cpus_allowed(p, new_mask); -+ __do_set_cpus_allowed(p, new_mask, flags); - - if (p->flags & PF_KTHREAD) { - /* -@@ -1926,23 +2370,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, - p->nr_cpus_allowed != 1); - } - -- /* Can the task run on the task's current CPU? If so, we're done */ -- if (cpumask_test_cpu(task_cpu(p), new_mask)) -- goto out; -+ return affine_move_task(rq, &rf, p, dest_cpu, flags); - -- if (task_running(rq, p) || p->state == TASK_WAKING) { -- struct migration_arg arg = { p, dest_cpu }; -- /* Need help from migration thread: drop lock and wait. */ -- task_rq_unlock(rq, p, &rf); -- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -- return 0; -- } else if (task_on_rq_queued(p)) { -- /* -- * OK, since we're going to drop the lock immediately -- * afterwards anyway. -- */ -- rq = move_queued_task(rq, &rf, p, dest_cpu); -- } - out: - task_rq_unlock(rq, p, &rf); - -@@ -1951,7 +2380,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, - - int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) + void resched_cpu(int cpu) { -- return __set_cpus_allowed_ptr(p, new_mask, false); -+ return __set_cpus_allowed_ptr(p, new_mask, 0); + struct rq *rq = cpu_rq(cpu); +@@ -1753,6 +1822,7 @@ void migrate_disable(void) + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; ++ preempt_lazy_disable(); + preempt_enable(); } - EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -@@ -1992,6 +2421,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) - * Clearly, migrating tasks to offline CPUs is a fairly daft thing. - */ - WARN_ON_ONCE(!cpu_online(new_cpu)); -+ -+ WARN_ON_ONCE(is_migration_disabled(p)); - #endif - - trace_sched_migrate_task(p, new_cpu); -@@ -2124,6 +2555,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, + EXPORT_SYMBOL_GPL(migrate_disable); +@@ -1781,6 +1851,7 @@ void migrate_enable(void) + barrier(); + p->migration_disabled = 0; + this_rq()->nr_pinned--; ++ preempt_lazy_enable(); + preempt_enable(); + } + EXPORT_SYMBOL_GPL(migrate_enable); +@@ -2573,6 +2644,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, } #endif /* CONFIG_NUMA_BALANCING */ @@ -18995,7 +14501,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 /* * wait_task_inactive - wait for a thread to unschedule. * -@@ -2168,7 +2611,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) +@@ -2617,7 +2700,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * is actually now running somewhere else! 
*/ while (task_running(rq, p)) { @@ -19004,7 +14510,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 return 0; cpu_relax(); } -@@ -2183,7 +2626,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) +@@ -2632,7 +2715,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; @@ -19014,85 +14520,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); -@@ -2322,6 +2766,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) - } - fallthrough; - case possible: -+ /* -+ * XXX When called from select_task_rq() we only -+ * hold p->pi_lock and again violate locking order. -+ * -+ * More yuck to audit. -+ */ - do_set_cpus_allowed(p, cpu_possible_mask); - state = fail; - break; -@@ -2356,7 +2806,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) - { - lockdep_assert_held(&p->pi_lock); - -- if (p->nr_cpus_allowed > 1) -+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); - else - cpu = cpumask_any(p->cpus_ptr); -@@ -2379,6 +2829,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) - - void sched_set_stop_task(int cpu, struct task_struct *stop) - { -+ static struct lock_class_key stop_pi_lock; - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - -@@ -2394,6 +2845,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) - sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); - - stop->sched_class = &stop_sched_class; -+ -+ /* -+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to -+ * adjust the effective priority of a task. As a result, -+ * rt_mutex_setprio() can trigger (RT) balancing operations, -+ * which can then trigger wakeups of the stop thread to push -+ * around the current task. -+ * -+ * The stop task itself will never be part of the PI-chain, it -+ * never blocks, therefore that ->pi_lock recursion is safe. -+ * Tell lockdep about this by placing the stop->pi_lock in its -+ * own class. -+ */ -+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock); - } - - cpu_rq(cpu)->stop = stop; -@@ -2410,13 +2875,25 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) - #else - - static inline int __set_cpus_allowed_ptr(struct task_struct *p, -- const struct cpumask *new_mask, bool check) -+ const struct cpumask *new_mask, -+ u32 flags) - { - return set_cpus_allowed_ptr(p, new_mask); - } - - #endif /* CONFIG_SMP */ - -+#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT) -+ -+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } -+ -+static inline bool rq_has_pinned_tasks(struct rq *rq) -+{ -+ return false; -+} -+ -+#endif -+ - static void - ttwu_stat(struct task_struct *p, int cpu, int wake_flags) - { -@@ -2828,7 +3305,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +@@ -3318,7 +3402,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) int cpu, success = 0; preempt_disable(); @@ -19101,7 +14529,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) * == smp_processor_id()'. 
Together this means we can special -@@ -2858,8 +3335,26 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +@@ -3348,8 +3432,26 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); @@ -19118,157 +14546,47 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 + success = 1; + } + } - goto unlock; -+ } -+ /* -+ * If this is a regular wakeup, then we can unconditionally -+ * clear the saved state of a "lock sleeper". -+ */ -+ if (!(wake_flags & WF_LOCK_SLEEPER)) -+ p->saved_state = TASK_RUNNING; - - trace_sched_waking(p); - -@@ -3041,13 +3536,25 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t - * - * Return: 1 if the process was woken up, 0 if it was already running. - * -- * This function executes a full memory barrier before accessing the task state. -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+/** -+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" -+ * @p: The process to be woken up. -+ * -+ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate -+ * the nature of the wakeup. - */ --int wake_up_process(struct task_struct *p) -+int wake_up_lock_sleeper(struct task_struct *p) - { -- return try_to_wake_up(p, TASK_NORMAL, 0); -+ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER); - } --EXPORT_SYMBOL(wake_up_process); - - int wake_up_state(struct task_struct *p, unsigned int state) - { -@@ -3295,6 +3802,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - p->on_cpu = 0; - #endif - init_task_preempt_count(p); -+#ifdef CONFIG_HAVE_PREEMPT_LAZY -+ task_thread_info(p)->preempt_lazy_count = 0; -+#endif - #ifdef CONFIG_SMP - plist_node_init(&p->pushable_tasks, MAX_PRIO); - RB_CLEAR_NODE(&p->pushable_dl_tasks); -@@ -3489,6 +3999,90 @@ static inline void finish_task(struct task_struct *prev) - #endif - } - -+#ifdef CONFIG_SMP -+ -+static void do_balance_callbacks(struct rq *rq, struct callback_head *head) -+{ -+ void (*func)(struct rq *rq); -+ struct callback_head *next; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ while (head) { -+ func = (void (*)(struct rq *))head->func; -+ next = head->next; -+ head->next = NULL; -+ head = next; -+ -+ func(rq); -+ } -+} -+ -+static inline struct callback_head *splice_balance_callbacks(struct rq *rq) -+{ -+ struct callback_head *head = rq->balance_callback; -+ -+ lockdep_assert_held(&rq->lock); -+ if (head) { -+ rq->balance_callback = NULL; -+ rq->balance_flags &= ~BALANCE_WORK; -+ } -+ -+ return head; -+} -+ -+static void __balance_callbacks(struct rq *rq) -+{ -+ do_balance_callbacks(rq, splice_balance_callbacks(rq)); -+} -+ -+static inline void balance_callbacks(struct rq *rq, struct callback_head *head) -+{ -+ unsigned long flags; -+ -+ if (unlikely(head)) { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ do_balance_callbacks(rq, head); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+} -+ -+static void balance_push(struct rq *rq); -+ -+static inline void balance_switch(struct rq *rq) -+{ -+ if (likely(!rq->balance_flags)) -+ return; -+ -+ if (rq->balance_flags & BALANCE_PUSH) { -+ balance_push(rq); -+ return; -+ } -+ -+ __balance_callbacks(rq); -+} -+ -+#else -+ -+static inline void __balance_callbacks(struct rq *rq) -+{ -+} -+ -+static inline 
struct callback_head *splice_balance_callbacks(struct rq *rq) -+{ -+ return NULL; -+} -+ -+static inline void balance_callbacks(struct rq *rq, struct callback_head *head) -+{ -+} -+ -+static inline void balance_switch(struct rq *rq) + goto unlock; ++ } ++ /* ++ * If this is a regular wakeup, then we can unconditionally ++ * clear the saved state of a "lock sleeper". ++ */ ++ if (!(wake_flags & WF_LOCK_SLEEPER)) ++ p->saved_state = TASK_RUNNING; + + trace_sched_waking(p); + +@@ -3539,6 +3641,18 @@ int wake_up_process(struct task_struct *p) + } + EXPORT_SYMBOL(wake_up_process); + ++/** ++ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" ++ * @p: The process to be woken up. ++ * ++ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate ++ * the nature of the wakeup. ++ */ ++int wake_up_lock_sleeper(struct task_struct *p) +{ ++ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER); +} + -+#endif -+ - static inline void - prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) + int wake_up_state(struct task_struct *p, unsigned int state) { -@@ -3514,6 +4108,7 @@ static inline void finish_lock_switch(struct rq *rq) - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ balance_switch(rq); - raw_spin_unlock_irq(&rq->lock); - } - -@@ -3631,23 +4226,18 @@ static struct rq *finish_task_switch(struct task_struct *prev) + return try_to_wake_up(p, state, 0); +@@ -3786,6 +3900,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->on_cpu = 0; + #endif + init_task_preempt_count(p); ++#ifdef CONFIG_HAVE_PREEMPT_LAZY ++ task_thread_info(p)->preempt_lazy_count = 0; ++#endif + #ifdef CONFIG_SMP + plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +@@ -4218,23 +4335,18 @@ static struct rq *finish_task_switch(struct task_struct *prev) * provided by mmdrop(), * - a sync_core for SYNC_CORE. */ @@ -19297,59 +14615,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 put_task_struct_rcu_user(prev); } -@@ -3655,43 +4245,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) - return rq; - } - --#ifdef CONFIG_SMP -- --/* rq->lock is NOT held, but preemption is disabled */ --static void __balance_callback(struct rq *rq) --{ -- struct callback_head *head, *next; -- void (*func)(struct rq *rq); -- unsigned long flags; -- -- raw_spin_lock_irqsave(&rq->lock, flags); -- head = rq->balance_callback; -- rq->balance_callback = NULL; -- while (head) { -- func = (void (*)(struct rq *))head->func; -- next = head->next; -- head->next = NULL; -- head = next; -- -- func(rq); -- } -- raw_spin_unlock_irqrestore(&rq->lock, flags); --} -- --static inline void balance_callback(struct rq *rq) --{ -- if (unlikely(rq->balance_callback)) -- __balance_callback(rq); --} -- --#else -- --static inline void balance_callback(struct rq *rq) --{ --} -- --#endif -- - /** - * schedule_tail - first thing a freshly forked thread must call. - * @prev: the thread we just switched away from. -@@ -3711,7 +4264,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) - */ - - rq = finish_task_switch(prev); -- balance_callback(rq); - preempt_enable(); - - if (current->set_child_tid) -@@ -4406,7 +4958,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +@@ -4956,7 +5068,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! 
*/ @@ -19358,7 +14624,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 { struct task_struct *prev, *next; unsigned long *switch_count; -@@ -4459,7 +5011,7 @@ static void __sched notrace __schedule(bool preempt) +@@ -5009,7 +5121,7 @@ static void __sched notrace __schedule(bool preempt) * - ptrace_{,un}freeze_traced() can change ->state underneath us. */ prev_state = prev->state; @@ -19367,7 +14633,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { -@@ -4494,6 +5046,7 @@ static void __sched notrace __schedule(bool preempt) +@@ -5044,6 +5156,7 @@ static void __sched notrace __schedule(bool preempt) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); @@ -19375,30 +14641,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 clear_preempt_need_resched(); if (likely(prev != next)) { -@@ -4519,6 +5072,7 @@ static void __sched notrace __schedule(bool preempt) - */ - ++*switch_count; - -+ migrate_disable_switch(rq, prev); - psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - - trace_sched_switch(preempt, prev, next); -@@ -4527,10 +5081,11 @@ static void __sched notrace __schedule(bool preempt) - rq = context_switch(rq, prev, next, &rf); - } else { - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); -- rq_unlock_irq(rq, &rf); -- } - -- balance_callback(rq); -+ rq_unpin_lock(rq, &rf); -+ __balance_callbacks(rq); -+ raw_spin_unlock_irq(&rq->lock); -+ } - } - - void __noreturn do_task_dead(void) -@@ -4541,7 +5096,7 @@ void __noreturn do_task_dead(void) +@@ -5093,7 +5206,7 @@ void __noreturn do_task_dead(void) /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; @@ -19407,7 +14650,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 BUG(); /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -@@ -4571,9 +5126,6 @@ static inline void sched_submit_work(struct task_struct *tsk) +@@ -5126,9 +5239,6 @@ static inline void sched_submit_work(struct task_struct *tsk) preempt_enable_no_resched(); } @@ -19417,7 +14660,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. 
-@@ -4599,7 +5151,7 @@ asmlinkage __visible void __sched schedule(void) +@@ -5154,7 +5264,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); @@ -19426,7 +14669,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 sched_preempt_enable_no_resched(); } while (need_resched()); sched_update_worker(tsk); -@@ -4627,7 +5179,7 @@ void __sched schedule_idle(void) +@@ -5182,7 +5292,7 @@ void __sched schedule_idle(void) */ WARN_ON_ONCE(current->state); do { @@ -19435,7 +14678,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 } while (need_resched()); } -@@ -4680,7 +5232,7 @@ static void __sched notrace preempt_schedule_common(void) +@@ -5235,7 +5345,7 @@ static void __sched notrace preempt_schedule_common(void) */ preempt_disable_notrace(); preempt_latency_start(1); @@ -19444,7 +14687,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 preempt_latency_stop(1); preempt_enable_no_resched_notrace(); -@@ -4691,6 +5243,30 @@ static void __sched notrace preempt_schedule_common(void) +@@ -5246,6 +5356,30 @@ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } @@ -19475,7 +14718,7 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption -@@ -4704,12 +5280,26 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) +@@ -5259,12 +5393,26 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; @@ -19503,437 +14746,54 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 /** * preempt_schedule_notrace - preempt_schedule called by tracing * -@@ -4731,6 +5321,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) - if (likely(!preemptible())) - return; - -+ if (!preemptible_lazy()) -+ return; -+ - do { - /* - * Because the function tracer can trace preempt_count_sub() -@@ -4753,7 +5346,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) - * an infinite recursion. 
- */ - prev_ctx = exception_enter(); -- __schedule(true); -+ __schedule(true, false); - exception_exit(prev_ctx); - - preempt_latency_stop(1); -@@ -4782,7 +5375,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) - do { - preempt_disable(); - local_irq_enable(); -- __schedule(true); -+ __schedule(true, false); - local_irq_disable(); - sched_preempt_enable_no_resched(); - } while (need_resched()); -@@ -4938,9 +5531,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) - out_unlock: - /* Avoid rq from going away on us: */ - preempt_disable(); -- __task_rq_unlock(rq, &rf); - -- balance_callback(rq); -+ rq_unpin_lock(rq, &rf); -+ __balance_callbacks(rq); -+ raw_spin_unlock(&rq->lock); -+ - preempt_enable(); - } - #else -@@ -5214,6 +5809,7 @@ static int __sched_setscheduler(struct task_struct *p, - int retval, oldprio, oldpolicy = -1, queued, running; - int new_effective_prio, policy = attr->sched_policy; - const struct sched_class *prev_class; -+ struct callback_head *head; - struct rq_flags rf; - int reset_on_fork; - int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; -@@ -5452,6 +6048,7 @@ static int __sched_setscheduler(struct task_struct *p, - - /* Avoid rq from going away on us: */ - preempt_disable(); -+ head = splice_balance_callbacks(rq); - task_rq_unlock(rq, p, &rf); - - if (pi) { -@@ -5460,7 +6057,7 @@ static int __sched_setscheduler(struct task_struct *p, - } - - /* Run balance callbacks after we've adjusted the PI chain: */ -- balance_callback(rq); -+ balance_callbacks(rq, head); - preempt_enable(); - - return 0; -@@ -5955,7 +6552,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) - } - #endif - again: -- retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); -@@ -6538,7 +7135,7 @@ void init_idle(struct task_struct *idle, int cpu) - * - * And since this is boot we can forgo the serialization. - */ -- set_cpus_allowed_common(idle, cpumask_of(cpu)); -+ set_cpus_allowed_common(idle, cpumask_of(cpu), 0); - #endif - /* - * We're having a chicken and egg problem, even though we are -@@ -6565,7 +7162,9 @@ void init_idle(struct task_struct *idle, int cpu) - - /* Set the preempt count _outside_ the spinlocks! */ - init_idle_preempt_count(idle, cpu); -- -+#ifdef CONFIG_HAVE_PREEMPT_LAZY -+ task_thread_info(idle)->preempt_lazy_count = 0; -+#endif - /* - * The idle tasks have their own, simple scheduling class: - */ -@@ -6670,6 +7269,7 @@ void sched_setnuma(struct task_struct *p, int nid) - #endif /* CONFIG_NUMA_BALANCING */ - - #ifdef CONFIG_HOTPLUG_CPU -+ - /* - * Ensure that the idle task is using init_mm right before its CPU goes - * offline. -@@ -6689,119 +7289,126 @@ void idle_task_exit(void) - /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ - } - --/* -- * Since this CPU is going 'away' for a while, fold any nr_active delta -- * we might have. Assumes we're called after migrate_tasks() so that the -- * nr_active count is stable. We need to take the teardown thread which -- * is calling this into account, so we hand in adjust = 1 to the load -- * calculation. -- * -- * Also see the comment "Global load-average calculations". 
-- */ --static void calc_load_migrate(struct rq *rq) -+static int __balance_push_cpu_stop(void *arg) - { -- long delta = calc_load_fold_active(rq, 1); -- if (delta) -- atomic_long_add(delta, &calc_load_tasks); --} -+ struct task_struct *p = arg; -+ struct rq *rq = this_rq(); -+ struct rq_flags rf; -+ int cpu; - --static struct task_struct *__pick_migrate_task(struct rq *rq) --{ -- const struct sched_class *class; -- struct task_struct *next; -+ raw_spin_lock_irq(&p->pi_lock); -+ rq_lock(rq, &rf); - -- for_each_class(class) { -- next = class->pick_next_task(rq); -- if (next) { -- next->sched_class->put_prev_task(rq, next); -- return next; -- } -+ update_rq_clock(rq); -+ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) { -+ cpu = select_fallback_rq(rq->cpu, p); -+ rq = __migrate_task(rq, &rf, p, cpu); - } - -- /* The idle class should always have a runnable task */ -- BUG(); -+ rq_unlock(rq, &rf); -+ raw_spin_unlock_irq(&p->pi_lock); -+ -+ put_task_struct(p); -+ -+ return 0; - } - -+static DEFINE_PER_CPU(struct cpu_stop_work, push_work); -+ - /* -- * Migrate all tasks from the rq, sleeping tasks will be migrated by -- * try_to_wake_up()->select_task_rq(). -- * -- * Called with rq->lock held even though we'er in stop_machine() and -- * there's no concurrency possible, we hold the required locks anyway -- * because of lock validation efforts. -+ * Ensure we only run per-cpu kthreads once the CPU goes !active. - */ --static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) -+static void balance_push(struct rq *rq) - { -- struct rq *rq = dead_rq; -- struct task_struct *next, *stop = rq->stop; -- struct rq_flags orf = *rf; -- int dest_cpu; -+ struct task_struct *push_task = rq->curr; -+ -+ lockdep_assert_held(&rq->lock); -+ SCHED_WARN_ON(rq->cpu != smp_processor_id()); - - /* -- * Fudge the rq selection such that the below task selection loop -- * doesn't get stuck on the currently eligible stop task. -- * -- * We're currently inside stop_machine() and the rq is either stuck -- * in the stop_machine_cpu_stop() loop, or we're executing this code, -- * either way we should never end up calling schedule() until we're -- * done here. -+ * Both the cpu-hotplug and stop task are in this case and are -+ * required to complete the hotplug process. - */ -- rq->stop = NULL; -+ if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { -+ /* -+ * If this is the idle task on the outgoing CPU try to wake -+ * up the hotplug control thread which might wait for the -+ * last task to vanish. The rcuwait_active() check is -+ * accurate here because the waiter is pinned on this CPU -+ * and can't obviously be running in parallel. -+ * -+ * On RT kernels this also has to check whether there are -+ * pinned and scheduled out tasks on the runqueue. They -+ * need to leave the migrate disabled section first. -+ */ -+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) && -+ rcuwait_active(&rq->hotplug_wait)) { -+ raw_spin_unlock(&rq->lock); -+ rcuwait_wake_up(&rq->hotplug_wait); -+ raw_spin_lock(&rq->lock); -+ } -+ return; -+ } - -+ get_task_struct(push_task); - /* -- * put_prev_task() and pick_next_task() sched -- * class method both need to have an up-to-date -- * value of rq->clock[_task] -+ * Temporarily drop rq->lock such that we can wake-up the stop task. -+ * Both preemption and IRQs are still disabled. 
- */ -- update_rq_clock(rq); -+ raw_spin_unlock(&rq->lock); -+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, -+ this_cpu_ptr(&push_work)); -+ /* -+ * At this point need_resched() is true and we'll take the loop in -+ * schedule(). The next pick is obviously going to be the stop task -+ * which is_per_cpu_kthread() and will push this task away. -+ */ -+ raw_spin_lock(&rq->lock); -+} - -- for (;;) { -- /* -- * There's this thread running, bail when that's the only -- * remaining thread: -- */ -- if (rq->nr_running == 1) -- break; -+static void balance_push_set(int cpu, bool on) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+ rq_lock_irqsave(rq, &rf); -+ if (on) -+ rq->balance_flags |= BALANCE_PUSH; -+ else -+ rq->balance_flags &= ~BALANCE_PUSH; -+ rq_unlock_irqrestore(rq, &rf); -+} - -- next = __pick_migrate_task(rq); -+/* -+ * Invoked from a CPUs hotplug control thread after the CPU has been marked -+ * inactive. All tasks which are not per CPU kernel threads are either -+ * pushed off this CPU now via balance_push() or placed on a different CPU -+ * during wakeup. Wait until the CPU is quiescent. -+ */ -+static void balance_hotplug_wait(void) -+{ -+ struct rq *rq = this_rq(); - -- /* -- * Rules for changing task_struct::cpus_mask are holding -- * both pi_lock and rq->lock, such that holding either -- * stabilizes the mask. -- * -- * Drop rq->lock is not quite as disastrous as it usually is -- * because !cpu_active at this point, which means load-balance -- * will not interfere. Also, stop-machine. -- */ -- rq_unlock(rq, rf); -- raw_spin_lock(&next->pi_lock); -- rq_relock(rq, rf); -+ rcuwait_wait_event(&rq->hotplug_wait, -+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq), -+ TASK_UNINTERRUPTIBLE); -+} - -- /* -- * Since we're inside stop-machine, _nothing_ should have -- * changed the task, WARN if weird stuff happened, because in -- * that case the above rq->lock drop is a fail too. -- */ -- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { -- raw_spin_unlock(&next->pi_lock); -- continue; -- } -+#else - -- /* Find suitable destination for @next, with force if needed. */ -- dest_cpu = select_fallback_rq(dead_rq->cpu, next); -- rq = __migrate_task(rq, rf, next, dest_cpu); -- if (rq != dead_rq) { -- rq_unlock(rq, rf); -- rq = dead_rq; -- *rf = orf; -- rq_relock(rq, rf); -- } -- raw_spin_unlock(&next->pi_lock); -- } -+static inline void balance_push(struct rq *rq) -+{ -+} -+ -+static inline void balance_push_set(int cpu, bool on) -+{ -+} - -- rq->stop = stop; -+static inline void balance_hotplug_wait(void) -+{ - } -+ - #endif /* CONFIG_HOTPLUG_CPU */ - - void set_rq_online(struct rq *rq) -@@ -6887,6 +7494,8 @@ int sched_cpu_activate(unsigned int cpu) - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - -+ balance_push_set(cpu, false); -+ - #ifdef CONFIG_SCHED_SMT - /* - * When going up, increment the number of cores with SMT present. 
-@@ -6922,6 +7531,8 @@ int sched_cpu_activate(unsigned int cpu) - - int sched_cpu_deactivate(unsigned int cpu) - { -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; - int ret; - - set_cpu_active(cpu, false); -@@ -6934,6 +7545,16 @@ int sched_cpu_deactivate(unsigned int cpu) - */ - synchronize_rcu(); - -+ balance_push_set(cpu, true); -+ -+ rq_lock_irqsave(rq, &rf); -+ if (rq->rd) { -+ update_rq_clock(rq); -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_offline(rq); -+ } -+ rq_unlock_irqrestore(rq, &rf); -+ - #ifdef CONFIG_SCHED_SMT - /* - * When going down, decrement the number of cores with SMT present. -@@ -6947,6 +7568,7 @@ int sched_cpu_deactivate(unsigned int cpu) - - ret = cpuset_cpu_inactive(cpu); - if (ret) { -+ balance_push_set(cpu, false); - set_cpu_active(cpu, true); - return ret; - } -@@ -6970,6 +7592,41 @@ int sched_cpu_starting(unsigned int cpu) - } - - #ifdef CONFIG_HOTPLUG_CPU -+ -+/* -+ * Invoked immediately before the stopper thread is invoked to bring the -+ * CPU down completely. At this point all per CPU kthreads except the -+ * hotplug thread (current) and the stopper thread (inactive) have been -+ * either parked or have been unbound from the outgoing CPU. Ensure that -+ * any of those which might be on the way out are gone. -+ * -+ * If after this point a bound task is being woken on this CPU then the -+ * responsible hotplug callback has failed to do it's job. -+ * sched_cpu_dying() will catch it with the appropriate fireworks. -+ */ -+int sched_cpu_wait_empty(unsigned int cpu) -+{ -+ balance_hotplug_wait(); -+ return 0; -+} -+ -+/* -+ * Since this CPU is going 'away' for a while, fold any nr_active delta we -+ * might have. Called from the CPU stopper task after ensuring that the -+ * stopper is the last running task on the CPU, so nr_active count is -+ * stable. We need to take the teardown thread which is calling this into -+ * account, so we hand in adjust = 1 to the load calculation. -+ * -+ * Also see the comment "Global load-average calculations". -+ */ -+static void calc_load_migrate(struct rq *rq) -+{ -+ long delta = calc_load_fold_active(rq, 1); -+ -+ if (delta) -+ atomic_long_add(delta, &calc_load_tasks); -+} -+ - int sched_cpu_dying(unsigned int cpu) - { - struct rq *rq = cpu_rq(cpu); -@@ -6979,12 +7636,7 @@ int sched_cpu_dying(unsigned int cpu) - sched_tick_stop(cpu); +@@ -5286,6 +5434,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) + if (likely(!preemptible())) + return; - rq_lock_irqsave(rq, &rf); -- if (rq->rd) { -- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -- set_rq_offline(rq); -- } -- migrate_tasks(rq, &rf); -- BUG_ON(rq->nr_running != 1); -+ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); - rq_unlock_irqrestore(rq, &rf); ++ if (!preemptible_lazy()) ++ return; ++ + do { + /* + * Because the function tracer can trace preempt_count_sub() +@@ -5308,7 +5459,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) + * an infinite recursion. 
+ */ + prev_ctx = exception_enter(); +- __schedule(true); ++ __schedule(true, false); + exception_exit(prev_ctx); - calc_load_migrate(rq); -@@ -7191,6 +7843,9 @@ void __init sched_init(void) + preempt_latency_stop(1); +@@ -5337,7 +5488,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) + do { + preempt_disable(); + local_irq_enable(); +- __schedule(true); ++ __schedule(true, false); + local_irq_disable(); + sched_preempt_enable_no_resched(); + } while (need_resched()); +@@ -7122,7 +7273,9 @@ void init_idle(struct task_struct *idle, int cpu) - rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); - #endif -+#ifdef CONFIG_HOTPLUG_CPU -+ rcuwait_init(&rq->hotplug_wait); + /* Set the preempt count _outside_ the spinlocks! */ + init_idle_preempt_count(idle, cpu); +- ++#ifdef CONFIG_HAVE_PREEMPT_LAZY ++ task_thread_info(idle)->preempt_lazy_count = 0; +#endif - #endif /* CONFIG_SMP */ - hrtick_rq_init(rq); - atomic_set(&rq->nr_iowait, 0); -@@ -7231,7 +7886,7 @@ void __init sched_init(void) + /* + * The idle tasks have their own, simple scheduling class: + */ +@@ -7227,6 +7380,7 @@ void sched_setnuma(struct task_struct *p, int nid) + #endif /* CONFIG_NUMA_BALANCING */ + + #ifdef CONFIG_HOTPLUG_CPU ++ + /* + * Ensure that the idle task is using init_mm right before its CPU goes + * offline. +@@ -7898,7 +8052,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { @@ -19942,191 +14802,33 @@ index 2d95dc3f4644..2d54f1e7ef86 100644 return (nested == preempt_offset); } -diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c -index 8cb06c8c7eb1..ceb03d76c0cc 100644 ---- a/kernel/sched/cpudeadline.c -+++ b/kernel/sched/cpudeadline.c -@@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, - const struct sched_dl_entity *dl_se = &p->dl; - - if (later_mask && -- cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { -+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { - unsigned long cap, max_cap = 0; - int cpu, max_cpu = -1; - -@@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, - - WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); - -- if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && -+ if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && - dl_time_before(dl_se->deadline, cp->elements[0].dl)) { - if (later_mask) - cpumask_set_cpu(best_cpu, later_mask); -diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c -index 0033731a0797..11c4df2010de 100644 ---- a/kernel/sched/cpupri.c -+++ b/kernel/sched/cpupri.c -@@ -73,11 +73,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, - if (skip) - return 0; - -- if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) -+ if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) - return 0; - - if (lowest_mask) { -- cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); -+ cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); - - /* - * We have to ensure that we have at least one bit -diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c -index 3862a28cd05d..6df71d487ed0 100644 ---- a/kernel/sched/deadline.c -+++ b/kernel/sched/deadline.c -@@ -543,7 +543,7 @@ static int push_dl_task(struct rq *rq); - - static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) - { -- return dl_task(prev); -+ return rq->online && dl_task(prev); - } - - static DEFINE_PER_CPU(struct callback_head, dl_push_head); -@@ -1888,7 +1888,7 @@ static void task_fork_dl(struct task_struct *p) - static int 
pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) - { - if (!task_running(rq, p) && -- cpumask_test_cpu(cpu, p->cpus_ptr)) -+ cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - return 0; - } -@@ -1978,8 +1978,8 @@ static int find_later_rq(struct task_struct *task) - return this_cpu; - } - -- best_cpu = cpumask_first_and(later_mask, -- sched_domain_span(sd)); -+ best_cpu = cpumask_any_and_distribute(later_mask, -+ sched_domain_span(sd)); - /* - * Last chance: if a CPU being in both later_mask - * and current sd span is valid, that becomes our -@@ -2001,7 +2001,7 @@ static int find_later_rq(struct task_struct *task) - if (this_cpu != -1) - return this_cpu; - -- cpu = cpumask_any(later_mask); -+ cpu = cpumask_any_distribute(later_mask); - if (cpu < nr_cpu_ids) - return cpu; - -@@ -2038,7 +2038,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) - /* Retry if something changed. */ - if (double_lock_balance(rq, later_rq)) { - if (unlikely(task_rq(task) != rq || -- !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || -+ !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || - task_running(rq, task) || - !dl_task(task) || - !task_on_rq_queued(task))) { -@@ -2105,6 +2105,9 @@ static int push_dl_task(struct rq *rq) - return 0; - - retry: -+ if (is_migration_disabled(next_task)) -+ return 0; -+ - if (WARN_ON(next_task == rq->curr)) - return 0; +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 5f611658eeab..2c36a5fad589 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -60,7 +60,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; + irqtime->irq_start_time += delta; +- pc = preempt_count() - offset; ++ pc = irq_count() - offset; -@@ -2182,7 +2185,7 @@ static void push_dl_tasks(struct rq *rq) - static void pull_dl_task(struct rq *this_rq) - { - int this_cpu = this_rq->cpu, cpu; -- struct task_struct *p; -+ struct task_struct *p, *push_task; - bool resched = false; - struct rq *src_rq; - u64 dmin = LONG_MAX; -@@ -2212,6 +2215,7 @@ static void pull_dl_task(struct rq *this_rq) - continue; - - /* Might drop this_rq->lock */ -+ push_task = NULL; - double_lock_balance(this_rq, src_rq); - - /* -@@ -2243,17 +2247,28 @@ static void pull_dl_task(struct rq *this_rq) - src_rq->curr->dl.deadline)) - goto skip; - -- resched = true; -- -- deactivate_task(src_rq, p, 0); -- set_task_cpu(p, this_cpu); -- activate_task(this_rq, p, 0); -- dmin = p->dl.deadline; -+ if (is_migration_disabled(p)) { -+ trace_sched_migrate_pull_tp(p); -+ push_task = get_push_task(src_rq); -+ } else { -+ deactivate_task(src_rq, p, 0); -+ set_task_cpu(p, this_cpu); -+ activate_task(this_rq, p, 0); -+ dmin = p->dl.deadline; -+ resched = true; -+ } - - /* Is there any other task even earlier? */ - } - skip: - double_unlock_balance(this_rq, src_rq); -+ -+ if (push_task) { -+ raw_spin_unlock(&this_rq->lock); -+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, -+ push_task, &src_rq->push_work); -+ raw_spin_lock(&this_rq->lock); -+ } - } - - if (resched) -@@ -2277,7 +2292,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) - } + /* + * We do not account for softirq time from ksoftirqd here. 
+@@ -421,7 +421,7 @@ void vtime_task_switch(struct task_struct *prev) - static void set_cpus_allowed_dl(struct task_struct *p, -- const struct cpumask *new_mask) -+ const struct cpumask *new_mask, -+ u32 flags) + void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { - struct root_domain *src_rd; - struct rq *rq; -@@ -2306,7 +2322,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, - raw_spin_unlock(&src_dl_b->lock); - } +- unsigned int pc = preempt_count() - offset; ++ unsigned int pc = irq_count() - offset; -- set_cpus_allowed_common(p, new_mask); -+ set_cpus_allowed_common(p, new_mask, flags); - } - - /* Assumes rq->lock is held */ -@@ -2499,6 +2515,7 @@ const struct sched_class dl_sched_class - .rq_online = rq_online_dl, - .rq_offline = rq_offline_dl, - .task_woken = task_woken_dl, -+ .find_lock_rq = find_lock_later_rq, - #endif - - .task_tick = task_tick_dl, + if (pc & HARDIRQ_OFFSET) { + vtime_account_hardirq(tsk); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 1a68a0536add..d31aab136644 100644 +index 04a3ce20da67..2efe2b441a7d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c -@@ -4357,7 +4357,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +@@ -4372,7 +4372,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { @@ -20135,7 +14837,7 @@ index 1a68a0536add..d31aab136644 100644 /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. -@@ -4381,7 +4381,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +@@ -4396,7 +4396,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) @@ -20144,7 +14846,7 @@ index 1a68a0536add..d31aab136644 100644 } static void -@@ -4524,7 +4524,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +@@ -4539,7 +4539,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { @@ -20153,7 +14855,7 @@ index 1a68a0536add..d31aab136644 100644 return; } /* -@@ -4661,7 +4661,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +@@ -4676,7 +4676,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) @@ -20162,7 +14864,7 @@ index 1a68a0536add..d31aab136644 100644 } static __always_inline -@@ -5396,7 +5396,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +@@ -5420,7 +5420,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) @@ -20171,7 +14873,7 @@ index 1a68a0536add..d31aab136644 100644 return; } hrtick_start(rq, delta); -@@ -6953,7 +6953,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7004,7 +7004,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: @@ -20180,7 +14882,7 @@ index 1a68a0536add..d31aab136644 100644 /* * Only set the backward buddy when the current task is still * on the rq. 
This can happen when a wakeup gets interleaved -@@ -10694,7 +10694,7 @@ static void task_fork_fair(struct task_struct *p) +@@ -10781,7 +10781,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); @@ -20189,7 +14891,7 @@ index 1a68a0536add..d31aab136644 100644 } se->vruntime -= cfs_rq->min_vruntime; -@@ -10721,7 +10721,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +@@ -10808,7 +10808,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) @@ -20199,7 +14901,7 @@ index 1a68a0536add..d31aab136644 100644 check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 7481cd96f391..862e8b3c7d28 100644 +index 68d369cba9e4..5a2e27297126 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -45,11 +45,19 @@ SCHED_FEAT(DOUBLE_TICK, false) @@ -20222,348 +14924,19 @@ index 7481cd96f391..862e8b3c7d28 100644 /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. -diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c -index f215eea6a966..03f7b397716d 100644 ---- a/kernel/sched/rt.c -+++ b/kernel/sched/rt.c -@@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq); - static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) - { - /* Try to pull RT tasks here if we lower this rq's prio */ -- return rq->rt.highest_prio.curr > prev->prio; -+ return rq->online && rq->rt.highest_prio.curr > prev->prio; - } - - static inline int rt_overloaded(struct rq *rq) -@@ -1658,7 +1658,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) - static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) - { - if (!task_running(rq, p) && -- cpumask_test_cpu(cpu, p->cpus_ptr)) -+ cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - - return 0; -@@ -1752,8 +1752,8 @@ static int find_lowest_rq(struct task_struct *task) - return this_cpu; - } - -- best_cpu = cpumask_first_and(lowest_mask, -- sched_domain_span(sd)); -+ best_cpu = cpumask_any_and_distribute(lowest_mask, -+ sched_domain_span(sd)); - if (best_cpu < nr_cpu_ids) { - rcu_read_unlock(); - return best_cpu; -@@ -1770,7 +1770,7 @@ static int find_lowest_rq(struct task_struct *task) - if (this_cpu != -1) - return this_cpu; - -- cpu = cpumask_any(lowest_mask); -+ cpu = cpumask_any_distribute(lowest_mask); - if (cpu < nr_cpu_ids) - return cpu; - -@@ -1811,7 +1811,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) - * Also make sure that it wasn't scheduled on its rq. - */ - if (unlikely(task_rq(task) != rq || -- !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || -+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || - task_running(rq, task) || - !rt_task(task) || - !task_on_rq_queued(task))) { -@@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) - * running task can migrate over to a CPU that is running a task - * of lesser priority. 
- */ --static int push_rt_task(struct rq *rq) -+static int push_rt_task(struct rq *rq, bool pull) - { - struct task_struct *next_task; - struct rq *lowest_rq; -@@ -1873,6 +1873,39 @@ static int push_rt_task(struct rq *rq) - return 0; - - retry: -+ if (is_migration_disabled(next_task)) { -+ struct task_struct *push_task = NULL; -+ int cpu; -+ -+ if (!pull) -+ return 0; -+ -+ trace_sched_migrate_pull_tp(next_task); -+ -+ if (rq->push_busy) -+ return 0; -+ -+ cpu = find_lowest_rq(rq->curr); -+ if (cpu == -1 || cpu == rq->cpu) -+ return 0; -+ -+ /* -+ * Given we found a CPU with lower priority than @next_task, -+ * therefore it should be running. However we cannot migrate it -+ * to this other CPU, instead attempt to push the current -+ * running task on this CPU away. -+ */ -+ push_task = get_push_task(rq); -+ if (push_task) { -+ raw_spin_unlock(&rq->lock); -+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop, -+ push_task, &rq->push_work); -+ raw_spin_lock(&rq->lock); -+ } -+ -+ return 0; -+ } -+ - if (WARN_ON(next_task == rq->curr)) - return 0; - -@@ -1927,12 +1960,10 @@ static int push_rt_task(struct rq *rq) - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); -- ret = 1; -- - resched_curr(lowest_rq); -+ ret = 1; - - double_unlock_balance(rq, lowest_rq); -- - out: - put_task_struct(next_task); - -@@ -1942,7 +1973,7 @@ static int push_rt_task(struct rq *rq) - static void push_rt_tasks(struct rq *rq) - { - /* push_rt_task will return true if it moved an RT */ -- while (push_rt_task(rq)) -+ while (push_rt_task(rq, false)) - ; - } - -@@ -2095,7 +2126,8 @@ void rto_push_irq_work_func(struct irq_work *work) - */ - if (has_pushable_tasks(rq)) { - raw_spin_lock(&rq->lock); -- push_rt_tasks(rq); -+ while (push_rt_task(rq, true)) -+ ; - raw_spin_unlock(&rq->lock); - } - -@@ -2120,7 +2152,7 @@ static void pull_rt_task(struct rq *this_rq) - { - int this_cpu = this_rq->cpu, cpu; - bool resched = false; -- struct task_struct *p; -+ struct task_struct *p, *push_task; - struct rq *src_rq; - int rt_overload_count = rt_overloaded(this_rq); - -@@ -2167,6 +2199,7 @@ static void pull_rt_task(struct rq *this_rq) - * double_lock_balance, and another CPU could - * alter this_rq - */ -+ push_task = NULL; - double_lock_balance(this_rq, src_rq); - - /* -@@ -2194,11 +2227,15 @@ static void pull_rt_task(struct rq *this_rq) - if (p->prio < src_rq->curr->prio) - goto skip; - -- resched = true; -- -- deactivate_task(src_rq, p, 0); -- set_task_cpu(p, this_cpu); -- activate_task(this_rq, p, 0); -+ if (is_migration_disabled(p)) { -+ trace_sched_migrate_pull_tp(p); -+ push_task = get_push_task(src_rq); -+ } else { -+ deactivate_task(src_rq, p, 0); -+ set_task_cpu(p, this_cpu); -+ activate_task(this_rq, p, 0); -+ resched = true; -+ } - /* - * We continue with the search, just in - * case there's an even higher prio task -@@ -2208,6 +2245,13 @@ static void pull_rt_task(struct rq *this_rq) - } - skip: - double_unlock_balance(this_rq, src_rq); -+ -+ if (push_task) { -+ raw_spin_unlock(&this_rq->lock); -+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, -+ push_task, &src_rq->push_work); -+ raw_spin_lock(&this_rq->lock); -+ } - } - - if (resched) -@@ -2449,6 +2493,7 @@ const struct sched_class rt_sched_class - .rq_offline = rq_offline_rt, - .task_woken = task_woken_rt, - .switched_from = switched_from_rt, -+ .find_lock_rq = find_lock_lowest_rq, - #endif - - .task_tick = task_tick_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 
28709f6b0975..19847e4ae132 100644 +index bb09988451a0..b783aaf10cba 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -973,6 +973,7 @@ struct rq { - unsigned long cpu_capacity_orig; - - struct callback_head *balance_callback; -+ unsigned char balance_flags; - - unsigned char nohz_idle_balance; - unsigned char idle_balance; -@@ -1003,6 +1004,10 @@ struct rq { - - /* This is used to determine avg_idle's max value */ - u64 max_idle_balance_cost; -+ -+#ifdef CONFIG_HOTPLUG_CPU -+ struct rcuwait hotplug_wait; -+#endif - #endif /* CONFIG_SMP */ - - #ifdef CONFIG_IRQ_TIME_ACCOUNTING -@@ -1048,6 +1053,12 @@ struct rq { - /* Must be inspected within a rcu lock section */ - struct cpuidle_state *idle_state; - #endif -+ -+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) -+ unsigned int nr_pinned; -+#endif -+ unsigned int push_busy; -+ struct cpu_stop_work push_work; - }; - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -1075,6 +1086,16 @@ static inline int cpu_of(struct rq *rq) - #endif - } - -+#define MDF_PUSH 0x01 -+ -+static inline bool is_migration_disabled(struct task_struct *p) -+{ -+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) -+ return p->migration_disabled; -+#else -+ return false; -+#endif -+} - - #ifdef CONFIG_SCHED_SMT - extern void __update_idle_core(struct rq *rq); -@@ -1221,6 +1242,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) - rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); - rf->clock_update_flags = 0; - #endif -+#ifdef CONFIG_SMP -+ SCHED_WARN_ON(rq->balance_callback); -+#endif - } - - static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) -@@ -1382,6 +1406,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) - - #ifdef CONFIG_SMP - -+#define BALANCE_WORK 0x01 -+#define BALANCE_PUSH 0x02 -+ - static inline void - queue_balance_callback(struct rq *rq, - struct callback_head *head, -@@ -1389,12 +1416,13 @@ queue_balance_callback(struct rq *rq, - { - lockdep_assert_held(&rq->lock); +@@ -1750,6 +1750,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) + #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ + #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ + #define WF_ON_CPU 0x40 /* Wakee is on_cpu */ ++#define WF_LOCK_SLEEPER 0x80 /* Wakeup spinlock "sleeper" */ -- if (unlikely(head->next)) -+ if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) - return; - - head->func = (void (*)(struct callback_head *))func; - head->next = rq->balance_callback; - rq->balance_callback = head; -+ rq->balance_flags |= BALANCE_WORK; - } - - #define rcu_dereference_check_sched_domain(p) \ -@@ -1714,6 +1742,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) - #define WF_FORK 0x02 /* Child wakeup after fork */ - #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ - #define WF_ON_CPU 0x08 /* Wakee is on_cpu */ -+#define WF_LOCK_SLEEPER 0x10 /* Wakeup spinlock "sleeper" */ - - /* - * To aid in avoiding the subversion of "niceness" due to uneven distribution -@@ -1795,10 +1824,13 @@ struct sched_class { - void (*task_woken)(struct rq *this_rq, struct task_struct *task); - - void (*set_cpus_allowed)(struct task_struct *p, -- const struct cpumask *newmask); -+ const struct cpumask *newmask, -+ u32 flags); - - void (*rq_online)(struct rq *rq); - void (*rq_offline)(struct rq *rq); -+ -+ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); - #endif - - void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); -@@ -1882,13 
+1914,35 @@ static inline bool sched_fair_runnable(struct rq *rq) - extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - extern struct task_struct *pick_next_task_idle(struct rq *rq); - -+#define SCA_CHECK 0x01 -+#define SCA_MIGRATE_DISABLE 0x02 -+#define SCA_MIGRATE_ENABLE 0x04 -+ #ifdef CONFIG_SMP - - extern void update_group_capacity(struct sched_domain *sd, int cpu); - - extern void trigger_load_balance(struct rq *rq); - --extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); -+ -+static inline struct task_struct *get_push_task(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (rq->push_busy) -+ return NULL; -+ -+ if (p->nr_cpus_allowed == 1) -+ return NULL; -+ -+ rq->push_busy = true; -+ return get_task_struct(p); -+} -+ -+extern int push_cpu_stop(void *arg); - - #endif - -@@ -1932,6 +1986,15 @@ extern void reweight_task(struct task_struct *p, int prio); + static_assert(WF_EXEC == SD_BALANCE_EXEC); +@@ -2013,6 +2014,15 @@ extern void reweight_task(struct task_struct *p, int prio); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -20592,19 +14965,21 @@ index e1c655f928c7..f230b1ac7f91 100644 list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 1bd7e3af904f..04ead8eeb07c 100644 +index 5d3675c7a76b..8c663da4ceb9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c -@@ -500,6 +500,7 @@ static int init_rootdomain(struct root_domain *rd) +@@ -526,7 +526,8 @@ static int init_rootdomain(struct root_domain *rd) + #ifdef HAVE_RT_PUSH_IPI rd->rto_cpu = -1; raw_spin_lock_init(&rd->rto_lock); - init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); -+ atomic_or(IRQ_WORK_HARD_IRQ, &rd->rto_push_work.flags); +- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); ++// init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); ++ rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif - init_dl_bw(&rd->dl_bw); + rd->visit_gen = 0; diff --git a/kernel/signal.c b/kernel/signal.c -index a38b3edc6851..c3d7abd1f6ae 100644 +index 5ad8566534e7..e40ed99a62a1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -20,6 +20,7 @@ @@ -20615,24 +14990,24 @@ index a38b3edc6851..c3d7abd1f6ae 100644 #include <linux/file.h> #include <linux/fs.h> #include <linux/proc_fs.h> -@@ -403,13 +404,30 @@ void task_join_group_stop(struct task_struct *task) - } +@@ -404,13 +405,30 @@ void task_join_group_stop(struct task_struct *task) + task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } -+static inline struct sigqueue *get_task_cache(struct task_struct *t) ++static struct sigqueue *sigqueue_from_cache(struct task_struct *t) +{ + struct sigqueue *q = t->sigqueue_cache; + -+ if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) -+ return NULL; -+ return q; ++ if (q && cmpxchg(&t->sigqueue_cache, q, NULL) == q) ++ return q; ++ return NULL; +} + -+static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) ++static bool sigqueue_add_cache(struct task_struct *t, struct sigqueue *q) +{ -+ if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) -+ return 0; -+ return 1; ++ if (!t->sigqueue_cache && cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) ++ return true; ++ return false; +} + /* @@ -20643,23 +15018,23 @@ index 
a38b3edc6851..c3d7abd1f6ae 100644 static struct sigqueue * -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, -+ int override_rlimit, int fromslab) ++ int override_rlimit, bool fromslab) { struct sigqueue *q = NULL; struct user_struct *user; -@@ -431,7 +449,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi +@@ -432,7 +450,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi rcu_read_unlock(); if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { - q = kmem_cache_alloc(sigqueue_cachep, flags); + if (!fromslab) -+ q = get_task_cache(t); ++ q = sigqueue_from_cache(t); + if (!q) + q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } -@@ -448,6 +469,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi +@@ -449,6 +470,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi return q; } @@ -20667,17 +15042,17 @@ index a38b3edc6851..c3d7abd1f6ae 100644 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, + int override_rlimit) +{ -+ return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0); ++ return __sigqueue_do_alloc(sig, t, flags, override_rlimit, false); +} + static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) -@@ -457,6 +485,21 @@ static void __sigqueue_free(struct sigqueue *q) +@@ -458,6 +486,20 @@ static void __sigqueue_free(struct sigqueue *q) kmem_cache_free(sigqueue_cachep, q); } -+static void sigqueue_free_current(struct sigqueue *q) ++static void __sigqueue_cache_or_free(struct sigqueue *q) +{ + struct user_struct *up; + @@ -20685,17 +15060,16 @@ index a38b3edc6851..c3d7abd1f6ae 100644 + return; + + up = q->user; -+ if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { -+ if (atomic_dec_and_test(&up->sigpending)) -+ free_uid(up); -+ } else -+ __sigqueue_free(q); ++ if (atomic_dec_and_test(&up->sigpending)) ++ free_uid(up); ++ if (!task_is_realtime(current) || !sigqueue_add_cache(current, q)) ++ kmem_cache_free(sigqueue_cachep, q); +} + void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; -@@ -469,6 +512,21 @@ void flush_sigqueue(struct sigpending *queue) +@@ -470,6 +512,21 @@ void flush_sigqueue(struct sigpending *queue) } } @@ -20709,7 +15083,7 @@ index a38b3edc6851..c3d7abd1f6ae 100644 + + flush_sigqueue(&tsk->pending); + -+ q = get_task_cache(tsk); ++ q = sigqueue_from_cache(tsk); + if (q) + kmem_cache_free(sigqueue_cachep, q); +} @@ -20717,16 +15091,16 @@ index a38b3edc6851..c3d7abd1f6ae 100644 /* * Flush all pending signals for this kthread. */ -@@ -593,7 +651,7 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i +@@ -594,7 +651,7 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i (info->si_code == SI_TIMER) && (info->si_sys_private); - __sigqueue_free(first); -+ sigqueue_free_current(first); ++ __sigqueue_cache_or_free(first); } else { /* * Ok, it wasn't in the queue. 
This must be -@@ -630,6 +688,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in +@@ -631,6 +688,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in bool resched_timer = false; int signr; @@ -20735,7 +15109,7 @@ index a38b3edc6851..c3d7abd1f6ae 100644 /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ -@@ -1313,6 +1373,34 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) +@@ -1314,6 +1373,34 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) struct k_sigaction *action; int sig = info->si_signo; @@ -20770,17 +15144,16 @@ index a38b3edc6851..c3d7abd1f6ae 100644 spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; -@@ -1806,7 +1894,8 @@ EXPORT_SYMBOL(kill_pid); +@@ -1807,7 +1894,7 @@ EXPORT_SYMBOL(kill_pid); */ struct sigqueue *sigqueue_alloc(void) { - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); -+ /* Preallocated sigqueue objects always from the slabcache ! */ -+ struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1); ++ struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, true); if (q) q->flags |= SIGQUEUE_PREALLOC; -@@ -2202,16 +2291,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t +@@ -2203,16 +2290,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); @@ -20797,8 +15170,40 @@ index a38b3edc6851..c3d7abd1f6ae 100644 freezable_schedule(); cgroup_leave_frozen(true); } else { +diff --git a/kernel/smp.c b/kernel/smp.c +index 1b6070bf97bb..01e9d01d1866 100644 +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -14,6 +14,7 @@ + #include <linux/export.h> + #include <linux/percpu.h> + #include <linux/init.h> ++#include <linux/interrupt.h> + #include <linux/gfp.h> + #include <linux/smp.h> + #include <linux/cpu.h> +@@ -449,6 +450,19 @@ void flush_smp_call_function_from_idle(void) + + local_irq_save(flags); + flush_smp_call_function_queue(true); ++ ++ if (local_softirq_pending()) { ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ do_softirq(); ++ } else { ++ struct task_struct *ksoftirqd = this_cpu_ksoftirqd(); ++ ++ if (ksoftirqd && ksoftirqd->state != TASK_RUNNING) ++ wake_up_process(ksoftirqd); ++ } ++ } ++ + local_irq_restore(flags); + } + diff --git a/kernel/softirq.c b/kernel/softirq.c -index bf88d7f62433..102f5553884c 100644 +index 9d71046ea247..a9b66aa08636 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -13,6 +13,7 @@ @@ -20809,17 +15214,17 @@ index bf88d7f62433..102f5553884c 100644 #include <linux/mm.h> #include <linux/notifier.h> #include <linux/percpu.h> -@@ -92,15 +93,225 @@ static bool ksoftirqd_running(unsigned long pending) - !__kthread_should_park(tsk); - } +@@ -25,6 +26,7 @@ + #include <linux/smpboot.h> + #include <linux/tick.h> + #include <linux/irq.h> ++#include <linux/wait_bit.h> + + #define CREATE_TRACE_POINTS + #include <trace/events/irq.h> +@@ -100,20 +102,204 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); + #endif -+#ifdef CONFIG_TRACE_IRQFLAGS -+DEFINE_PER_CPU(int, hardirqs_enabled); -+DEFINE_PER_CPU(int, hardirq_context); -+EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); -+EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); -+#endif -+ /* - * preempt_count and SOFTIRQ_OFFSET usage: - * - preempt_count is changed by SOFTIRQ_OFFSET on 
entering or leaving @@ -20874,7 +15279,7 @@ index bf88d7f62433..102f5553884c 100644 + */ +bool local_bh_blocked(void) +{ -+ return this_cpu_read(softirq_ctrl.cnt) != 0; ++ return __this_cpu_read(softirq_ctrl.cnt) != 0; +} + +void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) @@ -20882,24 +15287,24 @@ index bf88d7f62433..102f5553884c 100644 + unsigned long flags; + int newcnt; + -+ WARN_ON_ONCE(in_irq()); ++ WARN_ON_ONCE(in_hardirq()); + + /* First entry of a task into a BH disabled section? */ + if (!current->softirq_disable_cnt) { + if (preemptible()) { + local_lock(&softirq_ctrl.lock); ++ /* Required to meet the RCU bottomhalf requirements. */ + rcu_read_lock(); + } else { + DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt)); + } + } + -+ preempt_disable(); + /* + * Track the per CPU softirq disabled state. On RT this is per CPU + * state to allow preemption of bottom half disabled sections. + */ -+ newcnt = this_cpu_add_return(softirq_ctrl.cnt, cnt); ++ newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt); + /* + * Reflect the result in the task state to prevent recursion on the + * local lock and to make softirq_count() & al work. @@ -20911,7 +15316,6 @@ index bf88d7f62433..102f5553884c 100644 + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } -+ preempt_enable(); +} +EXPORT_SYMBOL(__local_bh_disable_ip); + @@ -20923,16 +15327,14 @@ index bf88d7f62433..102f5553884c 100644 + DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != + this_cpu_read(softirq_ctrl.cnt)); + -+ preempt_disable(); + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) { + raw_local_irq_save(flags); + lockdep_softirqs_on(_RET_IP_); + raw_local_irq_restore(flags); + } + -+ newcnt = this_cpu_sub_return(softirq_ctrl.cnt, cnt); ++ newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt = newcnt; -+ preempt_enable(); + + if (!newcnt && unlock) { + rcu_read_unlock(); @@ -20951,7 +15353,7 @@ index bf88d7f62433..102f5553884c 100644 + lockdep_assert_irqs_enabled(); + + local_irq_save(flags); -+ curcnt = this_cpu_read(softirq_ctrl.cnt); ++ curcnt = __this_cpu_read(softirq_ctrl.cnt); + + /* + * If this is not reenabling soft interrupts, no point in trying to @@ -20981,29 +15383,15 @@ index bf88d7f62433..102f5553884c 100644 + __local_bh_enable(cnt, false); + __do_softirq(); + -+out: -+ __local_bh_enable(cnt, preempt_on); -+ local_irq_restore(flags); -+} -+EXPORT_SYMBOL(__local_bh_enable_ip); -+ -+/* -+ * Invoked from irq_enter_rcu() to prevent that tick_irq_enter() -+ * pointlessly wakes the softirq daemon. That's handled in __irq_exit_rcu(). -+ * None of the above logic in the regular bh_disable/enable functions is -+ * required here. -+ */ -+static inline void local_bh_disable_irq_enter(void) -+{ -+ this_cpu_add(softirq_ctrl.cnt, SOFTIRQ_DISABLE_OFFSET); -+} -+ -+static inline void local_bh_enable_irq_enter(void) -+{ -+ this_cpu_sub(softirq_ctrl.cnt, SOFTIRQ_DISABLE_OFFSET); -+} -+ -+/* ++out: ++ __local_bh_enable(cnt, preempt_on); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__local_bh_enable_ip); + +-#ifdef CONFIG_TRACE_IRQFLAGS + /* +- * This is for softirq.c-internal use, where hardirqs are disabled + * Invoked from ksoftirqd_run() outside of the interrupt disabled section + * to acquire the per CPU local lock for reentrancy protection. 
+ */ @@ -21024,47 +15412,31 @@ index bf88d7f62433..102f5553884c 100644 +static inline void softirq_handle_begin(void) { } +static inline void softirq_handle_end(void) { } + -+static inline void invoke_softirq(void) ++static inline bool should_wake_ksoftirqd(void) +{ -+ if (!this_cpu_read(softirq_ctrl.cnt)) -+ wakeup_softirqd(); ++ return !this_cpu_read(softirq_ctrl.cnt); +} + -+static inline bool should_wake_ksoftirqd(void) ++static inline void invoke_softirq(void) +{ -+ return !this_cpu_read(softirq_ctrl.cnt); ++ if (should_wake_ksoftirqd()) ++ wakeup_softirqd(); +} + +#else /* CONFIG_PREEMPT_RT */ - - /* - * This one is for softirq.c-internal use, -@@ -108,11 +319,6 @@ static bool ksoftirqd_running(unsigned long pending) ++ ++/* ++ * This one is for softirq.c-internal use, where hardirqs are disabled + * legitimately: */ - #ifdef CONFIG_TRACE_IRQFLAGS - --DEFINE_PER_CPU(int, hardirqs_enabled); --DEFINE_PER_CPU(int, hardirq_context); --EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); --EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); -- ++#ifdef CONFIG_TRACE_IRQFLAGS void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; -@@ -203,6 +409,67 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) +@@ -204,6 +390,32 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) } EXPORT_SYMBOL(__local_bh_enable_ip); -+static inline void local_bh_disable_irq_enter(void) -+{ -+ local_bh_disable(); -+} -+ -+static inline void local_bh_enable_irq_enter(void) -+{ -+ _local_bh_enable(); -+} -+ +static inline void softirq_handle_begin(void) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); @@ -21086,49 +15458,33 @@ index bf88d7f62433..102f5553884c 100644 + local_irq_enable(); +} + -+static inline void invoke_softirq(void) ++static inline bool should_wake_ksoftirqd(void) +{ -+ if (ksoftirqd_running(local_softirq_pending())) -+ return; -+ -+ if (!force_irqthreads) { -+#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK -+ /* -+ * We can safely execute softirq on the current stack if -+ * it is the irq stack, because it should be near empty -+ * at this stage. -+ */ -+ __do_softirq(); -+#else -+ /* -+ * Otherwise, irq_exit() is called on the task stack that can -+ * be potentially deep already. So call softirq in its own stack -+ * to prevent from any overrun. -+ */ -+ do_softirq_own_stack(); -+#endif -+ } else { -+ wakeup_softirqd(); -+ } ++ return true; +} + -+static inline bool should_wake_ksoftirqd(void) { return true; } -+ + static inline void invoke_softirq(void) + { + if (ksoftirqd_running(local_softirq_pending())) +@@ -248,6 +460,8 @@ asmlinkage __visible void do_softirq(void) + local_irq_restore(flags); + } + +#endif /* !CONFIG_PREEMPT_RT */ + /* * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, * but break the loop if need_resched() is set or after 2 ms. 
-@@ -272,7 +539,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) +@@ -316,7 +530,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + pending = local_softirq_pending(); - account_irq_enter_time(current); - __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); + softirq_handle_begin(); in_hardirq = lockdep_softirq_start(); + account_softirq_enter(current); - restart: -@@ -307,8 +574,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) +@@ -352,8 +566,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) pending >>= softirq_bit; } @@ -21140,74 +15496,17 @@ index bf88d7f62433..102f5553884c 100644 local_irq_disable(); pending = local_softirq_pending(); -@@ -322,11 +591,11 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) +@@ -367,8 +583,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + account_softirq_exit(current); lockdep_softirq_end(in_hardirq); - account_irq_exit_time(current); - __local_bh_enable(SOFTIRQ_OFFSET); - WARN_ON_ONCE(in_interrupt()); + softirq_handle_end(); current_restore_flags(old_flags, PF_MEMALLOC); } -+#ifndef CONFIG_PREEMPT_RT - asmlinkage __visible void do_softirq(void) - { - __u32 pending; -@@ -344,6 +613,7 @@ asmlinkage __visible void do_softirq(void) - - local_irq_restore(flags); - } -+#endif - - /** - * irq_enter_rcu - Enter an interrupt context with RCU watching -@@ -355,9 +625,9 @@ void irq_enter_rcu(void) - * Prevent raise_softirq from needlessly waking up ksoftirqd - * here, as softirq will be serviced on return from interrupt. - */ -- local_bh_disable(); -+ local_bh_disable_irq_enter(); - tick_irq_enter(); -- _local_bh_enable(); -+ local_bh_enable_irq_enter(); - } - __irq_enter(); - } -@@ -371,32 +641,6 @@ void irq_enter(void) - irq_enter_rcu(); - } - --static inline void invoke_softirq(void) --{ -- if (ksoftirqd_running(local_softirq_pending())) -- return; -- -- if (!force_irqthreads) { --#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK -- /* -- * We can safely execute softirq on the current stack if -- * it is the irq stack, because it should be near empty -- * at this stage. -- */ -- __do_softirq(); --#else -- /* -- * Otherwise, irq_exit() is called on the task stack that can -- * be potentially deep already. So call softirq in its own stack -- * to prevent from any overrun. -- */ -- do_softirq_own_stack(); --#endif -- } else { -- wakeup_softirqd(); -- } --} -- - static inline void tick_irq_exit(void) - { - #ifdef CONFIG_NO_HZ_COMMON -@@ -466,7 +710,7 @@ inline void raise_softirq_irqoff(unsigned int nr) +@@ -463,7 +678,7 @@ inline void raise_softirq_irqoff(unsigned int nr) * Otherwise we wake up ksoftirqd to make sure we * schedule the softirq soon. 
*/ @@ -21216,13 +15515,43 @@ index bf88d7f62433..102f5553884c 100644 wakeup_softirqd(); } -@@ -606,6 +850,29 @@ void tasklet_init(struct tasklet_struct *t, +@@ -529,6 +744,16 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) + } + EXPORT_SYMBOL(__tasklet_hi_schedule); + ++static inline bool tasklet_clear_sched(struct tasklet_struct *t) ++{ ++ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) { ++ wake_up_var(&t->state); ++ return true; ++ } ++ ++ return false; ++} ++ + static void tasklet_action_common(struct softirq_action *a, + struct tasklet_head *tl_head, + unsigned int softirq_nr) +@@ -548,8 +773,7 @@ static void tasklet_action_common(struct softirq_action *a, + + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { +- if (!test_and_clear_bit(TASKLET_STATE_SCHED, +- &t->state)) ++ if (!tasklet_clear_sched(t)) + BUG(); + if (t->use_callback) + t->callback(t); +@@ -604,21 +828,62 @@ void tasklet_init(struct tasklet_struct *t, } EXPORT_SYMBOL(tasklet_init); +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) -+ -+void tasklet_unlock_wait(struct tasklet_struct *t) ++/* ++ * Do not use in new code. There is no real reason to invoke this from ++ * atomic contexts. ++ */ ++void tasklet_unlock_spin_wait(struct tasklet_struct *t) +{ + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { @@ -21240,35 +15569,49 @@ index bf88d7f62433..102f5553884c 100644 + } + } +} -+EXPORT_SYMBOL(tasklet_unlock_wait); ++EXPORT_SYMBOL(tasklet_unlock_spin_wait); +#endif + void tasklet_kill(struct tasklet_struct *t) { if (in_interrupt()) -@@ -613,7 +880,20 @@ void tasklet_kill(struct tasklet_struct *t) + pr_notice("Attempt to kill tasklet from interrupt\n"); - while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { - do { +- while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { +- do { - yield(); -+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { -+ /* -+ * Prevent a live lock when current -+ * preempted soft interrupt processing or -+ * prevents ksoftirqd from running. If the -+ * tasklet runs on a different CPU then -+ * this has no effect other than doing the -+ * BH disable/enable dance for nothing. 
-+ */ -+ local_bh_disable(); -+ local_bh_enable(); -+ } else { -+ yield(); -+ } - } while (test_bit(TASKLET_STATE_SCHED, &t->state)); - } +- } while (test_bit(TASKLET_STATE_SCHED, &t->state)); +- } ++ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) ++ wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state)); ++ tasklet_unlock_wait(t); -@@ -643,18 +923,18 @@ static int ksoftirqd_should_run(unsigned int cpu) +- clear_bit(TASKLET_STATE_SCHED, &t->state); ++ tasklet_clear_sched(t); + } + EXPORT_SYMBOL(tasklet_kill); + ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) ++void tasklet_unlock(struct tasklet_struct *t) ++{ ++ smp_mb__before_atomic(); ++ clear_bit(TASKLET_STATE_RUN, &t->state); ++ smp_mb__after_atomic(); ++ wake_up_var(&t->state); ++} ++EXPORT_SYMBOL_GPL(tasklet_unlock); ++ ++void tasklet_unlock_wait(struct tasklet_struct *t) ++{ ++ wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state)); ++} ++EXPORT_SYMBOL_GPL(tasklet_unlock_wait); ++#endif ++ + void __init softirq_init(void) + { + int cpu; +@@ -641,18 +906,18 @@ static int ksoftirqd_should_run(unsigned int cpu) static void run_ksoftirqd(unsigned int cpu) { @@ -21290,94 +15633,11 @@ index bf88d7f62433..102f5553884c 100644 } #ifdef CONFIG_HOTPLUG_CPU -diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c -index 865bb0228ab6..fb05a9293d00 100644 ---- a/kernel/stop_machine.c -+++ b/kernel/stop_machine.c -@@ -42,11 +42,23 @@ struct cpu_stopper { - struct list_head works; /* list of pending works */ - - struct cpu_stop_work stop_work; /* for stop_cpus */ -+ unsigned long caller; -+ cpu_stop_fn_t fn; - }; - - static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); - static bool stop_machine_initialized = false; - -+void print_stop_info(const char *log_lvl, struct task_struct *task) -+{ -+ struct cpu_stopper *stopper = this_cpu_ptr(&cpu_stopper); -+ -+ if (task != stopper->thread) -+ return; -+ -+ printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); -+} -+ - /* static data for stop_cpus */ - static DEFINE_MUTEX(stop_cpus_mutex); - static bool stop_cpus_in_progress; -@@ -123,7 +135,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) - int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) - { - struct cpu_stop_done done; -- struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; -+ struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; - - cpu_stop_init_done(&done, 1); - if (!cpu_stop_queue_work(cpu, &work)) -@@ -331,7 +343,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * - work1 = work2 = (struct cpu_stop_work){ - .fn = multi_cpu_stop, - .arg = &msdata, -- .done = &done -+ .done = &done, -+ .caller = _RET_IP_, - }; - - cpu_stop_init_done(&done, 2); -@@ -367,7 +380,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * - bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, - struct cpu_stop_work *work_buf) - { -- *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; -+ *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; - return cpu_stop_queue_work(cpu, work_buf); - } - -@@ -487,6 +500,8 @@ static void cpu_stopper_thread(unsigned int cpu) - int ret; - - /* cpu stop callbacks must not sleep, make in_atomic() == T */ -+ stopper->caller = work->caller; -+ stopper->fn = fn; - preempt_count_inc(); - ret = fn(arg); - if (done) { -@@ -495,6 +510,8 @@ static void 
cpu_stopper_thread(unsigned int cpu) - cpu_stop_signal_done(done); - } - preempt_count_dec(); -+ stopper->fn = NULL; -+ stopper->caller = 0; - WARN_ONCE(preempt_count(), - "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); - goto repeat; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 95b6a708b040..822fa0c7f5db 100644 +index 743c852e10f2..3c2e8b1dfbe3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c -@@ -1828,7 +1828,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - * expiry. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { -- if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) -+ if ((task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) || system_state != SYSTEM_RUNNING) - mode |= HRTIMER_MODE_HARD; - } - -@@ -1993,6 +1993,36 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, +@@ -1988,6 +1988,36 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, } #endif @@ -21414,152 +15674,633 @@ index 95b6a708b040..822fa0c7f5db 100644 /* * Functions related to boot-time initialization: */ -diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c -index 1c03eec6ca9b..0642013dace4 100644 ---- a/kernel/time/sched_clock.c -+++ b/kernel/time/sched_clock.c -@@ -35,7 +35,7 @@ - * into a single 64-byte cache line. - */ - struct clock_data { -- seqcount_t seq; -+ seqcount_latch_t seq; - struct clock_read_data read_data[2]; - ktime_t wrap_kt; - unsigned long rate; -@@ -76,7 +76,7 @@ struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c +index e10a4af88737..0cc55791b2b6 100644 +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -973,7 +973,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) + if (unlikely(local_softirq_pending())) { + static int ratelimit; + +- if (ratelimit < 10 && ++ if (ratelimit < 10 && !local_bh_blocked() && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { + pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", + (unsigned int) local_softirq_pending()); +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 8dbc008f8942..4d9ffd39f93e 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1263,8 +1263,10 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) + static void timer_sync_wait_running(struct timer_base *base) + { + if (atomic_read(&base->timer_waiters)) { ++ raw_spin_unlock_irq(&base->lock); + spin_unlock(&base->expiry_lock); + spin_lock(&base->expiry_lock); ++ raw_spin_lock_irq(&base->lock); + } + } + +@@ -1455,14 +1457,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) + if (timer->flags & TIMER_IRQSAFE) { + raw_spin_unlock(&base->lock); + call_timer_fn(timer, fn, baseclk); +- base->running_timer = NULL; + raw_spin_lock(&base->lock); ++ base->running_timer = NULL; + } else { + raw_spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, baseclk); ++ raw_spin_lock_irq(&base->lock); + base->running_timer = NULL; + timer_sync_wait_running(base); +- raw_spin_lock_irq(&base->lock); + } + } + } +@@ -1741,6 +1743,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) + { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + ++ irq_work_tick_soft(); ++ + __run_timers(base); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) + __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); +diff --git a/kernel/trace/blktrace.c 
b/kernel/trace/blktrace.c +index fb0fe4c66b84..c54eae2ab208 100644 +--- a/kernel/trace/blktrace.c ++++ b/kernel/trace/blktrace.c +@@ -72,17 +72,17 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, + struct blk_io_trace *t; + struct ring_buffer_event *event = NULL; + struct trace_buffer *buffer = NULL; +- int pc = 0; ++ unsigned int trace_ctx = 0; + int cpu = smp_processor_id(); + bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + + if (blk_tracer) { + buffer = blk_tr->array_buffer.buffer; +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, + sizeof(*t) + len + cgid_len, +- 0, pc); ++ trace_ctx); + if (!event) + return; + t = ring_buffer_event_data(event); +@@ -107,7 +107,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, + memcpy((void *) t + sizeof(*t) + cgid_len, data, len); + + if (blk_tracer) +- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); ++ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + } + } + +@@ -222,8 +222,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + struct blk_io_trace *t; + unsigned long flags = 0; + unsigned long *sequence; ++ unsigned int trace_ctx = 0; + pid_t pid; +- int cpu, pc = 0; ++ int cpu; + bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + +@@ -252,10 +253,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + tracing_record_cmdline(current); + + buffer = blk_tr->array_buffer.buffer; +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, + sizeof(*t) + pdu_len + cgid_len, +- 0, pc); ++ trace_ctx); + if (!event) + return; + t = ring_buffer_event_data(event); +@@ -301,7 +302,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); + + if (blk_tracer) { +- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); ++ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; + } + } +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index b5815a022ecc..e33e7bad3fac 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -176,7 +176,7 @@ static union trace_eval_map_item *trace_eval_maps; + int tracing_set_tracer(struct trace_array *tr, const char *buf); + static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + + #define MAX_TRACER_SIZE 100 + static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; +@@ -905,23 +905,23 @@ static inline void trace_access_lock_init(void) + + #ifdef CONFIG_STACKTRACE + static void __ftrace_trace_stack(struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs); ++ unsigned int trace_ctx, ++ int skip, struct pt_regs *regs); + static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs); ++ unsigned int trace_ctx, ++ int skip, struct pt_regs *regs); + + #else + static inline void __ftrace_trace_stack(struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs) ++ unsigned int trace_ctx, ++ int skip, struct pt_regs *regs) + { + } + static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer 
*buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs) ++ unsigned long trace_ctx, ++ int skip, struct pt_regs *regs) + { + } + +@@ -929,24 +929,24 @@ static inline void ftrace_trace_stack(struct trace_array *tr, + + static __always_inline void + trace_event_setup(struct ring_buffer_event *event, +- int type, unsigned long flags, int pc) ++ int type, unsigned int trace_ctx) + { + struct trace_entry *ent = ring_buffer_event_data(event); + +- tracing_generic_entry_update(ent, type, flags, pc); ++ tracing_generic_entry_update(ent, type, trace_ctx); + } + + static __always_inline struct ring_buffer_event * + __trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) +- trace_event_setup(event, type, flags, pc); ++ trace_event_setup(event, type, trace_ctx); + + return event; + } +@@ -1007,25 +1007,22 @@ int __trace_puts(unsigned long ip, const char *str, int size) + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct print_entry *entry; +- unsigned long irq_flags; ++ unsigned int trace_ctx; + int alloc; +- int pc; - int sched_clock_read_retry(unsigned int seq) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + return 0; + +- pc = preempt_count(); +- + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + +- local_save_flags(irq_flags); ++ trace_ctx = tracing_gen_ctx(); + buffer = global_trace.array_buffer.buffer; + ring_buffer_nest_start(buffer); +- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, +- irq_flags, pc); ++ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, ++ trace_ctx); + if (!event) { + size = 0; + goto out; +@@ -1044,7 +1041,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); +- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); ++ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); + out: + ring_buffer_nest_end(buffer); + return size; +@@ -1061,25 +1058,22 @@ int __trace_bputs(unsigned long ip, const char *str) + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct bputs_entry *entry; +- unsigned long irq_flags; ++ unsigned int trace_ctx; + int size = sizeof(struct bputs_entry); + int ret = 0; +- int pc; + + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + return 0; + +- pc = preempt_count(); +- + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + +- local_save_flags(irq_flags); ++ trace_ctx = tracing_gen_ctx(); + buffer = global_trace.array_buffer.buffer; + + ring_buffer_nest_start(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, +- irq_flags, pc); ++ trace_ctx); + if (!event) + goto out; + +@@ -1088,7 +1082,7 @@ int __trace_bputs(unsigned long ip, const char *str) + entry->str = str; + + __buffer_unlock_commit(buffer, event); +- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); ++ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); + + ret = 1; + out: +@@ -2584,36 +2578,52 @@ enum print_line_t trace_handle_return(struct trace_seq *s) + } + EXPORT_SYMBOL_GPL(trace_handle_return); + +-void +-tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, +- unsigned long flags, int pc) ++static 
unsigned short migration_disable_value(void) + { +- struct task_struct *tsk = current; +- +- entry->preempt_count = pc & 0xff; +- entry->pid = (tsk) ? tsk->pid : 0; +- entry->type = type; +- entry->flags = +-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +- (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++ return current->migration_disabled; + #else +- TRACE_FLAG_IRQS_NOSUPPORT | ++ return 0; ++#endif ++} ++ ++unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) ++{ ++ unsigned int trace_flags = irqs_status; ++ unsigned int pc; ++ ++ pc = preempt_count(); ++ ++ if (pc & NMI_MASK) ++ trace_flags |= TRACE_FLAG_NMI; ++ if (pc & HARDIRQ_MASK) ++ trace_flags |= TRACE_FLAG_HARDIRQ; ++ if (in_serving_softirq()) ++ trace_flags |= TRACE_FLAG_SOFTIRQ; ++ ++ if (tif_need_resched()) ++ trace_flags |= TRACE_FLAG_NEED_RESCHED; ++ if (test_preempt_need_resched()) ++ trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; ++ ++#ifdef CONFIG_PREEMPT_LAZY ++ if (need_resched_lazy()) ++ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; + #endif +- ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | +- ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | +- ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) | +- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | +- (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); ++ ++ return (pc & 0xff) | ++ (migration_disable_value() & 0xff) << 8 | ++ (preempt_lazy_count() & 0xff) << 16 | ++ (trace_flags << 24); + } +-EXPORT_SYMBOL_GPL(tracing_generic_entry_update); + + struct ring_buffer_event * + trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { +- return __trace_buffer_lock_reserve(buffer, type, len, flags, pc); ++ return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx); + } + + DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +@@ -2733,7 +2743,7 @@ struct ring_buffer_event * + trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, + struct trace_event_file *trace_file, + int type, unsigned long len, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + struct ring_buffer_event *entry; + int val; +@@ -2746,7 +2756,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, + /* Try to use the per cpu buffer first */ + val = this_cpu_inc_return(trace_buffered_event_cnt); + if ((len < (PAGE_SIZE - sizeof(*entry))) && val == 1) { +- trace_event_setup(entry, type, flags, pc); ++ trace_event_setup(entry, type, trace_ctx); + entry->array[0] = len; + return entry; + } +@@ -2754,7 +2764,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, + } + + entry = __trace_buffer_lock_reserve(*current_rb, +- type, len, flags, pc); ++ type, len, trace_ctx); + /* + * If tracing is off, but we have triggers enabled + * we still need to look at the event data. 
Use the temp_buffer +@@ -2763,8 +2773,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, + */ + if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { + *current_rb = temp_buffer; +- entry = __trace_buffer_lock_reserve(*current_rb, +- type, len, flags, pc); ++ entry = __trace_buffer_lock_reserve(*current_rb, type, len, ++ trace_ctx); + } + return entry; + } +@@ -2850,7 +2860,7 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) + ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); + event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, +- fbuffer->flags, fbuffer->pc, fbuffer->regs); ++ fbuffer->trace_ctx, fbuffer->regs); + } + EXPORT_SYMBOL_GPL(trace_event_buffer_commit); + +@@ -2866,7 +2876,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit); + void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct trace_buffer *buffer, + struct ring_buffer_event *event, +- unsigned long flags, int pc, ++ unsigned int trace_ctx, + struct pt_regs *regs) + { + __buffer_unlock_commit(buffer, event); +@@ -2877,8 +2887,8 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, + * and mmiotrace, but that's ok if they lose a function or + * two. They are not that meaningful. + */ +- ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); +- ftrace_trace_userstack(tr, buffer, flags, pc); ++ ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs); ++ ftrace_trace_userstack(tr, buffer, trace_ctx); + } + + /* +@@ -2892,9 +2902,8 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, + } + + void +-trace_function(struct trace_array *tr, +- unsigned long ip, unsigned long parent_ip, unsigned long flags, +- int pc) ++trace_function(struct trace_array *tr, unsigned long ip, unsigned long ++ parent_ip, unsigned int trace_ctx) + { + struct trace_event_call *call = &event_function; + struct trace_buffer *buffer = tr->array_buffer.buffer; +@@ -2902,7 +2911,7 @@ trace_function(struct trace_array *tr, + struct ftrace_entry *entry; + + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), +- flags, pc); ++ trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); +@@ -2936,8 +2945,8 @@ static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); + static DEFINE_PER_CPU(int, ftrace_stack_reserve); + + static void __ftrace_trace_stack(struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs) ++ unsigned int trace_ctx, ++ int skip, struct pt_regs *regs) + { + struct trace_event_call *call = &event_kernel_stack; + struct ring_buffer_event *event; +@@ -2984,7 +2993,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, + + size = nr_entries * sizeof(unsigned long); + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, +- sizeof(*entry) + size, flags, pc); ++ sizeof(*entry) + size, trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); +@@ -3005,22 +3014,22 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, + + static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs) ++ unsigned int trace_ctx, ++ int skip, struct pt_regs *regs) + { + if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) + return; + +- __ftrace_trace_stack(buffer, flags, skip, pc, regs); ++ __ftrace_trace_stack(buffer, trace_ctx, skip, regs); + } + +-void __trace_stack(struct trace_array 
*tr, unsigned long flags, int skip, +- int pc) ++void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, ++ int skip) { -- return read_seqcount_retry(&cd.seq, seq); -+ return read_seqcount_latch_retry(&cd.seq, seq); + struct trace_buffer *buffer = tr->array_buffer.buffer; + + if (rcu_is_watching()) { +- __ftrace_trace_stack(buffer, flags, skip, pc, NULL); ++ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + return; + } + +@@ -3034,7 +3043,7 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + return; + + rcu_irq_enter_irqson(); +- __ftrace_trace_stack(buffer, flags, skip, pc, NULL); ++ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + rcu_irq_exit_irqson(); } - unsigned long long notrace sched_clock(void) -@@ -258,7 +258,7 @@ void __init generic_sched_clock_init(void) +@@ -3044,19 +3053,15 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, */ - static u64 notrace suspended_sched_clock_read(void) + void trace_dump_stack(int skip) { -- unsigned int seq = raw_read_seqcount(&cd.seq); -+ unsigned int seq = raw_read_seqcount_latch(&cd.seq); +- unsigned long flags; +- + if (tracing_disabled || tracing_selftest_running) + return; - return cd.read_data[seq & 1].epoch_cyc; +- local_save_flags(flags); +- + #ifndef CONFIG_UNWINDER_ORC + /* Skip 1 to skip this function. */ + skip++; + #endif + __ftrace_trace_stack(global_trace.array_buffer.buffer, +- flags, skip, preempt_count(), NULL); ++ tracing_gen_ctx(), skip, NULL); } -diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c -index f0199a4ba1ad..1de4665dc52f 100644 ---- a/kernel/time/tick-sched.c -+++ b/kernel/time/tick-sched.c -@@ -925,7 +925,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) - if (unlikely(local_softirq_pending())) { - static int ratelimit; + EXPORT_SYMBOL_GPL(trace_dump_stack); -- if (ratelimit < 10 && -+ if (ratelimit < 10 && !local_bh_blocked() && - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - pr_warn("NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); -diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c -index 4c47f388a83f..999c981ae766 100644 ---- a/kernel/time/timekeeping.c -+++ b/kernel/time/timekeeping.c -@@ -64,7 +64,7 @@ static struct timekeeper shadow_timekeeper; - * See @update_fast_timekeeper() below. 
- */ - struct tk_fast { -- seqcount_raw_spinlock_t seq; -+ seqcount_latch_t seq; - struct tk_read_base base[2]; - }; +@@ -3065,7 +3070,7 @@ static DEFINE_PER_CPU(int, user_stack_count); -@@ -81,13 +81,13 @@ static struct clocksource dummy_clock = { - }; + static void + ftrace_trace_userstack(struct trace_array *tr, +- struct trace_buffer *buffer, unsigned long flags, int pc) ++ struct trace_buffer *buffer, unsigned int trace_ctx) + { + struct trace_event_call *call = &event_user_stack; + struct ring_buffer_event *event; +@@ -3092,7 +3097,7 @@ ftrace_trace_userstack(struct trace_array *tr, + __this_cpu_inc(user_stack_count); + + event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + goto out_drop_count; + entry = ring_buffer_event_data(event); +@@ -3112,7 +3117,7 @@ ftrace_trace_userstack(struct trace_array *tr, + #else /* CONFIG_USER_STACKTRACE_SUPPORT */ + static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + } + #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ +@@ -3242,9 +3247,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) + struct trace_buffer *buffer; + struct trace_array *tr = &global_trace; + struct bprint_entry *entry; +- unsigned long flags; ++ unsigned int trace_ctx; + char *tbuffer; +- int len = 0, size, pc; ++ int len = 0, size; - static struct tk_fast tk_fast_mono ____cacheline_aligned = { -- .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), -+ .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), - .base[0] = { .clock = &dummy_clock, }, - .base[1] = { .clock = &dummy_clock, }, - }; + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; +@@ -3252,7 +3257,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); - static struct tk_fast tk_fast_raw ____cacheline_aligned = { -- .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), -+ .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), - .base[0] = { .clock = &dummy_clock, }, - .base[1] = { .clock = &dummy_clock, }, - }; -@@ -467,7 +467,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); -- } while (read_seqcount_retry(&tkf->seq, seq)); -+ } while (read_seqcount_latch_retry(&tkf->seq, seq)); +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); - return now; - } -@@ -533,7 +533,7 @@ static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); -- } while (read_seqcount_retry(&tkf->seq, seq)); -+ } while (read_seqcount_latch_retry(&tkf->seq, seq)); + tbuffer = get_trace_buf(); +@@ -3266,12 +3271,11 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) + if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) + goto out_put; + +- local_save_flags(flags); + size = sizeof(*entry) + sizeof(u32) * len; + buffer = tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, +- flags, pc); ++ trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); +@@ -3281,7 +3285,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + if (!call_filter_check_discard(call, entry, 
buffer, event)) { + __buffer_unlock_commit(buffer, event); +- ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); ++ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + } - return now; - } -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index a50364df1054..e25cb9d7f09a 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -1765,6 +1765,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) + out: +@@ -3304,9 +3308,9 @@ __trace_array_vprintk(struct trace_buffer *buffer, { - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + struct trace_event_call *call = &event_print; + struct ring_buffer_event *event; +- int len = 0, size, pc; ++ int len = 0, size; + struct print_entry *entry; +- unsigned long flags; ++ unsigned int trace_ctx; + char *tbuffer; -+ irq_work_tick_soft(); -+ - __run_timers(base); - if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) - __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); -diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c -index d3e5de717df2..21706f318a00 100644 ---- a/kernel/trace/trace.c -+++ b/kernel/trace/trace.c -@@ -2437,6 +2437,15 @@ enum print_line_t trace_handle_return(struct trace_seq *s) - } - EXPORT_SYMBOL_GPL(trace_handle_return); + if (tracing_disabled || tracing_selftest_running) +@@ -3315,7 +3319,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); -+static unsigned short migration_disable_value(struct task_struct *tsk) -+{ -+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) -+ return tsk ? tsk->migration_disabled : 0; -+#else -+ return 0; -+#endif -+} -+ - void - tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, - unsigned long flags, int pc) -@@ -2444,6 +2453,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, - struct task_struct *tsk = current; - - entry->preempt_count = pc & 0xff; -+ entry->preempt_lazy_count = preempt_lazy_count(); - entry->pid = (tsk) ? tsk->pid : 0; - entry->type = type; - entry->flags = -@@ -2455,8 +2465,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, - ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | - ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | - ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) | -- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | -+ (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) | -+ (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) | - (test_preempt_need_resched() ? 
TRACE_FLAG_PREEMPT_RESCHED : 0); -+ -+ entry->migrate_disable = migration_disable_value(tsk); - } - EXPORT_SYMBOL_GPL(tracing_generic_entry_update); -@@ -3784,14 +3797,17 @@ unsigned long trace_total_entries(struct trace_array *tr) +@@ -3327,11 +3331,10 @@ __trace_array_vprintk(struct trace_buffer *buffer, + + len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); + +- local_save_flags(flags); + size = sizeof(*entry) + len + 1; + ring_buffer_nest_start(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, +- flags, pc); ++ trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); +@@ -3340,7 +3343,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, + memcpy(&entry->buf, tbuffer, len + 1); + if (!call_filter_check_discard(call, entry, buffer, event)) { + __buffer_unlock_commit(buffer, event); +- ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); ++ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL); + } + + out: +@@ -3812,14 +3815,17 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { @@ -21585,7 +16326,7 @@ index d3e5de717df2..21706f318a00 100644 } static void print_event_info(struct array_buffer *buf, struct seq_file *m) -@@ -3825,13 +3841,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file +@@ -3853,13 +3859,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file print_event_info(buf, m); @@ -21608,58 +16349,904 @@ index d3e5de717df2..21706f318a00 100644 + seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | "); } - void -@@ -9249,7 +9268,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) - tracing_off(); + void +@@ -6653,7 +6662,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, + enum event_trigger_type tt = ETT_NONE; + struct trace_buffer *buffer; + struct print_entry *entry; +- unsigned long irq_flags; + ssize_t written; + int size; + int len; +@@ -6673,7 +6681,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + +- local_save_flags(irq_flags); + size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ + + /* If less than "<faulted>", then make sure we can still add that */ +@@ -6682,7 +6689,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, + + buffer = tr->array_buffer.buffer; + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, +- irq_flags, preempt_count()); ++ tracing_gen_ctx()); + if (unlikely(!event)) + /* Ring buffer disabled, return as if not open for write */ + return -EBADF; +@@ -6734,7 +6741,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct raw_data_entry *entry; +- unsigned long irq_flags; + ssize_t written; + int size; + int len; +@@ -6756,14 +6762,13 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + +- local_save_flags(irq_flags); + size = sizeof(*entry) + cnt; + if (cnt < FAULT_SIZE_ID) + size += FAULT_SIZE_ID - cnt; + + buffer = tr->array_buffer.buffer; + event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, +- irq_flags, preempt_count()); ++ tracing_gen_ctx()); + if (!event) + /* Ring buffer disabled, return as if not open for write */ + return -EBADF; +@@ -9344,7 +9349,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) + tracing_off(); + + 
local_irq_save(flags); +- printk_nmi_direct_enter(); + + /* Simulate the iterator */ + trace_init_global_iter(&iter); +@@ -9424,7 +9428,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) + atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); + } + atomic_dec(&dump_running); +- printk_nmi_direct_exit(); + local_irq_restore(flags); + } + EXPORT_SYMBOL_GPL(ftrace_dump); +diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h +index e448d2da0b99..93fb08ab8bb6 100644 +--- a/kernel/trace/trace.h ++++ b/kernel/trace/trace.h +@@ -136,25 +136,6 @@ struct kretprobe_trace_entry_head { + unsigned long ret_ip; + }; + +-/* +- * trace_flag_type is an enumeration that holds different +- * states when a trace occurs. These are: +- * IRQS_OFF - interrupts were disabled +- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags +- * NEED_RESCHED - reschedule is requested +- * HARDIRQ - inside an interrupt handler +- * SOFTIRQ - inside a softirq handler +- */ +-enum trace_flag_type { +- TRACE_FLAG_IRQS_OFF = 0x01, +- TRACE_FLAG_IRQS_NOSUPPORT = 0x02, +- TRACE_FLAG_NEED_RESCHED = 0x04, +- TRACE_FLAG_HARDIRQ = 0x08, +- TRACE_FLAG_SOFTIRQ = 0x10, +- TRACE_FLAG_PREEMPT_RESCHED = 0x20, +- TRACE_FLAG_NMI = 0x40, +-}; +- + #define TRACE_BUF_SIZE 1024 + + struct trace_array; +@@ -589,8 +570,7 @@ struct ring_buffer_event * + trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, +- unsigned long flags, +- int pc); ++ unsigned int trace_ctx); + + struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data); +@@ -615,11 +595,11 @@ unsigned long trace_total_entries(struct trace_array *tr); + void trace_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + void trace_latency_header(struct seq_file *m); + void trace_default_header(struct seq_file *m); + void print_trace_header(struct seq_file *m, struct trace_iterator *iter); +@@ -687,11 +667,10 @@ static inline void latency_fsnotify(struct trace_array *tr) { } + #endif + + #ifdef CONFIG_STACKTRACE +-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, +- int pc); ++void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip); + #else +-static inline void __trace_stack(struct trace_array *tr, unsigned long flags, +- int skip, int pc) ++static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, ++ int skip) + { + } + #endif /* CONFIG_STACKTRACE */ +@@ -831,10 +810,10 @@ extern void graph_trace_open(struct trace_iterator *iter); + extern void graph_trace_close(struct trace_iterator *iter); + extern int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + extern void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + + #ifdef CONFIG_DYNAMIC_FTRACE + extern struct ftrace_hash __rcu *ftrace_graph_hash; +@@ -1297,15 +1276,15 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec, + void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct trace_buffer *buffer, + struct ring_buffer_event *event, +- unsigned long flags, int pc, ++ unsigned int trcace_ctx, + struct 
pt_regs *regs); + + static inline void trace_buffer_unlock_commit(struct trace_array *tr, + struct trace_buffer *buffer, + struct ring_buffer_event *event, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { +- trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); ++ trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL); + } + + DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +@@ -1366,8 +1345,7 @@ __event_trigger_test_discard(struct trace_event_file *file, + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself +- * @irq_flags: The state of the interrupts at the start of the event +- * @pc: The state of the preempt count at the start of the event. ++ * @trace_ctx: The tracing context flags. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and +@@ -1377,12 +1355,12 @@ static inline void + event_trigger_unlock_commit(struct trace_event_file *file, + struct trace_buffer *buffer, + struct ring_buffer_event *event, +- void *entry, unsigned long irq_flags, int pc) ++ void *entry, unsigned int trace_ctx) + { + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) +- trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); ++ trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx); + + if (tt) + event_triggers_post_call(file, tt); +@@ -1394,8 +1372,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself +- * @irq_flags: The state of the interrupts at the start of the event +- * @pc: The state of the preempt count at the start of the event. ++ * @trace_ctx: The tracing context flags. + * + * This is a helper function to handle triggers that require data + * from the event itself. 
It also tests the event against filters and +@@ -1408,14 +1385,14 @@ static inline void + event_trigger_unlock_commit_regs(struct trace_event_file *file, + struct trace_buffer *buffer, + struct ring_buffer_event *event, +- void *entry, unsigned long irq_flags, int pc, ++ void *entry, unsigned int trace_ctx, + struct pt_regs *regs) + { + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit_regs(file->tr, buffer, event, +- irq_flags, pc, regs); ++ trace_ctx, regs); + + if (tt) + event_triggers_post_call(file, tt); +diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c +index eff099123aa2..e47fdb4c92fb 100644 +--- a/kernel/trace/trace_branch.c ++++ b/kernel/trace/trace_branch.c +@@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) + struct ring_buffer_event *event; + struct trace_branch *entry; + unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + const char *p; + + if (current->trace_recursion & TRACE_BRANCH_BIT) +@@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) + if (atomic_read(&data->disabled)) + goto out; + +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx_flags(flags); + buffer = tr->array_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + goto out; + +diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c +index a71181655958..288ad2c274fb 100644 +--- a/kernel/trace/trace_event_perf.c ++++ b/kernel/trace/trace_event_perf.c +@@ -421,11 +421,8 @@ NOKPROBE_SYMBOL(perf_trace_buf_alloc); + void perf_trace_buf_update(void *record, u16 type) + { + struct trace_entry *entry = record; +- int pc = preempt_count(); +- unsigned long flags; + +- local_save_flags(flags); +- tracing_generic_entry_update(entry, type, flags, pc); ++ tracing_generic_entry_update(entry, type, tracing_gen_ctx()); + } + NOKPROBE_SYMBOL(perf_trace_buf_update); + +diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c +index d387b774ceeb..a7c176251c63 100644 +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -183,6 +183,8 @@ static int trace_define_common_fields(void) + __common_field(unsigned char, flags); + __common_field(unsigned char, preempt_count); + __common_field(int, pid); ++ __common_field(unsigned char, migrate_disable); ++ __common_field(unsigned char, preempt_lazy_count); + + return ret; + } +@@ -258,22 +260,19 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, + trace_event_ignore_this_pid(trace_file)) + return NULL; + +- local_save_flags(fbuffer->flags); +- fbuffer->pc = preempt_count(); + /* + * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables + * preemption (adding one to the preempt_count). Since we are + * interested in the preempt_count at the time the tracepoint was + * hit, we need to subtract one to offset the increment. 
+ */ +- if (IS_ENABLED(CONFIG_PREEMPTION)) +- fbuffer->pc--; ++ fbuffer->trace_ctx = tracing_gen_ctx_dec(); + fbuffer->trace_file = trace_file; + + fbuffer->event = + trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, + event_call->event.type, len, +- fbuffer->flags, fbuffer->pc); ++ fbuffer->trace_ctx); + if (!fbuffer->event) + return NULL; + +@@ -3679,12 +3678,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, + struct trace_buffer *buffer; + struct ring_buffer_event *event; + struct ftrace_entry *entry; +- unsigned long flags; ++ unsigned int trace_ctx; + long disabled; + int cpu; +- int pc; + +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); +@@ -3692,11 +3690,9 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, + if (disabled != 1) + goto out; + +- local_save_flags(flags); +- + event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, + TRACE_FN, sizeof(*entry), +- flags, pc); ++ trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); +@@ -3704,7 +3700,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, + entry->parent_ip = parent_ip; + + event_trigger_unlock_commit(&event_trace_file, buffer, event, +- entry, flags, pc); ++ entry, trace_ctx); + out: + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); + preempt_enable_notrace(); +diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c +index 22bcf7c51d1e..c188045c5f97 100644 +--- a/kernel/trace/trace_events_inject.c ++++ b/kernel/trace/trace_events_inject.c +@@ -192,7 +192,6 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size) + static int parse_entry(char *str, struct trace_event_call *call, void **pentry) + { + struct ftrace_event_field *field; +- unsigned long irq_flags; + void *entry = NULL; + int entry_size; + u64 val = 0; +@@ -203,9 +202,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) + if (!entry) + return -ENOMEM; + +- local_save_flags(irq_flags); +- tracing_generic_entry_update(entry, call->event.type, irq_flags, +- preempt_count()); ++ tracing_generic_entry_update(entry, call->event.type, ++ tracing_gen_ctx()); + + while ((len = parse_field(str, call, &field, &val)) > 0) { + if (is_function_field(field)) +diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c +index c5095dd28e20..9bb03ab44fff 100644 +--- a/kernel/trace/trace_functions.c ++++ b/kernel/trace/trace_functions.c +@@ -132,10 +132,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, + { + struct trace_array *tr = op->private; + struct trace_array_cpu *data; +- unsigned long flags; ++ unsigned int trace_ctx; + int bit; + int cpu; +- int pc; + + if (unlikely(!tr->function_enabled)) + return; +@@ -144,15 +143,14 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, + if (bit < 0) + return; + +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); +- if (!atomic_read(&data->disabled)) { +- local_save_flags(flags); +- trace_function(tr, ip, parent_ip, flags, pc); +- } ++ if (!atomic_read(&data->disabled)) ++ trace_function(tr, ip, parent_ip, trace_ctx); ++ + ftrace_test_recursion_unlock(bit); + preempt_enable_notrace(); + } +@@ -184,7 +182,7 @@ 
function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + unsigned long flags; + long disabled; + int cpu; +- int pc; ++ unsigned int trace_ctx; + + if (unlikely(!tr->function_enabled)) + return; +@@ -199,9 +197,9 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { +- pc = preempt_count(); +- trace_function(tr, ip, parent_ip, flags, pc); +- __trace_stack(tr, flags, STACK_SKIP, pc); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ trace_function(tr, ip, parent_ip, trace_ctx); ++ __trace_stack(tr, trace_ctx, STACK_SKIP); + } + + atomic_dec(&data->disabled); +@@ -404,13 +402,11 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, + + static __always_inline void trace_stack(struct trace_array *tr) + { +- unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + +- local_save_flags(flags); +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + +- __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); ++ __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + } + + static void +diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c +index d874dec87131..0aa6e6faa943 100644 +--- a/kernel/trace/trace_functions_graph.c ++++ b/kernel/trace/trace_functions_graph.c +@@ -96,8 +96,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, + + int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, +- unsigned long flags, +- int pc) ++ unsigned int trace_ctx) + { + struct trace_event_call *call = &event_funcgraph_entry; + struct ring_buffer_event *event; +@@ -105,7 +104,7 @@ int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + return 0; + entry = ring_buffer_event_data(event); +@@ -129,10 +128,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; ++ unsigned int trace_ctx; + long disabled; + int ret; + int cpu; +- int pc; + + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) + return 0; +@@ -174,8 +173,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { +- pc = preempt_count(); +- ret = __trace_graph_entry(tr, trace, flags, pc); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ ret = __trace_graph_entry(tr, trace, trace_ctx); + } else { + ret = 0; + } +@@ -188,7 +187,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) + + static void + __trace_graph_function(struct trace_array *tr, +- unsigned long ip, unsigned long flags, int pc) ++ unsigned long ip, unsigned int trace_ctx) + { + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { +@@ -202,22 +201,21 @@ __trace_graph_function(struct trace_array *tr, + .rettime = time, + }; + +- __trace_graph_entry(tr, &ent, flags, pc); +- __trace_graph_return(tr, &ret, flags, pc); ++ __trace_graph_entry(tr, &ent, trace_ctx); ++ __trace_graph_return(tr, &ret, trace_ctx); + } + + void + trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { +- __trace_graph_function(tr, ip, flags, pc); ++ __trace_graph_function(tr, ip, trace_ctx); + } + + void 
__trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, +- unsigned long flags, +- int pc) ++ unsigned int trace_ctx) + { + struct trace_event_call *call = &event_funcgraph_exit; + struct ring_buffer_event *event; +@@ -225,7 +223,7 @@ void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); +@@ -239,9 +237,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace) + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; ++ unsigned int trace_ctx; + long disabled; + int cpu; +- int pc; + + ftrace_graph_addr_finish(trace); + +@@ -255,8 +253,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { +- pc = preempt_count(); +- __trace_graph_return(tr, trace, flags, pc); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ __trace_graph_return(tr, trace, trace_ctx); + } + atomic_dec(&data->disabled); + local_irq_restore(flags); +diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c +index c0df9b97f147..34dc1a712dcb 100644 +--- a/kernel/trace/trace_hwlat.c ++++ b/kernel/trace/trace_hwlat.c +@@ -108,14 +108,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct hwlat_entry *entry; +- unsigned long flags; +- int pc; +- +- pc = preempt_count(); +- local_save_flags(flags); + + event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), +- flags, pc); ++ tracing_gen_ctx()); + if (!event) + return; + entry = ring_buffer_event_data(event); +diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c +index 6756379b661f..590b3d51afae 100644 +--- a/kernel/trace/trace_irqsoff.c ++++ b/kernel/trace/trace_irqsoff.c +@@ -143,11 +143,14 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; ++ unsigned int trace_ctx; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + +- trace_function(tr, ip, parent_ip, flags, preempt_count()); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ ++ trace_function(tr, ip, parent_ip, trace_ctx); + + atomic_dec(&data->disabled); + } +@@ -177,8 +180,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; ++ unsigned int trace_ctx; + int ret; +- int pc; + + if (ftrace_graph_ignore_func(trace)) + return 0; +@@ -195,8 +198,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) + if (!func_prolog_dec(tr, &data, &flags)) + return 0; + +- pc = preempt_count(); +- ret = __trace_graph_entry(tr, trace, flags, pc); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ ret = __trace_graph_entry(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + + return ret; +@@ -207,15 +210,15 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + + ftrace_graph_addr_finish(trace); + + if (!func_prolog_dec(tr, &data, &flags)) + return; + +- pc = preempt_count(); +- 
__trace_graph_return(tr, trace, flags, pc); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ __trace_graph_return(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + } + +@@ -267,12 +270,12 @@ static void irqsoff_print_header(struct seq_file *s) + static void + __trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + if (is_graph(tr)) +- trace_graph_function(tr, ip, parent_ip, flags, pc); ++ trace_graph_function(tr, ip, parent_ip, trace_ctx); + else +- trace_function(tr, ip, parent_ip, flags, pc); ++ trace_function(tr, ip, parent_ip, trace_ctx); + } + + #else +@@ -322,15 +325,13 @@ check_critical_timing(struct trace_array *tr, + { + u64 T0, T1, delta; + unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + +- local_save_flags(flags); +- +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + + if (!report_latency(tr, delta)) + goto out; +@@ -341,9 +342,9 @@ check_critical_timing(struct trace_array *tr, + if (!report_latency(tr, delta)) + goto out_unlock; + +- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); ++ __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx); + /* Skip 5 functions to get to the irq/preempt enable function */ +- __trace_stack(tr, flags, 5, pc); ++ __trace_stack(tr, trace_ctx, 5); + + if (data->critical_sequence != max_sequence) + goto out_unlock; +@@ -363,16 +364,15 @@ check_critical_timing(struct trace_array *tr, + out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); +- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); ++ __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx); + } + + static nokprobe_inline void +-start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) ++start_critical_timing(unsigned long ip, unsigned long parent_ip) + { + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; +- unsigned long flags; + + if (!tracer_enabled || !tracing_is_enabled()) + return; +@@ -393,9 +393,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; + +- local_save_flags(flags); +- +- __trace_function(tr, ip, parent_ip, flags, pc); ++ __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); + + per_cpu(tracing_cpu, cpu) = 1; + +@@ -403,12 +401,12 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) + } + + static nokprobe_inline void +-stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) ++stop_critical_timing(unsigned long ip, unsigned long parent_ip) + { + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; +- unsigned long flags; ++ unsigned int trace_ctx; + + cpu = raw_smp_processor_id(); + /* Always clear the tracing cpu on stopping the trace */ +@@ -428,8 +426,8 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) + + atomic_inc(&data->disabled); + +- local_save_flags(flags); +- __trace_function(tr, ip, parent_ip, flags, pc); ++ trace_ctx = tracing_gen_ctx(); ++ __trace_function(tr, ip, parent_ip, trace_ctx); + check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +@@ -438,20 +436,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) + /* start and stop critical timings used to for stoppage (in idle) */ + void start_critical_timings(void) + { +- int pc = preempt_count(); +- +- if (preempt_trace(pc) || irq_trace()) +- start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); ++ if (preempt_trace(preempt_count()) || irq_trace()) ++ start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } + EXPORT_SYMBOL_GPL(start_critical_timings); + NOKPROBE_SYMBOL(start_critical_timings); + + void stop_critical_timings(void) + { +- int pc = preempt_count(); +- +- if (preempt_trace(pc) || irq_trace()) +- stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); ++ if (preempt_trace(preempt_count()) || irq_trace()) ++ stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } + EXPORT_SYMBOL_GPL(stop_critical_timings); + NOKPROBE_SYMBOL(stop_critical_timings); +@@ -613,19 +607,15 @@ static void irqsoff_tracer_stop(struct trace_array *tr) + */ + void tracer_hardirqs_on(unsigned long a0, unsigned long a1) + { +- unsigned int pc = preempt_count(); +- +- if (!preempt_trace(pc) && irq_trace()) +- stop_critical_timing(a0, a1, pc); ++ if (!preempt_trace(preempt_count()) && irq_trace()) ++ stop_critical_timing(a0, a1); + } + NOKPROBE_SYMBOL(tracer_hardirqs_on); + + void tracer_hardirqs_off(unsigned long a0, unsigned long a1) + { +- unsigned int pc = preempt_count(); +- +- if (!preempt_trace(pc) && irq_trace()) +- start_critical_timing(a0, a1, pc); ++ if (!preempt_trace(preempt_count()) && irq_trace()) ++ start_critical_timing(a0, a1); + } + NOKPROBE_SYMBOL(tracer_hardirqs_off); + +@@ -665,18 +655,14 @@ static struct tracer irqsoff_tracer __read_mostly = + #ifdef CONFIG_PREEMPT_TRACER + void tracer_preempt_on(unsigned long a0, unsigned long a1) + { +- int pc = preempt_count(); +- +- if (preempt_trace(pc) && !irq_trace()) +- stop_critical_timing(a0, a1, pc); ++ if (preempt_trace(preempt_count()) && !irq_trace()) ++ stop_critical_timing(a0, a1); + } + + void tracer_preempt_off(unsigned long a0, unsigned long a1) + { +- int pc = preempt_count(); +- +- if (preempt_trace(pc) && !irq_trace()) +- start_critical_timing(a0, a1, pc); ++ if (preempt_trace(preempt_count()) && !irq_trace()) ++ start_critical_timing(a0, a1); + } + + static int preemptoff_tracer_init(struct trace_array *tr) +diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c +index 56c7fbff7bd7..f6c459aba8a6 100644 +--- a/kernel/trace/trace_kprobe.c ++++ b/kernel/trace/trace_kprobe.c +@@ -1386,8 +1386,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, + if (trace_trigger_soft_disabled(trace_file)) + return; - local_irq_save(flags); -- printk_nmi_direct_enter(); +- local_save_flags(fbuffer.flags); +- fbuffer.pc = preempt_count(); ++ fbuffer.trace_ctx = tracing_gen_ctx(); + fbuffer.trace_file = trace_file; + + dsize = __get_data_size(&tk->tp, regs); +@@ -1396,7 +1395,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, + trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, + call->event.type, + sizeof(*entry) + tk->tp.size + dsize, +- fbuffer.flags, fbuffer.pc); ++ fbuffer.trace_ctx); + if (!fbuffer.event) + return; - /* Simulate the iterator */ - trace_init_global_iter(&iter); -@@ -9329,7 +9347,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } - atomic_dec(&dump_running); -- 
printk_nmi_direct_exit(); - local_irq_restore(flags); - } - EXPORT_SYMBOL_GPL(ftrace_dump); -diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h -index 610d21355526..580f2355a835 100644 ---- a/kernel/trace/trace.h -+++ b/kernel/trace/trace.h -@@ -143,6 +143,7 @@ struct kretprobe_trace_entry_head { - * NEED_RESCHED - reschedule is requested - * HARDIRQ - inside an interrupt handler - * SOFTIRQ - inside a softirq handler -+ * NEED_RESCHED_LAZY - lazy reschedule is requested - */ - enum trace_flag_type { - TRACE_FLAG_IRQS_OFF = 0x01, -@@ -152,6 +153,7 @@ enum trace_flag_type { - TRACE_FLAG_SOFTIRQ = 0x10, - TRACE_FLAG_PREEMPT_RESCHED = 0x20, - TRACE_FLAG_NMI = 0x40, -+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80, - }; +@@ -1434,8 +1433,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + if (trace_trigger_soft_disabled(trace_file)) + return; - #define TRACE_BUF_SIZE 1024 -diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c -index a85effb2373b..a88a78c240bb 100644 ---- a/kernel/trace/trace_events.c -+++ b/kernel/trace/trace_events.c -@@ -182,6 +182,8 @@ static int trace_define_common_fields(void) - __common_field(unsigned char, flags); - __common_field(unsigned char, preempt_count); - __common_field(int, pid); -+ __common_field(unsigned char, migrate_disable); -+ __common_field(unsigned char, preempt_lazy_count); +- local_save_flags(fbuffer.flags); +- fbuffer.pc = preempt_count(); ++ fbuffer.trace_ctx = tracing_gen_ctx(); + fbuffer.trace_file = trace_file; + + dsize = __get_data_size(&tk->tp, regs); +@@ -1443,7 +1441,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, + call->event.type, + sizeof(*entry) + tk->tp.size + dsize, +- fbuffer.flags, fbuffer.pc); ++ fbuffer.trace_ctx); + if (!fbuffer.event) + return; - return ret; +diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c +index 84582bf1ed5f..7221ae0b4c47 100644 +--- a/kernel/trace/trace_mmiotrace.c ++++ b/kernel/trace/trace_mmiotrace.c +@@ -300,10 +300,11 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct trace_mmiotrace_rw *entry; +- int pc = preempt_count(); ++ unsigned int trace_ctx; + ++ trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, +- sizeof(*entry), 0, pc); ++ sizeof(*entry), trace_ctx); + if (!event) { + atomic_inc(&dropped_count); + return; +@@ -312,7 +313,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, + entry->rw = *rw; + + if (!call_filter_check_discard(call, entry, buffer, event)) +- trace_buffer_unlock_commit(tr, buffer, event, 0, pc); ++ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + } + + void mmio_trace_rw(struct mmiotrace_rw *rw) +@@ -330,10 +331,11 @@ static void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct trace_mmiotrace_map *entry; +- int pc = preempt_count(); ++ unsigned int trace_ctx; + ++ trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, +- sizeof(*entry), 0, pc); ++ sizeof(*entry), trace_ctx); + if (!event) { + atomic_inc(&dropped_count); + return; +@@ -342,7 +344,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, + entry->map = *map; + + if (!call_filter_check_discard(call, entry, buffer, event)) +- 
trace_buffer_unlock_commit(tr, buffer, event, 0, pc); ++ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } + + void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c -index 000e9dc224c6..bc24ae8e3613 100644 +index 92b1575ae0ca..f80fa69b6943 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -441,6 +441,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) @@ -21708,26 +17295,348 @@ index 000e9dc224c6..bc24ae8e3613 100644 return !trace_seq_has_overflowed(s); } -diff --git a/kernel/workqueue.c b/kernel/workqueue.c -index c41c3c17b86a..bb8a84dbabb9 100644 ---- a/kernel/workqueue.c -+++ b/kernel/workqueue.c -@@ -4905,6 +4905,10 @@ static void unbind_workers(int cpu) - pool->flags |= POOL_DISASSOCIATED; +diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c +index c0181066dbe9..e5778d1d7a5b 100644 +--- a/kernel/trace/trace_sched_wakeup.c ++++ b/kernel/trace/trace_sched_wakeup.c +@@ -67,7 +67,7 @@ static bool function_enabled; + static int + func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, +- int *pc) ++ unsigned int *trace_ctx) + { + long disabled; + int cpu; +@@ -75,7 +75,7 @@ func_prolog_preempt_disable(struct trace_array *tr, + if (likely(!wakeup_task)) + return 0; + +- *pc = preempt_count(); ++ *trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); +@@ -116,8 +116,8 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) + { + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; +- unsigned long flags; +- int pc, ret = 0; ++ unsigned int trace_ctx; ++ int ret = 0; + + if (ftrace_graph_ignore_func(trace)) + return 0; +@@ -131,11 +131,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + +- if (!func_prolog_preempt_disable(tr, &data, &pc)) ++ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) + return 0; + +- local_save_flags(flags); +- ret = __trace_graph_entry(tr, trace, flags, pc); ++ ret = __trace_graph_entry(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + +@@ -146,16 +145,14 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) + { + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; +- unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + + ftrace_graph_addr_finish(trace); + +- if (!func_prolog_preempt_disable(tr, &data, &pc)) ++ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) + return; + +- local_save_flags(flags); +- __trace_graph_return(tr, trace, flags, pc); ++ __trace_graph_return(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); +@@ -217,13 +214,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + +- if (!func_prolog_preempt_disable(tr, &data, &pc)) ++ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) + return; + + local_irq_save(flags); +- trace_function(tr, ip, parent_ip, flags, pc); ++ trace_function(tr, ip, parent_ip, trace_ctx); + local_irq_restore(flags); + + atomic_dec(&data->disabled); +@@ -303,12 +300,12 @@ static void wakeup_print_header(struct seq_file *s) + static void + __trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, +- 
unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + if (is_graph(tr)) +- trace_graph_function(tr, ip, parent_ip, flags, pc); ++ trace_graph_function(tr, ip, parent_ip, trace_ctx); + else +- trace_function(tr, ip, parent_ip, flags, pc); ++ trace_function(tr, ip, parent_ip, trace_ctx); + } + + static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) +@@ -375,7 +372,7 @@ static void + tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + struct trace_event_call *call = &event_context_switch; + struct trace_buffer *buffer = tr->array_buffer.buffer; +@@ -383,7 +380,7 @@ tracing_sched_switch_trace(struct trace_array *tr, + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); +@@ -396,14 +393,14 @@ tracing_sched_switch_trace(struct trace_array *tr, + entry->next_cpu = task_cpu(next); + + if (!call_filter_check_discard(call, entry, buffer, event)) +- trace_buffer_unlock_commit(tr, buffer, event, flags, pc); ++ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + } + + static void + tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + struct trace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; +@@ -411,7 +408,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_buffer *buffer = tr->array_buffer.buffer; + + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); +@@ -424,7 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, + entry->next_cpu = task_cpu(wakee); + + if (!call_filter_check_discard(call, entry, buffer, event)) +- trace_buffer_unlock_commit(tr, buffer, event, flags, pc); ++ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + } + + static void notrace +@@ -436,7 +433,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, + unsigned long flags; + long disabled; + int cpu; +- int pc; ++ unsigned int trace_ctx; + + tracing_record_cmdline(prev); + +@@ -455,8 +452,6 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, + if (next != wakeup_task) + return; + +- pc = preempt_count(); +- + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); +@@ -464,6 +459,8 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, + goto out; - raw_spin_unlock_irq(&pool->lock); + local_irq_save(flags); ++ trace_ctx = tracing_gen_ctx_flags(flags); + -+ for_each_pool_worker(worker, pool) -+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); + arch_spin_lock(&wakeup_lock); + + /* We could race with grabbing wakeup_lock */ +@@ -473,9 +470,9 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, + /* The task we are waiting for is waking up */ + data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); + +- __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); +- tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); +- __trace_stack(wakeup_trace, flags, 0, pc); ++ __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, 
trace_ctx); ++ tracing_sched_switch_trace(wakeup_trace, prev, next, trace_ctx); ++ __trace_stack(wakeup_trace, trace_ctx, 0); + + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); +@@ -527,9 +524,8 @@ probe_wakeup(void *ignore, struct task_struct *p) + { + struct trace_array_cpu *data; + int cpu = smp_processor_id(); +- unsigned long flags; + long disabled; +- int pc; ++ unsigned int trace_ctx; + + if (likely(!tracer_enabled)) + return; +@@ -550,11 +546,12 @@ probe_wakeup(void *ignore, struct task_struct *p) + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) + return; + +- pc = preempt_count(); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + if (unlikely(disabled != 1)) + goto out; + ++ trace_ctx = tracing_gen_ctx(); + - mutex_unlock(&wq_pool_attach_mutex); + /* interrupts should be off from try_to_wake_up */ + arch_spin_lock(&wakeup_lock); - /* +@@ -581,19 +578,17 @@ probe_wakeup(void *ignore, struct task_struct *p) + + wakeup_task = get_task_struct(p); + +- local_save_flags(flags); +- + data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); + data->preempt_timestamp = ftrace_now(cpu); +- tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); +- __trace_stack(wakeup_trace, flags, 0, pc); ++ tracing_sched_wakeup_trace(wakeup_trace, p, current, trace_ctx); ++ __trace_stack(wakeup_trace, trace_ctx, 0); + + /* + * We must be careful in using CALLER_ADDR2. But since wake_up + * is not called by an assembly function (where as schedule is) + * it should be safe to use it here. + */ +- __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); ++ __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, trace_ctx); + + out_locked: + arch_spin_unlock(&wakeup_lock); +diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c +index d85a2f0f316b..8bfcd3b09422 100644 +--- a/kernel/trace/trace_syscalls.c ++++ b/kernel/trace/trace_syscalls.c +@@ -298,9 +298,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + struct trace_buffer *buffer; +- unsigned long irq_flags; ++ unsigned int trace_ctx; + unsigned long args[6]; +- int pc; + int syscall_nr; + int size; + +@@ -322,12 +321,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) + + size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + +- local_save_flags(irq_flags); +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + + buffer = tr->array_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, +- sys_data->enter_event->event.type, size, irq_flags, pc); ++ sys_data->enter_event->event.type, size, trace_ctx); + if (!event) + return; + +@@ -337,7 +335,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) + memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); + + event_trigger_unlock_commit(trace_file, buffer, event, entry, +- irq_flags, pc); ++ trace_ctx); + } + + static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) +@@ -348,8 +346,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + struct trace_buffer *buffer; +- unsigned long irq_flags; +- int pc; ++ unsigned int trace_ctx; + int syscall_nr; + + syscall_nr = trace_get_syscall_nr(current, regs); +@@ -368,13 +365,12 @@ static void ftrace_syscall_exit(void *data, 
struct pt_regs *regs, long ret) + if (!sys_data) + return; + +- local_save_flags(irq_flags); +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + + buffer = tr->array_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, + sys_data->exit_event->event.type, sizeof(*entry), +- irq_flags, pc); ++ trace_ctx); + if (!event) + return; + +@@ -383,7 +379,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) + entry->ret = syscall_get_return_value(current, regs); + + event_trigger_unlock_commit(trace_file, buffer, event, entry, +- irq_flags, pc); ++ trace_ctx); + } + + static int reg_event_syscall_enter(struct trace_event_file *file, +diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c +index 3cf7128e1ad3..a1ed96a7a462 100644 +--- a/kernel/trace/trace_uprobe.c ++++ b/kernel/trace/trace_uprobe.c +@@ -961,7 +961,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + size = esize + tu->tp.size + dsize; + event = trace_event_buffer_lock_reserve(&buffer, trace_file, +- call->event.type, size, 0, 0); ++ call->event.type, size, 0); + if (!event) + return; + +@@ -977,7 +977,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, + + memcpy(data, ucb->buf, tu->tp.size + dsize); + +- event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0); ++ event_trigger_unlock_commit(trace_file, buffer, event, entry, 0); + } + + /* uprobe handler */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 0c781f912f9f..16fcda68c2b6 100644 +index 7937265ef879..74c7913df9dd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug -@@ -1332,7 +1332,7 @@ config DEBUG_ATOMIC_SLEEP +@@ -1353,7 +1353,7 @@ config DEBUG_ATOMIC_SLEEP config DEBUG_LOCKING_API_SELFTESTS bool "Locking API boot-time self-tests" @@ -21736,37 +17645,23 @@ index 0c781f912f9f..16fcda68c2b6 100644 help Say Y here if you want the kernel to run a short self-test during bootup. The self-test checks whether common types of locking bugs -diff --git a/lib/cpumask.c b/lib/cpumask.c -index 85da6ab4fbb5..35924025097b 100644 ---- a/lib/cpumask.c -+++ b/lib/cpumask.c -@@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p, - return next; +diff --git a/lib/bug.c b/lib/bug.c +index 7103440c0ee1..baf61c307a6a 100644 +--- a/lib/bug.c ++++ b/lib/bug.c +@@ -205,6 +205,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) + else + pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", + (void *)bugaddr); ++ pr_flush(1000, true); + + return BUG_TRAP_TYPE_BUG; } - EXPORT_SYMBOL(cpumask_any_and_distribute); -+ -+int cpumask_any_distribute(const struct cpumask *srcp) -+{ -+ int next, prev; -+ -+ /* NOTE: our first selection will skip 0. 
*/ -+ prev = __this_cpu_read(distribute_cpu_mask_prev); -+ -+ next = cpumask_next(prev, srcp); -+ if (next >= nr_cpu_ids) -+ next = cpumask_first(srcp); -+ -+ if (next < nr_cpu_ids) -+ __this_cpu_write(distribute_cpu_mask_prev, next); -+ -+ return next; -+} -+EXPORT_SYMBOL(cpumask_any_distribute); diff --git a/lib/debugobjects.c b/lib/debugobjects.c -index fe4557955d97..f8bfd257d0bb 100644 +index 9e14ae02306b..083882a3cf2f 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c -@@ -537,7 +537,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) +@@ -557,7 +557,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack struct debug_obj *obj; unsigned long flags; @@ -21778,26 +17673,6 @@ index fe4557955d97..f8bfd257d0bb 100644 db = get_bucket((unsigned long) addr); -diff --git a/lib/dump_stack.c b/lib/dump_stack.c -index a00ee6eedc7c..f5a33b6f773f 100644 ---- a/lib/dump_stack.c -+++ b/lib/dump_stack.c -@@ -12,6 +12,7 @@ - #include <linux/atomic.h> - #include <linux/kexec.h> - #include <linux/utsname.h> -+#include <linux/stop_machine.h> - - static char dump_stack_arch_desc_str[128]; - -@@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl) - log_lvl, dump_stack_arch_desc_str); - - print_worker_info(log_lvl, current); -+ print_stop_info(log_lvl, current); - } - - /** diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 2f17b488d58e..7557bf7ecf1f 100644 --- a/lib/irq_poll.c @@ -21843,10 +17718,10 @@ index 2f17b488d58e..7557bf7ecf1f 100644 return 0; } diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c -index 14f44f59e733..b110d073d2d2 100644 +index 9959ea23529e..716a83aa79d8 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c -@@ -742,6 +742,8 @@ GENERATE_TESTCASE(init_held_rtmutex); +@@ -787,6 +787,8 @@ GENERATE_TESTCASE(init_held_rtmutex); #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin) @@ -21855,7 +17730,7 @@ index 14f44f59e733..b110d073d2d2 100644 #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) -@@ -757,9 +759,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) +@@ -802,9 +804,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) @@ -21868,7 +17743,7 @@ index 14f44f59e733..b110d073d2d2 100644 /* * Enabling hardirqs with a softirq-safe lock held: */ -@@ -792,6 +797,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) +@@ -837,6 +842,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) #undef E1 #undef E2 @@ -21877,7 +17752,7 @@ index 14f44f59e733..b110d073d2d2 100644 /* * Enabling irqs with an irq-safe lock held: */ -@@ -815,6 +822,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) +@@ -860,6 +867,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin) @@ -21886,7 +17761,7 @@ index 14f44f59e733..b110d073d2d2 100644 #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) -@@ -830,6 +839,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) +@@ -875,6 +884,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) @@ -21895,7 +17770,7 @@ index 14f44f59e733..b110d073d2d2 100644 #undef E1 #undef E2 -@@ -861,6 +872,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) +@@ 
-906,6 +917,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin) @@ -21904,7 +17779,7 @@ index 14f44f59e733..b110d073d2d2 100644 #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) -@@ -876,6 +889,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) +@@ -921,6 +934,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) @@ -21913,7 +17788,7 @@ index 14f44f59e733..b110d073d2d2 100644 #undef E1 #undef E2 #undef E3 -@@ -909,6 +924,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) +@@ -954,6 +969,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin) @@ -21922,7 +17797,7 @@ index 14f44f59e733..b110d073d2d2 100644 #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) -@@ -924,10 +941,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) +@@ -969,10 +986,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) @@ -21937,10 +17812,11 @@ index 14f44f59e733..b110d073d2d2 100644 /* * read-lock / write-lock irq inversion. * -@@ -990,6 +1011,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) +@@ -1162,6 +1183,11 @@ GENERATE_PERMUTATIONS_3_EVENTS(W1W2_R2R3_R3W1) + #undef E1 #undef E2 #undef E3 - ++ +#endif + +#ifndef CONFIG_PREEMPT_RT @@ -21948,7 +17824,7 @@ index 14f44f59e733..b110d073d2d2 100644 /* * read-lock / write-lock recursion that is actually safe. */ -@@ -1028,6 +1053,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) +@@ -1208,6 +1234,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock) #undef E2 #undef E3 @@ -21957,7 +17833,7 @@ index 14f44f59e733..b110d073d2d2 100644 /* * read-lock / write-lock recursion that is unsafe. 
*/ -@@ -2058,6 +2085,7 @@ void locking_selftest(void) +@@ -2517,6 +2545,7 @@ void locking_selftest(void) printk(" --------------------------------------------------------------------------\n"); @@ -21965,10 +17841,10 @@ index 14f44f59e733..b110d073d2d2 100644 /* * irq-context testcases: */ -@@ -2070,6 +2098,28 @@ void locking_selftest(void) +@@ -2531,6 +2560,28 @@ void locking_selftest(void) + DO_TESTCASE_6x2x2RW("irq read-recursion #2", irq_read_recursion2); + DO_TESTCASE_6x2x2RW("irq read-recursion #3", irq_read_recursion3); - DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); - // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); +#else + /* On -rt, we only do hardirq context test for raw spinlock */ + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12); @@ -21991,11 +17867,11 @@ index 14f44f59e733..b110d073d2d2 100644 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321); +#endif - ww_tests(); + force_read_lock_recursive = 0; diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c -index 15ca78e1c7d4..77bf84987cda 100644 +index 8abe1870dba4..b09a490f5f70 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, @@ -22012,10 +17888,10 @@ index 15ca78e1c7d4..77bf84987cda 100644 put_cpu(); } diff --git a/lib/scatterlist.c b/lib/scatterlist.c -index 5d63a8857f36..5569dac27afe 100644 +index a59778946404..907f59045998 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c -@@ -811,7 +811,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) +@@ -892,7 +892,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) flush_kernel_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { @@ -22024,24 +17900,8 @@ index 5d63a8857f36..5569dac27afe 100644 kunmap_atomic(miter->addr); } else kunmap(miter->page); -diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c -index 525222e4f409..faaa927ac2c8 100644 ---- a/lib/smp_processor_id.c -+++ b/lib/smp_processor_id.c -@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) - if (current->nr_cpus_allowed == 1) - goto out; - -+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) -+ if (current->migration_disabled) -+ goto out; -+#endif -+ - /* - * It is valid to assume CPU-locality during early bootup: - */ diff --git a/mm/Kconfig b/mm/Kconfig -index 6c974888f86f..056460878a2b 100644 +index f730605b8dcf..97bce365ae26 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -387,7 +387,7 @@ config NOMMU_INITIAL_TRIM_EXCESS @@ -22053,36 +17913,11 @@ index 6c974888f86f..056460878a2b 100644 select COMPACTION select XARRAY_MULTI help -diff --git a/mm/highmem.c b/mm/highmem.c -index 64d8dea47dd1..7d3065719ce8 100644 ---- a/mm/highmem.c -+++ b/mm/highmem.c -@@ -31,8 +31,11 @@ - #include <asm/tlbflush.h> - #include <linux/vmalloc.h> - -+#ifndef CONFIG_PREEMPT_RT - #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) - DEFINE_PER_CPU(int, __kmap_atomic_idx); -+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); -+#endif - #endif - - /* -@@ -108,8 +111,6 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) - atomic_long_t _totalhigh_pages __read_mostly; - EXPORT_SYMBOL(_totalhigh_pages); - --EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); -- - unsigned int nr_free_highpages (void) - { - struct zone *zone; diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 6877c765b8d0..f35ffe52cc4f 100644 +index 
913c2b9e5c72..da25645e5c47 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c -@@ -63,6 +63,7 @@ +@@ -66,6 +66,7 @@ #include <net/sock.h> #include <net/ip.h> #include "slab.h" @@ -22090,7 +17925,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 #include <linux/uaccess.h> -@@ -90,6 +91,13 @@ bool cgroup_memory_noswap __read_mostly; +@@ -96,6 +97,13 @@ bool cgroup_memory_noswap __read_mostly; static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -22104,7 +17939,23 @@ index 6877c765b8d0..f35ffe52cc4f 100644 /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { -@@ -2154,6 +2162,7 @@ void unlock_page_memcg(struct page *page) +@@ -805,6 +813,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = pn->memcg; + ++ preempt_disable_rt(); + /* Update memcg */ + __mod_memcg_state(memcg, idx, val); + +@@ -824,6 +833,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + x = 0; + } + __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); ++ preempt_enable_rt(); + } + + /** +@@ -2236,6 +2246,7 @@ void unlock_page_memcg(struct page *page) EXPORT_SYMBOL(unlock_page_memcg); struct memcg_stock_pcp { @@ -22112,7 +17963,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; -@@ -2205,7 +2214,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +@@ -2287,7 +2298,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (nr_pages > MEMCG_CHARGE_BATCH) return ret; @@ -22121,7 +17972,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { -@@ -2213,7 +2222,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +@@ -2295,7 +2306,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ret = true; } @@ -22130,7 +17981,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 return ret; } -@@ -2248,14 +2257,14 @@ static void drain_local_stock(struct work_struct *dummy) +@@ -2330,14 +2341,14 @@ static void drain_local_stock(struct work_struct *dummy) * The only protection from memory hotplug vs. drain_stock races is * that we always operate on local CPU stock here with IRQ disabled */ @@ -22147,7 +17998,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 } /* -@@ -2267,7 +2276,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +@@ -2349,7 +2360,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) struct memcg_stock_pcp *stock; unsigned long flags; @@ -22156,7 +18007,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ -@@ -2280,7 +2289,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +@@ -2362,7 +2373,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); @@ -22165,7 +18016,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 } /* -@@ -2300,7 +2309,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) +@@ -2382,7 +2393,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) * as well as workers from this path always operate on the local * per-cpu data. CPU up doesn't touch memcg_stock at all. 
*/ @@ -22174,7 +18025,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; -@@ -2323,7 +2332,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) +@@ -2405,7 +2416,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) schedule_work_on(cpu, &stock->work); } } @@ -22183,7 +18034,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 mutex_unlock(&percpu_charge_mutex); } -@@ -3084,7 +3093,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +@@ -3169,7 +3180,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) unsigned long flags; bool ret = false; @@ -22192,7 +18043,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 stock = this_cpu_ptr(&memcg_stock); if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { -@@ -3092,7 +3101,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +@@ -3177,7 +3188,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) ret = true; } @@ -22201,7 +18052,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 return ret; } -@@ -3151,7 +3160,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +@@ -3236,7 +3247,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) struct memcg_stock_pcp *stock; unsigned long flags; @@ -22210,7 +18061,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 stock = this_cpu_ptr(&memcg_stock); if (stock->cached_objcg != objcg) { /* reset if necessary */ -@@ -3165,7 +3174,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +@@ -3250,7 +3261,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) if (stock->nr_bytes > PAGE_SIZE) drain_obj_stock(stock); @@ -22219,7 +18070,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 } int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) -@@ -5681,12 +5690,12 @@ static int mem_cgroup_move_account(struct page *page, +@@ -5699,12 +5710,12 @@ static int mem_cgroup_move_account(struct page *page, ret = 0; @@ -22234,7 +18085,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 out_unlock: unlock_page(page); out: -@@ -6722,10 +6731,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) +@@ -6755,10 +6766,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) css_get(&memcg->css); commit_charge(page, memcg); @@ -22247,7 +18098,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 if (PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; -@@ -6769,11 +6778,11 @@ static void uncharge_batch(const struct uncharge_gather *ug) +@@ -6802,11 +6813,11 @@ static void uncharge_batch(const struct uncharge_gather *ug) memcg_oom_recover(ug->memcg); } @@ -22261,7 +18112,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 /* drop reference from uncharge_page */ css_put(&ug->memcg->css); -@@ -6927,10 +6936,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) +@@ -6958,10 +6969,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) css_get(&memcg->css); commit_charge(newpage, memcg); @@ -22274,7 +18125,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 } DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); -@@ -7050,9 +7059,13 @@ static int __init mem_cgroup_init(void) +@@ -7081,9 +7092,13 @@ static int __init mem_cgroup_init(void) cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, 
"mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); @@ -22291,7 +18142,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 for_each_node(node) { struct mem_cgroup_tree_per_node *rtpn; -@@ -7101,6 +7114,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +@@ -7132,6 +7147,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) struct mem_cgroup *memcg, *swap_memcg; unsigned int nr_entries; unsigned short oldid; @@ -22299,7 +18150,7 @@ index 6877c765b8d0..f35ffe52cc4f 100644 VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); -@@ -7146,9 +7160,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +@@ -7180,9 +7196,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * important here to have the interrupts disabled because it is the * only synchronisation we have for updating the per-CPU variables. */ @@ -22314,10 +18165,10 @@ index 6877c765b8d0..f35ffe52cc4f 100644 css_put(&memcg->css); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 780c8f023b28..f581204e3f35 100644 +index 519a60d5b6f7..36f314ae0e56 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c -@@ -61,6 +61,7 @@ +@@ -62,6 +62,7 @@ #include <linux/hugetlb.h> #include <linux/sched/rt.h> #include <linux/sched/mm.h> @@ -22325,9 +18176,9 @@ index 780c8f023b28..f581204e3f35 100644 #include <linux/page_owner.h> #include <linux/kthread.h> #include <linux/memcontrol.h> -@@ -357,6 +358,13 @@ EXPORT_SYMBOL(nr_node_ids); - EXPORT_SYMBOL(nr_online_nodes); - #endif +@@ -363,6 +364,13 @@ EXPORT_SYMBOL(nr_online_nodes); + + int page_group_by_mobility_disabled __read_mostly; +struct pa_lock { + local_lock_t l; @@ -22336,178 +18187,55 @@ index 780c8f023b28..f581204e3f35 100644 + .l = INIT_LOCAL_LOCK(l), +}; + - int page_group_by_mobility_disabled __read_mostly; - #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -@@ -1283,7 +1291,7 @@ static inline void prefetch_buddy(struct page *page) - } - /* -- * Frees a number of pages from the PCP lists -+ * Frees a number of pages which have been collected from the pcp lists. - * Assumes all pages on list are in same zone, and of same order. - * count is the number of pages to free. - * -@@ -1293,15 +1301,56 @@ static inline void prefetch_buddy(struct page *page) - * And clear the zone's pages_scanned counter, to hold off the "all pages are - * pinned" detection logic. - */ --static void free_pcppages_bulk(struct zone *zone, int count, -- struct per_cpu_pages *pcp) -+static void free_pcppages_bulk(struct zone *zone, struct list_head *head, -+ bool zone_retry) -+{ -+ bool isolated_pageblocks; -+ struct page *page, *tmp; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&zone->lock, flags); -+ isolated_pageblocks = has_isolate_pageblock(zone); -+ -+ /* -+ * Use safe version since after __free_one_page(), -+ * page->lru.next will not point to original list. -+ */ -+ list_for_each_entry_safe(page, tmp, head, lru) { -+ int mt = get_pcppage_migratetype(page); -+ -+ if (page_zone(page) != zone) { -+ /* -+ * free_unref_page_list() sorts pages by zone. If we end -+ * up with pages from a different NUMA nodes belonging -+ * to the same ZONE index then we need to redo with the -+ * correct ZONE pointer. Skip the page for now, redo it -+ * on the next iteration. 
-+ */ -+ WARN_ON_ONCE(zone_retry == false); -+ if (zone_retry) -+ continue; -+ } -+ -+ /* MIGRATE_ISOLATE page should not go to pcplists */ -+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); -+ /* Pageblock could have been isolated meanwhile */ -+ if (unlikely(isolated_pageblocks)) -+ mt = get_pageblock_migratetype(page); -+ -+ list_del(&page->lru); -+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, true); -+ trace_mm_page_pcpu_drain(page, 0, mt); -+ } -+ spin_unlock_irqrestore(&zone->lock, flags); -+} -+ -+static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp, -+ struct list_head *dst) - { - int migratetype = 0; - int batch_free = 0; - int prefetch_nr = 0; -- bool isolated_pageblocks; -- struct page *page, *tmp; -- LIST_HEAD(head); -+ struct page *page; - - /* - * Ensure proper count is passed which otherwise would stuck in the -@@ -1338,7 +1387,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, - if (bulkfree_pcp_prepare(page)) - continue; - -- list_add_tail(&page->lru, &head); -+ list_add_tail(&page->lru, dst); - - /* - * We are going to put the page back to the global -@@ -1353,26 +1402,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, - prefetch_buddy(page); - } while (--count && --batch_free && !list_empty(list)); - } -- -- spin_lock(&zone->lock); -- isolated_pageblocks = has_isolate_pageblock(zone); -- -- /* -- * Use safe version since after __free_one_page(), -- * page->lru.next will not point to original list. -- */ -- list_for_each_entry_safe(page, tmp, &head, lru) { -- int mt = get_pcppage_migratetype(page); -- /* MIGRATE_ISOLATE page should not go to pcplists */ -- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); -- /* Pageblock could have been isolated meanwhile */ -- if (unlikely(isolated_pageblocks)) -- mt = get_pageblock_migratetype(page); -- -- __free_one_page(page, page_to_pfn(page), zone, 0, mt, true); -- trace_mm_page_pcpu_drain(page, 0, mt); -- } -- spin_unlock(&zone->lock); - } - - static void free_one_page(struct zone *zone, -@@ -1473,10 +1502,10 @@ static void __free_pages_ok(struct page *page, unsigned int order) + * During boot we initialize deferred pages on-demand, as needed, but once +@@ -1537,11 +1545,11 @@ static void __free_pages_ok(struct page *page, unsigned int order, return; migratetype = get_pfnblock_migratetype(page, pfn); - local_irq_save(flags); + local_lock_irqsave(&pa_lock.l, flags); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, pfn, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype, + fpi_flags); - local_irq_restore(flags); + local_unlock_irqrestore(&pa_lock.l, flags); } void __free_pages_core(struct page *page, unsigned int order) -@@ -2877,13 +2906,18 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) - { +@@ -2957,12 +2965,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) unsigned long flags; int to_drain, batch; -+ LIST_HEAD(dst); - local_irq_save(flags); + local_lock_irqsave(&pa_lock.l, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) -- free_pcppages_bulk(zone, to_drain, pcp); + free_pcppages_bulk(zone, to_drain, pcp); - local_irq_restore(flags); -+ isolate_pcp_pages(to_drain, pcp, &dst); -+ + local_unlock_irqrestore(&pa_lock.l, flags); -+ -+ if (to_drain > 0) -+ free_pcppages_bulk(zone, &dst, false); } #endif -@@ -2899,14 +2933,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) - unsigned long flags; +@@ -2979,13 +2987,13 @@ 
static void drain_pages_zone(unsigned int cpu, struct zone *zone) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; -+ LIST_HEAD(dst); -+ int count; - local_irq_save(flags); + local_lock_irqsave(&pa_lock.l, flags); pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; -- if (pcp->count) -- free_pcppages_bulk(zone, pcp->count, pcp); + if (pcp->count) + free_pcppages_bulk(zone, pcp->count, pcp); - local_irq_restore(flags); -+ count = pcp->count; -+ if (count) -+ isolate_pcp_pages(count, pcp, &dst); -+ + local_unlock_irqrestore(&pa_lock.l, flags); -+ -+ if (count) -+ free_pcppages_bulk(zone, &dst, false); } /* -@@ -2954,9 +2995,9 @@ static void drain_local_pages_wq(struct work_struct *work) +@@ -3033,9 +3041,9 @@ static void drain_local_pages_wq(struct work_struct *work) * cpu which is allright but we also have to make sure to not move to * a different one. */ @@ -22519,60 +18247,19 @@ index 780c8f023b28..f581204e3f35 100644 } /* -@@ -3105,7 +3146,8 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) - return true; - } - --static void free_unref_page_commit(struct page *page, unsigned long pfn) -+static void free_unref_page_commit(struct page *page, unsigned long pfn, -+ struct list_head *dst) - { - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; -@@ -3134,7 +3176,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) - pcp->count++; - if (pcp->count >= pcp->high) { - unsigned long batch = READ_ONCE(pcp->batch); -- free_pcppages_bulk(zone, batch, pcp); -+ -+ isolate_pcp_pages(batch, pcp, dst); - } - } - -@@ -3145,13 +3188,17 @@ void free_unref_page(struct page *page) - { - unsigned long flags; - unsigned long pfn = page_to_pfn(page); -+ struct zone *zone = page_zone(page); -+ LIST_HEAD(dst); - +@@ -3248,9 +3256,9 @@ void free_unref_page(struct page *page) if (!free_unref_page_prepare(page, pfn)) return; - local_irq_save(flags); -- free_unref_page_commit(page, pfn); -- local_irq_restore(flags); + local_lock_irqsave(&pa_lock.l, flags); -+ free_unref_page_commit(page, pfn, &dst); + free_unref_page_commit(page, pfn); +- local_irq_restore(flags); + local_unlock_irqrestore(&pa_lock.l, flags); -+ if (!list_empty(&dst)) -+ free_pcppages_bulk(zone, &dst, false); } /* -@@ -3162,6 +3209,11 @@ void free_unref_page_list(struct list_head *list) - struct page *page, *next; - unsigned long flags, pfn; - int batch_count = 0; -+ struct list_head dsts[__MAX_NR_ZONES]; -+ int i; -+ -+ for (i = 0; i < __MAX_NR_ZONES; i++) -+ INIT_LIST_HEAD(&dsts[i]); - - /* Prepare pages for freeing */ - list_for_each_entry_safe(page, next, list, lru) { -@@ -3171,25 +3223,42 @@ void free_unref_page_list(struct list_head *list) +@@ -3270,7 +3278,7 @@ void free_unref_page_list(struct list_head *list) set_page_private(page, pfn); } @@ -22580,16 +18267,8 @@ index 780c8f023b28..f581204e3f35 100644 + local_lock_irqsave(&pa_lock.l, flags); list_for_each_entry_safe(page, next, list, lru) { unsigned long pfn = page_private(page); -+ enum zone_type type; - set_page_private(page, 0); - trace_mm_page_free_batched(page); -- free_unref_page_commit(page, pfn); -+ type = page_zonenum(page); -+ free_unref_page_commit(page, pfn, &dsts[type]); - - /* - * Guard against excessive IRQ disabled times when we get +@@ -3283,12 +3291,12 @@ void free_unref_page_list(struct list_head *list) * a large list of pages to free. 
*/ if (++batch_count == SWAP_CLUSTER_MAX) { @@ -22602,25 +18281,10 @@ index 780c8f023b28..f581204e3f35 100644 } - local_irq_restore(flags); + local_unlock_irqrestore(&pa_lock.l, flags); -+ -+ for (i = 0; i < __MAX_NR_ZONES; ) { -+ struct page *page; -+ struct zone *zone; -+ -+ if (list_empty(&dsts[i])) { -+ i++; -+ continue; -+ } -+ -+ page = list_first_entry(&dsts[i], struct page, lru); -+ zone = page_zone(page); -+ -+ free_pcppages_bulk(zone, &dsts[i], true); -+ } } /* -@@ -3343,7 +3412,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, +@@ -3443,7 +3451,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct page *page; unsigned long flags; @@ -22629,7 +18293,7 @@ index 780c8f023b28..f581204e3f35 100644 pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); -@@ -3351,7 +3420,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, +@@ -3451,7 +3459,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); } @@ -22638,17 +18302,24 @@ index 780c8f023b28..f581204e3f35 100644 return page; } -@@ -3385,7 +3454,8 @@ struct page *rmqueue(struct zone *preferred_zone, +@@ -3485,7 +3493,9 @@ struct page *rmqueue(struct zone *preferred_zone, * allocate greater than order-1 page units with __GFP_NOFAIL. */ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - spin_lock_irqsave(&zone->lock, flags); ++ + local_lock_irqsave(&pa_lock.l, flags); + spin_lock(&zone->lock); do { page = NULL; -@@ -3411,7 +3481,7 @@ struct page *rmqueue(struct zone *preferred_zone, +@@ -3506,12 +3516,13 @@ struct page *rmqueue(struct zone *preferred_zone, + spin_unlock(&zone->lock); + if (!page) + goto failed; ++ + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); @@ -22657,7 +18328,7 @@ index 780c8f023b28..f581204e3f35 100644 out: /* Separate test+clear to avoid unnecessary atomics */ -@@ -3424,7 +3494,7 @@ struct page *rmqueue(struct zone *preferred_zone, +@@ -3524,7 +3535,7 @@ struct page *rmqueue(struct zone *preferred_zone, return page; failed: @@ -22666,7 +18337,7 @@ index 780c8f023b28..f581204e3f35 100644 return NULL; } -@@ -8697,7 +8767,7 @@ void zone_pcp_reset(struct zone *zone) +@@ -8828,7 +8839,7 @@ void zone_pcp_reset(struct zone *zone) struct per_cpu_pageset *pset; /* avoid races with drain_pages() */ @@ -22675,7 +18346,7 @@ index 780c8f023b28..f581204e3f35 100644 if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); -@@ -8706,7 +8776,7 @@ void zone_pcp_reset(struct zone *zone) +@@ -8837,7 +8848,7 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } @@ -22685,7 +18356,7 @@ index 780c8f023b28..f581204e3f35 100644 #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/mm/shmem.c b/mm/shmem.c -index 8e2b35ba93ad..e029b943ebed 100644 +index 7c6b6d8f6c39..6a64c3bfecad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -278,10 +278,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) @@ -22752,7 +18423,7 @@ index 8e2b35ba93ad..e029b943ebed 100644 } return mpol; } -@@ -3582,9 +3583,10 @@ static int shmem_reconfigure(struct fs_context *fc) +@@ -3587,9 +3588,10 @@ static int shmem_reconfigure(struct fs_context *fc) struct shmem_options *ctx 
= fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long inodes; @@ -22764,7 +18435,7 @@ index 8e2b35ba93ad..e029b943ebed 100644 inodes = sbinfo->max_inodes - sbinfo->free_inodes; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { -@@ -3629,14 +3631,15 @@ static int shmem_reconfigure(struct fs_context *fc) +@@ -3634,14 +3636,15 @@ static int shmem_reconfigure(struct fs_context *fc) * Preserve previous mempolicy unless mpol remount option was specified. */ if (ctx->mpol) { @@ -22783,7 +18454,7 @@ index 8e2b35ba93ad..e029b943ebed 100644 return invalfc(fc, "%s", err); } -@@ -3753,7 +3756,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) +@@ -3758,7 +3761,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; @@ -22793,7 +18464,7 @@ index 8e2b35ba93ad..e029b943ebed 100644 goto failed; spin_lock_init(&sbinfo->shrinklist_lock); diff --git a/mm/slab.c b/mm/slab.c -index f658e86ec8ce..3dbddaad8a32 100644 +index d7c8da9319c7..1fa2155b9a80 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -233,7 +233,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) @@ -22928,7 +18599,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", node, total_slabs - free_slabs, total_slabs, -@@ -2106,7 +2106,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) +@@ -2107,7 +2107,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); @@ -22937,7 +18608,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 #endif } -@@ -2114,7 +2114,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) +@@ -2115,7 +2115,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); @@ -22946,7 +18617,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 #endif } -@@ -2154,9 +2154,9 @@ static void do_drain(void *arg) +@@ -2155,9 +2155,9 @@ static void do_drain(void *arg) check_irq_off(); ac = cpu_cache_get(cachep); n = get_node(cachep, node); @@ -22958,7 +18629,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 ac->avail = 0; slabs_destroy(cachep, &list); } -@@ -2174,9 +2174,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) +@@ -2175,9 +2175,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) drain_alien_cache(cachep, n->alien); for_each_kmem_cache_node(cachep, node, n) { @@ -22970,7 +18641,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 slabs_destroy(cachep, &list); } -@@ -2198,10 +2198,10 @@ static int drain_freelist(struct kmem_cache *cache, +@@ -2199,10 +2199,10 @@ static int drain_freelist(struct kmem_cache *cache, nr_freed = 0; while (nr_freed < tofree && !list_empty(&n->slabs_free)) { @@ -22983,7 +18654,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 goto out; } -@@ -2214,7 +2214,7 @@ static int drain_freelist(struct kmem_cache *cache, +@@ -2215,7 +2215,7 @@ static int drain_freelist(struct kmem_cache *cache, * to the cache. 
*/ n->free_objects -= cache->num; @@ -22992,7 +18663,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 slab_destroy(cache, page); nr_freed++; } -@@ -2652,7 +2652,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) +@@ -2651,7 +2651,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) INIT_LIST_HEAD(&page->slab_list); n = get_node(cachep, page_to_nid(page)); @@ -23001,7 +18672,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 n->total_slabs++; if (!page->active) { list_add_tail(&page->slab_list, &n->slabs_free); -@@ -2662,7 +2662,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) +@@ -2661,7 +2661,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; @@ -23010,7 +18681,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 fixup_objfreelist_debug(cachep, &list); } -@@ -2828,7 +2828,7 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) +@@ -2827,7 +2827,7 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; @@ -23019,7 +18690,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 page = list_first_entry_or_null(&n->slabs_partial, struct page, slab_list); if (!page) { -@@ -2855,10 +2855,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, +@@ -2854,10 +2854,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, if (!gfp_pfmemalloc_allowed(flags)) return NULL; @@ -23032,7 +18703,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 return NULL; } -@@ -2867,7 +2867,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, +@@ -2866,7 +2866,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, fixup_slab_list(cachep, n, page, &list); @@ -23041,7 +18712,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 fixup_objfreelist_debug(cachep, &list); return obj; -@@ -2926,7 +2926,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +@@ -2925,7 +2925,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) if (!n->free_objects && (!shared || !shared->avail)) goto direct_grow; @@ -23050,7 +18721,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 shared = READ_ONCE(n->shared); /* See if we can refill from the shared array */ -@@ -2950,7 +2950,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +@@ -2949,7 +2949,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) must_grow: n->free_objects -= ac->avail; alloc_done: @@ -23059,7 +18730,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 fixup_objfreelist_debug(cachep, &list); direct_grow: -@@ -3175,7 +3175,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, +@@ -3174,7 +3174,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, BUG_ON(!n); check_irq_off(); @@ -23068,7 +18739,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 page = get_first_slab(n, false); if (!page) goto must_grow; -@@ -3193,12 +3193,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, +@@ -3192,12 +3192,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, fixup_slab_list(cachep, n, page, &list); @@ -23083,7 +18754,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); if (page) { /* This slab isn't counted yet so don't update free_objects */ -@@ -3376,7 +3376,7 @@ 
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) +@@ -3375,7 +3375,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) check_irq_off(); n = get_node(cachep, node); @@ -23092,7 +18763,7 @@ index f658e86ec8ce..3dbddaad8a32 100644 if (n->shared) { struct array_cache *shared_array = n->shared; int max = shared_array->limit - shared_array->avail; -@@ -3405,7 +3405,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) +@@ -3404,7 +3404,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) STATS_SET_FREEABLE(cachep, i); } #endif @@ -23144,10 +18815,10 @@ index f658e86ec8ce..3dbddaad8a32 100644 num_objs = total_slabs * cachep->num; active_slabs = total_slabs - free_slabs; diff --git a/mm/slab.h b/mm/slab.h -index 6cc323f1313a..089bcef627e6 100644 +index 1a756a359fa8..7caf7dcc5fba 100644 --- a/mm/slab.h +++ b/mm/slab.h -@@ -530,7 +530,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, +@@ -523,7 +523,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, * The slab lists for all objects. */ struct kmem_cache_node { @@ -23157,10 +18828,10 @@ index 6cc323f1313a..089bcef627e6 100644 #ifdef CONFIG_SLAB struct list_head slabs_partial; /* partial list first, better asm code */ diff --git a/mm/slub.c b/mm/slub.c -index 6d3574013b2f..795b9a3488df 100644 +index b22a4b101c84..0e317cbb8c25 100644 --- a/mm/slub.c +++ b/mm/slub.c -@@ -434,7 +434,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, +@@ -436,7 +436,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; @@ -23169,7 +18840,7 @@ index 6d3574013b2f..795b9a3488df 100644 /* * Determine a map of object in use on a page. 
-@@ -450,7 +450,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) +@@ -452,7 +452,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) VM_BUG_ON(!irqs_disabled()); @@ -23178,7 +18849,7 @@ index 6d3574013b2f..795b9a3488df 100644 bitmap_zero(object_map, page->objects); -@@ -463,7 +463,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) +@@ -465,7 +465,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) static void put_map(unsigned long *map) __releases(&object_map_lock) { VM_BUG_ON(map != object_map); @@ -23187,7 +18858,7 @@ index 6d3574013b2f..795b9a3488df 100644 } static inline unsigned int size_from_object(struct kmem_cache *s) -@@ -1213,7 +1213,7 @@ static noinline int free_debug_processing( +@@ -1216,7 +1216,7 @@ static noinline int free_debug_processing( unsigned long flags; int ret = 0; @@ -23196,7 +18867,7 @@ index 6d3574013b2f..795b9a3488df 100644 slab_lock(page); if (s->flags & SLAB_CONSISTENCY_CHECKS) { -@@ -1248,7 +1248,7 @@ static noinline int free_debug_processing( +@@ -1251,7 +1251,7 @@ static noinline int free_debug_processing( bulk_cnt, cnt); slab_unlock(page); @@ -23205,20 +18876,7 @@ index 6d3574013b2f..795b9a3488df 100644 if (!ret) slab_fix(s, "Object at 0x%p not freed", object); return ret; -@@ -1496,6 +1496,12 @@ static bool freelist_corrupted(struct kmem_cache *s, struct page *page, - } - #endif /* CONFIG_SLUB_DEBUG */ - -+struct slub_free_list { -+ raw_spinlock_t lock; -+ struct list_head list; -+}; -+static DEFINE_PER_CPU(struct slub_free_list, slub_free_list); -+ - /* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. -@@ -1739,10 +1745,18 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +@@ -1739,10 +1739,18 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) void *start, *p, *next; int idx; bool shuffle; @@ -23237,7 +18895,7 @@ index 6d3574013b2f..795b9a3488df 100644 local_irq_enable(); flags |= s->allocflags; -@@ -1801,7 +1815,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +@@ -1803,7 +1811,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->frozen = 1; out: @@ -23246,37 +18904,57 @@ index 6d3574013b2f..795b9a3488df 100644 local_irq_disable(); if (!page) return NULL; -@@ -1844,6 +1858,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) - __free_pages(page, order); +@@ -1861,12 +1869,29 @@ static void free_slab(struct kmem_cache *s, struct page *page) + __free_slab(s, page); + } + ++static void discard_slab_delayed(struct kmem_cache *s, struct page *page, ++ struct list_head *delayed_free) ++{ ++ dec_slabs_node(s, page_to_nid(page), page->objects); ++ list_add(&page->lru, delayed_free); ++} ++ + static void discard_slab(struct kmem_cache *s, struct page *page) + { + dec_slabs_node(s, page_to_nid(page), page->objects); + free_slab(s, page); } -+static void free_delayed(struct list_head *h) ++static void discard_delayed(struct list_head *l) +{ -+ while (!list_empty(h)) { -+ struct page *page = list_first_entry(h, struct page, lru); ++ while (!list_empty(l)) { ++ struct page *page = list_first_entry(l, struct page, lru); + + list_del(&page->lru); + __free_slab(page->slab_cache, page); + } +} + - static void rcu_free_slab(struct rcu_head *h) - { - struct page *page = container_of(h, struct page, rcu_head); -@@ -1855,6 
+1879,12 @@ static void free_slab(struct kmem_cache *s, struct page *page) - { - if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { - call_rcu(&page->rcu_head, rcu_free_slab); -+ } else if (irqs_disabled()) { -+ struct slub_free_list *f = this_cpu_ptr(&slub_free_list); -+ -+ raw_spin_lock(&f->lock); -+ list_add(&page->lru, &f->list); -+ raw_spin_unlock(&f->lock); - } else - __free_slab(s, page); + /* + * Management of partially allocated slabs. + */ +@@ -1940,15 +1965,16 @@ static inline void *acquire_slab(struct kmem_cache *s, + WARN_ON(!freelist); + return freelist; } -@@ -1962,7 +1992,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, +- +-static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); ++static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain, ++ struct list_head *delayed_free); + static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); + + /* + * Try to allocate a partial slab from a specific node. + */ + static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, +- struct kmem_cache_cpu *c, gfp_t flags) ++ struct kmem_cache_cpu *c, gfp_t flags, ++ struct list_head *delayed_free) + { + struct page *page, *page2; + void *object = NULL; +@@ -1964,7 +1990,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!n || !n->nr_partial) return NULL; @@ -23285,7 +18963,16 @@ index 6d3574013b2f..795b9a3488df 100644 list_for_each_entry_safe(page, page2, &n->partial, slab_list) { void *t; -@@ -1987,7 +2017,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, +@@ -1981,7 +2007,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + stat(s, ALLOC_FROM_PARTIAL); + object = t; + } else { +- put_cpu_partial(s, page, 0); ++ put_cpu_partial(s, page, 0, delayed_free); + stat(s, CPU_PARTIAL_NODE); + } + if (!kmem_cache_has_cpu_partial(s) +@@ -1989,7 +2015,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, break; } @@ -23294,7 +18981,61 @@ index 6d3574013b2f..795b9a3488df 100644 return object; } -@@ -2241,7 +2271,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, +@@ -1997,7 +2023,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + * Get a page from somewhere. Search in increasing NUMA distances. + */ + static void *get_any_partial(struct kmem_cache *s, gfp_t flags, +- struct kmem_cache_cpu *c) ++ struct kmem_cache_cpu *c, ++ struct list_head *delayed_free) + { + #ifdef CONFIG_NUMA + struct zonelist *zonelist; +@@ -2039,7 +2066,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, + + if (n && cpuset_zone_allowed(zone, flags) && + n->nr_partial > s->min_partial) { +- object = get_partial_node(s, n, c, flags); ++ object = get_partial_node(s, n, c, flags, delayed_free); + if (object) { + /* + * Don't check read_mems_allowed_retry() +@@ -2061,7 +2088,8 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, + * Get a partial page, lock it and return it. 
+ */ + static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, +- struct kmem_cache_cpu *c) ++ struct kmem_cache_cpu *c, ++ struct list_head *delayed_free) + { + void *object; + int searchnode = node; +@@ -2069,11 +2097,12 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, + if (node == NUMA_NO_NODE) + searchnode = numa_mem_id(); + +- object = get_partial_node(s, get_node(s, searchnode), c, flags); ++ object = get_partial_node(s, get_node(s, searchnode), c, flags, ++ delayed_free); + if (object || node != NUMA_NO_NODE) + return object; + +- return get_any_partial(s, flags, c); ++ return get_any_partial(s, flags, c, delayed_free); + } + + #ifdef CONFIG_PREEMPTION +@@ -2149,7 +2178,8 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) + * Remove the cpu slab + */ + static void deactivate_slab(struct kmem_cache *s, struct page *page, +- void *freelist, struct kmem_cache_cpu *c) ++ void *freelist, struct kmem_cache_cpu *c, ++ struct list_head *delayed_free) + { + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); +@@ -2243,7 +2273,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, * that acquire_slab() will see a slab page that * is frozen */ @@ -23303,7 +19044,7 @@ index 6d3574013b2f..795b9a3488df 100644 } } else { m = M_FULL; -@@ -2252,7 +2282,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, +@@ -2254,7 +2284,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, * slabs from diagnostic functions will not see * any frozen slabs. */ @@ -23312,7 +19053,7 @@ index 6d3574013b2f..795b9a3488df 100644 } } -@@ -2276,7 +2306,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, +@@ -2278,7 +2308,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, goto redo; if (lock) @@ -23321,7 +19062,27 @@ index 6d3574013b2f..795b9a3488df 100644 if (m == M_PARTIAL) stat(s, tail); -@@ -2315,10 +2345,10 @@ static void unfreeze_partials(struct kmem_cache *s, +@@ -2286,7 +2316,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, + stat(s, DEACTIVATE_FULL); + else if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); +- discard_slab(s, page); ++ discard_slab_delayed(s, page, delayed_free); + stat(s, FREE_SLAB); + } + +@@ -2301,8 +2331,8 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, + * for the cpu using c (or some other guarantee must be there + * to guarantee no concurrent accesses). 
+ */ +-static void unfreeze_partials(struct kmem_cache *s, +- struct kmem_cache_cpu *c) ++static void unfreeze_partials(struct kmem_cache *s, struct kmem_cache_cpu *c, ++ struct list_head *delayed_free) + { + #ifdef CONFIG_SLUB_CPU_PARTIAL + struct kmem_cache_node *n = NULL, *n2 = NULL; +@@ -2317,10 +2347,10 @@ static void unfreeze_partials(struct kmem_cache *s, n2 = get_node(s, page_to_nid(page)); if (n != n2) { if (n) @@ -23334,7 +19095,7 @@ index 6d3574013b2f..795b9a3488df 100644 } do { -@@ -2347,7 +2377,7 @@ static void unfreeze_partials(struct kmem_cache *s, +@@ -2349,14 +2379,14 @@ static void unfreeze_partials(struct kmem_cache *s, } if (n) @@ -23343,49 +19104,165 @@ index 6d3574013b2f..795b9a3488df 100644 while (discard_page) { page = discard_page; -@@ -2384,14 +2414,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) - pobjects = oldpage->pobjects; - pages = oldpage->pages; - if (drain && pobjects > slub_cpu_partial(s)) { -+ struct slub_free_list *f; - unsigned long flags; -+ LIST_HEAD(tofree); - /* - * partial array is full. Move the existing + discard_page = discard_page->next; + + stat(s, DEACTIVATE_EMPTY); +- discard_slab(s, page); ++ discard_slab_delayed(s, page, delayed_free); + stat(s, FREE_SLAB); + } + #endif /* CONFIG_SLUB_CPU_PARTIAL */ +@@ -2369,7 +2399,8 @@ static void unfreeze_partials(struct kmem_cache *s, + * If we did not find a slot then simply move all the partials to the + * per node partial list. + */ +-static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) ++static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain, ++ struct list_head *delayed_free) + { + #ifdef CONFIG_SLUB_CPU_PARTIAL + struct page *oldpage; +@@ -2392,7 +2423,8 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) * set to the per node partial list. */ local_irq_save(flags); - unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); -+ f = this_cpu_ptr(&slub_free_list); -+ raw_spin_lock(&f->lock); -+ list_splice_init(&f->list, &tofree); -+ raw_spin_unlock(&f->lock); +- unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); ++ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab), ++ delayed_free); local_irq_restore(flags); -+ free_delayed(&tofree); oldpage = NULL; pobjects = 0; - pages = 0; -@@ -2459,7 +2496,19 @@ static bool has_cpu_slab(int cpu, void *info) +@@ -2414,17 +2446,18 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) + unsigned long flags; - static void flush_all(struct kmem_cache *s) + local_irq_save(flags); +- unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); ++ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab), delayed_free); + local_irq_restore(flags); + } + preempt_enable(); + #endif /* CONFIG_SLUB_CPU_PARTIAL */ + } + +-static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) ++static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c, ++ struct list_head *delayed_free) { -+ LIST_HEAD(tofree); -+ int cpu; + stat(s, CPUSLAB_FLUSH); +- deactivate_slab(s, c->page, c->freelist, c); ++ deactivate_slab(s, c->page, c->freelist, c, delayed_free); + + c->tid = next_tid(c->tid); + } +@@ -2434,34 +2467,81 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) + * + * Called from IPI handler with interrupts disabled. 
+ */ +-static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) ++static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu, ++ struct list_head *delayed_free) + { + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + if (c->page) +- flush_slab(s, c); ++ flush_slab(s, c, delayed_free); + +- unfreeze_partials(s, c); ++ unfreeze_partials(s, c, delayed_free); + } + +-static void flush_cpu_slab(void *d) ++struct slub_flush_work { ++ struct work_struct work; ++ struct kmem_cache *s; ++ bool skip; ++}; ++ ++static void flush_cpu_slab(struct work_struct *w) + { +- struct kmem_cache *s = d; ++ struct slub_flush_work *sfw; ++ LIST_HEAD(delayed_free); + +- __flush_cpu_slab(s, smp_processor_id()); ++ sfw = container_of(w, struct slub_flush_work, work); ++ ++ local_irq_disable(); ++ __flush_cpu_slab(sfw->s, smp_processor_id(), &delayed_free); ++ local_irq_enable(); ++ ++ discard_delayed(&delayed_free); + } + +-static bool has_cpu_slab(int cpu, void *info) ++static bool has_cpu_slab(int cpu, struct kmem_cache *s) + { +- struct kmem_cache *s = info; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return c->page || slub_percpu_partial(c); + } + ++static DEFINE_MUTEX(flush_lock); ++static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); ++ ++static void flush_all_locked(struct kmem_cache *s) ++{ ++ struct slub_flush_work *sfw; ++ unsigned int cpu; ++ ++ mutex_lock(&flush_lock); + - on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); + for_each_online_cpu(cpu) { -+ struct slub_free_list *f; ++ sfw = &per_cpu(slub_flush, cpu); ++ if (!has_cpu_slab(cpu, s)) { ++ sfw->skip = true; ++ continue; ++ } ++ INIT_WORK(&sfw->work, flush_cpu_slab); ++ sfw->skip = false; ++ sfw->s = s; ++ schedule_work_on(cpu, &sfw->work); ++ } + -+ f = &per_cpu(slub_free_list, cpu); -+ raw_spin_lock_irq(&f->lock); -+ list_splice_init(&f->list, &tofree); -+ raw_spin_unlock_irq(&f->lock); -+ free_delayed(&tofree); ++ for_each_online_cpu(cpu) { ++ sfw = &per_cpu(slub_flush, cpu); ++ if (sfw->skip) ++ continue; ++ flush_work(&sfw->work); + } ++ ++ mutex_unlock(&flush_lock); ++} ++ + static void flush_all(struct kmem_cache *s) + { +- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); ++ cpus_read_lock(); ++ flush_all_locked(s); ++ cpus_read_unlock(); } /* -@@ -2514,10 +2563,10 @@ static unsigned long count_partial(struct kmem_cache_node *n, +@@ -2472,13 +2552,15 @@ static int slub_cpu_dead(unsigned int cpu) + { + struct kmem_cache *s; + unsigned long flags; ++ LIST_HEAD(delayed_free); + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + local_irq_save(flags); +- __flush_cpu_slab(s, cpu); ++ __flush_cpu_slab(s, cpu, &delayed_free); + local_irq_restore(flags); + } ++ discard_delayed(&delayed_free); + mutex_unlock(&slab_mutex); + return 0; + } +@@ -2516,10 +2598,10 @@ static unsigned long count_partial(struct kmem_cache_node *n, unsigned long x = 0; struct page *page; @@ -23398,70 +19275,100 @@ index 6d3574013b2f..795b9a3488df 100644 return x; } #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ -@@ -2656,8 +2705,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) +@@ -2562,7 +2644,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) + } + + static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, +- int node, struct kmem_cache_cpu **pc) ++ int node, struct kmem_cache_cpu **pc, ++ struct list_head *delayed_free) + { + void *freelist; + struct kmem_cache_cpu *c = *pc; +@@ -2570,7 +2653,7 @@ static inline void 
*new_slab_objects(struct kmem_cache *s, gfp_t flags, + + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + +- freelist = get_partial(s, flags, node, c); ++ freelist = get_partial(s, flags, node, c, delayed_free); + + if (freelist) + return freelist; +@@ -2579,7 +2662,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, + if (page) { + c = raw_cpu_ptr(s->cpu_slab); + if (c->page) +- flush_slab(s, c); ++ flush_slab(s, c, delayed_free); + + /* + * No other reference to the page yet so we can +@@ -2658,7 +2741,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) * already disabled (which is the case for bulk allocation). */ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, -+ struct list_head *to_free) ++ struct list_head *delayed_free) { -+ struct slub_free_list *f; void *freelist; struct page *page; +@@ -2688,7 +2772,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + goto redo; + } else { + stat(s, ALLOC_NODE_MISMATCH); +- deactivate_slab(s, page, c->freelist, c); ++ deactivate_slab(s, page, c->freelist, c, delayed_free); + goto new_slab; + } + } +@@ -2699,7 +2783,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + * information when the page leaves the per-cpu allocator + */ + if (unlikely(!pfmemalloc_match(page, gfpflags))) { +- deactivate_slab(s, page, c->freelist, c); ++ deactivate_slab(s, page, c->freelist, c, delayed_free); + goto new_slab; + } -@@ -2723,6 +2774,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - VM_BUG_ON(!c->page->frozen); - c->freelist = get_freepointer(s, freelist); - c->tid = next_tid(c->tid); -+ -+out: -+ f = this_cpu_ptr(&slub_free_list); -+ raw_spin_lock(&f->lock); -+ list_splice_init(&f->list, to_free); -+ raw_spin_unlock(&f->lock); -+ - return freelist; +@@ -2738,7 +2822,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + goto redo; + } - new_slab: -@@ -2738,7 +2796,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +- freelist = new_slab_objects(s, gfpflags, node, &c); ++ freelist = new_slab_objects(s, gfpflags, node, &c, delayed_free); if (unlikely(!freelist)) { slab_out_of_memory(s, gfpflags, node); -- return NULL; -+ goto out; - } - - page = c->page; -@@ -2751,7 +2809,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -2754,7 +2838,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + !alloc_debug_processing(s, page, freelist, addr)) goto new_slab; /* Slab failed checks. 
Next slab needed */ - deactivate_slab(s, page, get_freepointer(s, freelist), c); -- return freelist; -+ goto out; +- deactivate_slab(s, page, get_freepointer(s, freelist), c); ++ deactivate_slab(s, page, get_freepointer(s, freelist), c, delayed_free); + return freelist; } - /* -@@ -2763,6 +2821,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -2767,6 +2851,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, { void *p; unsigned long flags; -+ LIST_HEAD(tofree); ++ LIST_HEAD(delayed_free); local_irq_save(flags); #ifdef CONFIG_PREEMPTION -@@ -2774,8 +2833,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -2778,8 +2863,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = this_cpu_ptr(s->cpu_slab); #endif - p = ___slab_alloc(s, gfpflags, node, addr, c); -+ p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree); ++ p = ___slab_alloc(s, gfpflags, node, addr, c, &delayed_free); local_irq_restore(flags); -+ free_delayed(&tofree); ++ discard_delayed(&delayed_free); return p; } -@@ -2809,6 +2869,10 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, +@@ -2814,6 +2900,10 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, unsigned long tid; struct obj_cgroup *objcg = NULL; @@ -23472,7 +19379,7 @@ index 6d3574013b2f..795b9a3488df 100644 s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); if (!s) return NULL; -@@ -2975,7 +3039,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, +@@ -2979,7 +3069,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, do { if (unlikely(n)) { @@ -23481,7 +19388,7 @@ index 6d3574013b2f..795b9a3488df 100644 n = NULL; } prior = page->freelist; -@@ -3007,7 +3071,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, +@@ -3011,7 +3101,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. */ @@ -23490,7 +19397,22 @@ index 6d3574013b2f..795b9a3488df 100644 } } -@@ -3048,7 +3112,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, +@@ -3030,11 +3120,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page, + */ + stat(s, FREE_FROZEN); + } else if (new.frozen) { ++ LIST_HEAD(delayed_free); + /* + * If we just froze the page then put it onto the + * per cpu partial list. 
+ */ +- put_cpu_partial(s, page, 1); ++ put_cpu_partial(s, page, 1, &delayed_free); ++ discard_delayed(&delayed_free); + stat(s, CPU_PARTIAL_FREE); + } + +@@ -3053,7 +3145,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -23499,7 +19421,7 @@ index 6d3574013b2f..795b9a3488df 100644 return; slab_empty: -@@ -3063,7 +3127,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, +@@ -3068,7 +3160,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, remove_full(s, n, page); } @@ -23508,47 +19430,45 @@ index 6d3574013b2f..795b9a3488df 100644 stat(s, FREE_SLAB); discard_slab(s, page); } -@@ -3270,9 +3334,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) - { +@@ -3278,6 +3370,11 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, struct kmem_cache_cpu *c; -+ LIST_HEAD(to_free); int i; struct obj_cgroup *objcg = NULL; - ++ LIST_HEAD(delayed_free); ++ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP)) + WARN_ON_ONCE(!preemptible() && + (system_state > SYSTEM_BOOTING && system_state < SYSTEM_SUSPEND)); -+ + /* memcg and kmem_cache debug support */ s = slab_pre_alloc_hook(s, &objcg, size, flags); - if (unlikely(!s)) -@@ -3303,7 +3372,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, +@@ -3309,7 +3406,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * of re-populating per CPU c->freelist */ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c); -+ _RET_IP_, c, &to_free); ++ _RET_IP_, c, &delayed_free); if (unlikely(!p[i])) goto error; -@@ -3318,6 +3387,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - } +@@ -3325,6 +3422,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c->tid = next_tid(c->tid); local_irq_enable(); -+ free_delayed(&to_free); ++ discard_delayed(&delayed_free); ++ /* Clear memory outside IRQ disabled fastpath loop */ if (unlikely(slab_want_init_on_alloc(flags, s))) { -@@ -3332,6 +3402,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + int j; +@@ -3338,6 +3437,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, return i; error: local_irq_enable(); -+ free_delayed(&to_free); ++ discard_delayed(&delayed_free); slab_post_alloc_hook(s, objcg, flags, i, p); __kmem_cache_free_bulk(s, i, p); return 0; -@@ -3467,7 +3538,7 @@ static void +@@ -3487,7 +3587,7 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; @@ -23557,7 +19477,7 @@ index 6d3574013b2f..795b9a3488df 100644 INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG atomic_long_set(&n->nr_slabs, 0); -@@ -3868,7 +3939,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) +@@ -3888,7 +3988,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) struct page *page, *h; BUG_ON(irqs_disabled()); @@ -23566,7 +19486,7 @@ index 6d3574013b2f..795b9a3488df 100644 list_for_each_entry_safe(page, h, &n->partial, slab_list) { if (!page->inuse) { remove_partial(n, page); -@@ -3878,7 +3949,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) +@@ -3898,7 +3998,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) "Objects remaining in %s on __kmem_cache_shutdown()"); } } @@ -23575,7 +19495,23 @@ index 6d3574013b2f..795b9a3488df 100644 
list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); -@@ -4149,7 +4220,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) +@@ -3923,7 +4023,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) + int node; + struct kmem_cache_node *n; + +- flush_all(s); ++ flush_all_locked(s); + /* Attempt to free all objects */ + for_each_kmem_cache_node(s, node, n) { + free_partial(s, n); +@@ -4163,13 +4263,13 @@ int __kmem_cache_shrink(struct kmem_cache *s) + unsigned long flags; + int ret = 0; + +- flush_all(s); ++ flush_all_locked(s); + for_each_kmem_cache_node(s, node, n) { + INIT_LIST_HEAD(&discard); for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); @@ -23584,7 +19520,7 @@ index 6d3574013b2f..795b9a3488df 100644 /* * Build lists of slabs to discard or promote. -@@ -4180,7 +4251,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) +@@ -4200,7 +4300,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) list_splice(promote + i, &n->partial); @@ -23593,20 +19529,25 @@ index 6d3574013b2f..795b9a3488df 100644 /* Release empty slabs */ list_for_each_entry_safe(page, t, &discard, slab_list) -@@ -4355,6 +4426,12 @@ void __init kmem_cache_init(void) - { - static __initdata struct kmem_cache boot_kmem_cache, - boot_kmem_cache_node; -+ int cpu; -+ -+ for_each_possible_cpu(cpu) { -+ raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock); -+ INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list); -+ } +@@ -4347,6 +4447,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) + int node; + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; ++ LIST_HEAD(delayed_free); - if (debug_guardpage_minorder()) - slub_max_order = 0; -@@ -4542,7 +4619,7 @@ static int validate_slab_node(struct kmem_cache *s, + memcpy(s, static_cache, kmem_cache->object_size); + +@@ -4355,7 +4456,8 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) + * up. Even if it weren't true, IRQs are not up so we couldn't fire + * IPIs around. 
+ */ +- __flush_cpu_slab(s, smp_processor_id()); ++ __flush_cpu_slab(s, smp_processor_id(), &delayed_free); ++ discard_delayed(&delayed_free); + for_each_kmem_cache_node(s, node, n) { + struct page *p; + +@@ -4562,7 +4664,7 @@ static int validate_slab_node(struct kmem_cache *s, struct page *page; unsigned long flags; @@ -23615,7 +19556,7 @@ index 6d3574013b2f..795b9a3488df 100644 list_for_each_entry(page, &n->partial, slab_list) { validate_slab(s, page); -@@ -4564,7 +4641,7 @@ static int validate_slab_node(struct kmem_cache *s, +@@ -4584,7 +4686,7 @@ static int validate_slab_node(struct kmem_cache *s, s->name, count, atomic_long_read(&n->nr_slabs)); out: @@ -23624,7 +19565,17 @@ index 6d3574013b2f..795b9a3488df 100644 return count; } -@@ -4743,12 +4820,12 @@ static int list_locations(struct kmem_cache *s, char *buf, +@@ -4635,6 +4737,9 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) + struct location *l; + int order; + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && flags == GFP_ATOMIC) ++ return 0; ++ + order = get_order(sizeof(struct location) * max); + + l = (void *)__get_free_pages(flags, order); +@@ -4763,12 +4868,12 @@ static int list_locations(struct kmem_cache *s, char *buf, if (!atomic_long_read(&n->nr_slabs)) continue; @@ -23639,118 +19590,11 @@ index 6d3574013b2f..795b9a3488df 100644 } for (i = 0; i < t.count; i++) { -diff --git a/mm/swap.c b/mm/swap.c -index e7bdf094f76a..65ef7e3525bf 100644 ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -763,10 +763,20 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) - */ - void lru_add_drain_all(void) - { -- static seqcount_t seqcount = SEQCNT_ZERO(seqcount); -- static DEFINE_MUTEX(lock); -+ /* -+ * lru_drain_gen - Global pages generation number -+ * -+ * (A) Definition: global lru_drain_gen = x implies that all generations -+ * 0 < n <= x are already *scheduled* for draining. -+ * -+ * This is an optimization for the highly-contended use case where a -+ * user space workload keeps constantly generating a flow of pages for -+ * each CPU. -+ */ -+ static unsigned int lru_drain_gen; - static struct cpumask has_work; -- int cpu, seq; -+ static DEFINE_MUTEX(lock); -+ unsigned cpu, this_gen; - - /* - * Make sure nobody triggers this path before mm_percpu_wq is fully -@@ -775,21 +785,54 @@ void lru_add_drain_all(void) - if (WARN_ON(!mm_percpu_wq)) - return; - -- seq = raw_read_seqcount_latch(&seqcount); -+ /* -+ * Guarantee pagevec counter stores visible by this CPU are visible to -+ * other CPUs before loading the current drain generation. -+ */ -+ smp_mb(); -+ -+ /* -+ * (B) Locally cache global LRU draining generation number -+ * -+ * The read barrier ensures that the counter is loaded before the mutex -+ * is taken. It pairs with smp_mb() inside the mutex critical section -+ * at (D). -+ */ -+ this_gen = smp_load_acquire(&lru_drain_gen); - - mutex_lock(&lock); - - /* -- * Piggyback on drain started and finished while we waited for lock: -- * all pages pended at the time of our enter were drained from vectors. -+ * (C) Exit the draining operation if a newer generation, from another -+ * lru_add_drain_all(), was already scheduled for draining. Check (A). - */ -- if (__read_seqcount_retry(&seqcount, seq)) -+ if (unlikely(this_gen != lru_drain_gen)) - goto done; - -- raw_write_seqcount_latch(&seqcount); -+ /* -+ * (D) Increment global generation number -+ * -+ * Pairs with smp_load_acquire() at (B), outside of the critical -+ * section. 
Use a full memory barrier to guarantee that the new global -+ * drain generation number is stored before loading pagevec counters. -+ * -+ * This pairing must be done here, before the for_each_online_cpu loop -+ * below which drains the page vectors. -+ * -+ * Let x, y, and z represent some system CPU numbers, where x < y < z. -+ * Assume CPU #z is is in the middle of the for_each_online_cpu loop -+ * below and has already reached CPU #y's per-cpu data. CPU #x comes -+ * along, adds some pages to its per-cpu vectors, then calls -+ * lru_add_drain_all(). -+ * -+ * If the paired barrier is done at any later step, e.g. after the -+ * loop, CPU #x will just exit at (C) and miss flushing out all of its -+ * added pages. -+ */ -+ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); -+ smp_mb(); - - cpumask_clear(&has_work); -- - for_each_online_cpu(cpu) { - struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - -@@ -801,7 +844,7 @@ void lru_add_drain_all(void) - need_activate_page_drain(cpu)) { - INIT_WORK(work, lru_add_drain_per_cpu); - queue_work_on(cpu, mm_percpu_wq, work); -- cpumask_set_cpu(cpu, &has_work); -+ __cpumask_set_cpu(cpu, &has_work); - } - } - -@@ -816,7 +859,7 @@ void lru_add_drain_all(void) - { - lru_add_drain(); - } --#endif -+#endif /* CONFIG_SMP */ - - /** - * release_pages - batched put_page() diff --git a/mm/vmalloc.c b/mm/vmalloc.c -index be4724b916b3..994acb959d00 100644 +index e6f352bf0498..f5d554e0b083 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c -@@ -1544,7 +1544,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) +@@ -1558,7 +1558,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) struct vmap_block *vb; struct vmap_area *va; unsigned long vb_idx; @@ -23759,7 +19603,7 @@ index be4724b916b3..994acb959d00 100644 void *vaddr; node = numa_node_id(); -@@ -1581,11 +1581,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) +@@ -1595,11 +1595,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) return ERR_PTR(err); } @@ -23774,7 +19618,7 @@ index be4724b916b3..994acb959d00 100644 return vaddr; } -@@ -1650,6 +1651,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) +@@ -1664,6 +1665,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) struct vmap_block *vb; void *vaddr = NULL; unsigned int order; @@ -23782,7 +19626,7 @@ index be4724b916b3..994acb959d00 100644 BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); -@@ -1664,7 +1666,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) +@@ -1678,7 +1680,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) order = get_order(size); rcu_read_lock(); @@ -23792,7 +19636,7 @@ index be4724b916b3..994acb959d00 100644 list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; -@@ -1687,7 +1690,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) +@@ -1701,7 +1704,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) break; } @@ -23802,7 +19646,7 @@ index be4724b916b3..994acb959d00 100644 /* Allocate new block if nothing was found */ diff --git a/mm/vmstat.c b/mm/vmstat.c -index 4f7b4ee6aa12..0fc677378d3d 100644 +index f8942160fc95..920d88bf504a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -321,6 +321,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, @@ -23902,10 +19746,10 @@ index 4f7b4ee6aa12..0fc677378d3d 100644 void __dec_zone_page_state(struct page *page, enum zone_stat_item item) diff --git a/mm/workingset.c b/mm/workingset.c -index 
92e66113a577..29609352552f 100644 +index 10e96de945b3..289c14d10351 100644 --- a/mm/workingset.c +++ b/mm/workingset.c -@@ -432,6 +432,8 @@ static struct list_lru shadow_nodes; +@@ -430,6 +430,8 @@ static struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { @@ -23914,7 +19758,7 @@ index 92e66113a577..29609352552f 100644 /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. -@@ -440,7 +442,8 @@ void workingset_update_node(struct xa_node *node) +@@ -438,7 +440,8 @@ void workingset_update_node(struct xa_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ @@ -23924,8 +19768,70 @@ index 92e66113a577..29609352552f 100644 if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { +diff --git a/mm/z3fold.c b/mm/z3fold.c +index dacb0d70fa61..234b46f01e83 100644 +--- a/mm/z3fold.c ++++ b/mm/z3fold.c +@@ -1778,6 +1778,7 @@ static u64 z3fold_zpool_total_size(void *pool) + + static struct zpool_driver z3fold_zpool_driver = { + .type = "z3fold", ++ .sleep_mapped = true, + .owner = THIS_MODULE, + .create = z3fold_zpool_create, + .destroy = z3fold_zpool_destroy, +diff --git a/mm/zbud.c b/mm/zbud.c +index c49966ece674..7ec5f27a68b0 100644 +--- a/mm/zbud.c ++++ b/mm/zbud.c +@@ -203,6 +203,7 @@ static u64 zbud_zpool_total_size(void *pool) + + static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", ++ .sleep_mapped = true, + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, +diff --git a/mm/zpool.c b/mm/zpool.c +index 3744a2d1a624..5ed71207ced7 100644 +--- a/mm/zpool.c ++++ b/mm/zpool.c +@@ -23,6 +23,7 @@ struct zpool { + void *pool; + const struct zpool_ops *ops; + bool evictable; ++ bool can_sleep_mapped; + + struct list_head list; + }; +@@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, + zpool->pool = driver->create(name, gfp, ops, zpool); + zpool->ops = ops; + zpool->evictable = driver->shrink && ops && ops->evict; ++ zpool->can_sleep_mapped = driver->sleep_mapped; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); +@@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool) + return zpool->evictable; + } + ++/** ++ * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. ++ * @zpool: The zpool to test ++ * ++ * Returns: true if zpool can sleep; false otherwise. 
++ */ ++bool zpool_can_sleep_mapped(struct zpool *zpool) ++{ ++ return zpool->can_sleep_mapped; ++} ++ + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); + MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c -index c36fdff9a371..2cc22ee7b894 100644 +index 7289f502ffac..67b459609553 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -57,6 +57,7 @@ @@ -23961,11 +19867,11 @@ index c36fdff9a371..2cc22ee7b894 100644 }; struct mapping_area { -+ local_lock_t lock; - #ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING - struct vm_struct *vm; /* vm area for mapping object that span pages */ - #else -@@ -326,7 +342,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} ++ local_lock_t lock; + char *vm_buf; /* copy buffer for objects that span pages */ + char *vm_addr; /* address of kmap_atomic()'ed pages */ + enum zs_mapmode vm_mm; /* mapping mode */ +@@ -322,7 +338,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} static int create_cache(struct zs_pool *pool) { @@ -23974,7 +19880,7 @@ index c36fdff9a371..2cc22ee7b894 100644 0, 0, NULL); if (!pool->handle_cachep) return 1; -@@ -350,9 +366,26 @@ static void destroy_cache(struct zs_pool *pool) +@@ -346,9 +362,26 @@ static void destroy_cache(struct zs_pool *pool) static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) { @@ -24003,7 +19909,7 @@ index c36fdff9a371..2cc22ee7b894 100644 static void cache_free_handle(struct zs_pool *pool, unsigned long handle) { -@@ -372,12 +405,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) +@@ -368,12 +401,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) static void record_obj(unsigned long handle, unsigned long obj) { @@ -24022,7 +19928,7 @@ index c36fdff9a371..2cc22ee7b894 100644 } /* zpool driver */ -@@ -459,7 +498,10 @@ MODULE_ALIAS("zpool-zsmalloc"); +@@ -455,7 +494,10 @@ MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ @@ -24034,7 +19940,7 @@ index c36fdff9a371..2cc22ee7b894 100644 static bool is_zspage_isolated(struct zspage *zspage) { -@@ -869,7 +911,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) +@@ -862,7 +904,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) static unsigned long handle_to_obj(unsigned long handle) { @@ -24048,7 +19954,7 @@ index c36fdff9a371..2cc22ee7b894 100644 } static unsigned long obj_to_head(struct page *page, void *obj) -@@ -883,22 +931,46 @@ static unsigned long obj_to_head(struct page *page, void *obj) +@@ -876,22 +924,46 @@ static unsigned long obj_to_head(struct page *page, void *obj) static inline int testpin_tag(unsigned long handle) { @@ -24095,7 +20001,7 @@ index c36fdff9a371..2cc22ee7b894 100644 } static void reset_page(struct page *page) -@@ -1326,7 +1398,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, +@@ -1275,7 +1347,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, class = pool->size_class[class_idx]; off = (class->size * obj_idx) & ~PAGE_MASK; @@ -24105,7 +20011,7 @@ index c36fdff9a371..2cc22ee7b894 100644 area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ -@@ -1380,7 +1453,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) +@@ -1329,7 +1402,7 @@ void zs_unmap_object(struct zs_pool *pool, 
unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } @@ -24115,122 +20021,133 @@ index c36fdff9a371..2cc22ee7b894 100644 migrate_read_unlock(zspage); unpin_tag(handle); diff --git a/mm/zswap.c b/mm/zswap.c -index fbb782924ccc..78a20f7b00f2 100644 +index 182f6ad5aa69..1566cc3ab7f4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c -@@ -18,6 +18,7 @@ - #include <linux/highmem.h> - #include <linux/slab.h> - #include <linux/spinlock.h> -+#include <linux/local_lock.h> - #include <linux/types.h> - #include <linux/atomic.h> - #include <linux/frontswap.h> -@@ -387,27 +388,35 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, - /********************************* - * per-cpu code - **********************************/ --static DEFINE_PER_CPU(u8 *, zswap_dstmem); -+struct zswap_comp { -+ /* Used for per-CPU dstmem and tfm */ -+ local_lock_t lock; -+ u8 *dstmem; -+}; -+ -+static DEFINE_PER_CPU(struct zswap_comp, zswap_comp); +@@ -935,13 +935,19 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; - static int zswap_dstmem_prepare(unsigned int cpu) - { -+ struct zswap_comp *zcomp; - u8 *dst; +- u8 *src; ++ u8 *src, *tmp = NULL; + unsigned int dlen; + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; - dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) - return -ENOMEM; ++ if (!zpool_can_sleep_mapped(pool)) { ++ tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC); ++ if (!tmp) ++ return -ENOMEM; ++ } ++ + /* extract swpentry from data */ + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); + swpentry = zhdr->swpentry; /* here */ +@@ -955,6 +961,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) + /* entry was invalidated */ + spin_unlock(&tree->lock); + zpool_unmap_handle(pool, handle); ++ kfree(tmp); + return 0; + } + spin_unlock(&tree->lock); +@@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) + dlen = PAGE_SIZE; + src = (u8 *)zhdr + sizeof(struct zswap_header); -- per_cpu(zswap_dstmem, cpu) = dst; -+ zcomp = per_cpu_ptr(&zswap_comp, cpu); -+ zcomp->dstmem = dst; - return 0; - } ++ if (!zpool_can_sleep_mapped(pool)) { ++ ++ memcpy(tmp, src, entry->length); ++ src = tmp; ++ ++ zpool_unmap_handle(pool, handle); ++ } ++ + mutex_lock(acomp_ctx->mutex); + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); +@@ -1033,7 +1048,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) + spin_unlock(&tree->lock); - static int zswap_dstmem_dead(unsigned int cpu) - { -- u8 *dst; -+ struct zswap_comp *zcomp; + end: +- zpool_unmap_handle(pool, handle); ++ if (zpool_can_sleep_mapped(pool)) ++ zpool_unmap_handle(pool, handle); ++ else ++ kfree(tmp); ++ + return ret; + } -- dst = per_cpu(zswap_dstmem, cpu); -- kfree(dst); -- per_cpu(zswap_dstmem, cpu) = NULL; -+ zcomp = per_cpu_ptr(&zswap_comp, cpu); -+ kfree(zcomp->dstmem); -+ zcomp->dstmem = NULL; +@@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, + struct zswap_entry *entry; + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; +- u8 *src, *dst; ++ u8 *src, *dst, *tmp; + unsigned int dlen; + int ret; - return 0; - } -@@ -919,10 +928,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) - dlen = PAGE_SIZE; - src = (u8 *)zhdr + sizeof(struct zswap_header); +@@ -1253,15 +1272,33 @@ static int 
zswap_frontswap_load(unsigned type, pgoff_t offset, dst = kmap_atomic(page); -- tfm = *get_cpu_ptr(entry->pool->tfm); -+ local_lock(&zswap_comp.lock); -+ tfm = *this_cpu_ptr(entry->pool->tfm); - ret = crypto_comp_decompress(tfm, src, entry->length, - dst, &dlen); -- put_cpu_ptr(entry->pool->tfm); -+ local_unlock(&zswap_comp.lock); + zswap_fill_page(dst, entry->value); kunmap_atomic(dst); - BUG_ON(ret); - BUG_ON(dlen != PAGE_SIZE); -@@ -1074,12 +1084,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, ++ ret = 0; + goto freeentry; } - /* compress */ -- dst = get_cpu_var(zswap_dstmem); -- tfm = *get_cpu_ptr(entry->pool->tfm); -+ local_lock(&zswap_comp.lock); -+ dst = *this_cpu_ptr(&zswap_comp.dstmem); -+ tfm = *this_cpu_ptr(entry->pool->tfm); - src = kmap_atomic(page); - ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); - kunmap_atomic(src); -- put_cpu_ptr(entry->pool->tfm); - if (ret) { - ret = -EINVAL; - goto put_dstmem; -@@ -1103,7 +1113,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, - memcpy(buf, &zhdr, hlen); - memcpy(buf + hlen, dst, dlen); - zpool_unmap_handle(entry->pool->zpool, handle); -- put_cpu_var(zswap_dstmem); -+ local_unlock(&zswap_comp.lock); - - /* populate entry */ - entry->offset = offset; -@@ -1131,7 +1141,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, - return 0; - - put_dstmem: -- put_cpu_var(zswap_dstmem); -+ local_unlock(&zswap_comp.lock); - zswap_pool_put(entry->pool); - freepage: - zswap_entry_cache_free(entry); -@@ -1176,9 +1186,10 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, ++ if (!zpool_can_sleep_mapped(entry->pool->zpool)) { ++ ++ tmp = kmalloc(entry->length, GFP_ATOMIC); ++ if (!tmp) { ++ ret = -ENOMEM; ++ goto freeentry; ++ } ++ } ++ + /* decompress */ + dlen = PAGE_SIZE; + src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); if (zpool_evictable(entry->pool->zpool)) src += sizeof(struct zswap_header); - dst = kmap_atomic(page); -- tfm = *get_cpu_ptr(entry->pool->tfm); -+ local_lock(&zswap_comp.lock); -+ tfm = *this_cpu_ptr(entry->pool->tfm); - ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); -- put_cpu_ptr(entry->pool->tfm); -+ local_unlock(&zswap_comp.lock); - kunmap_atomic(dst); - zpool_unmap_handle(entry->pool->zpool, entry->handle); + ++ if (!zpool_can_sleep_mapped(entry->pool->zpool)) { ++ ++ memcpy(tmp, src, entry->length); ++ src = tmp; ++ ++ zpool_unmap_handle(entry->pool->zpool, entry->handle); ++ } ++ + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + mutex_lock(acomp_ctx->mutex); + sg_init_one(&input, src, entry->length); +@@ -1271,7 +1308,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, + ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + mutex_unlock(acomp_ctx->mutex); + +- zpool_unmap_handle(entry->pool->zpool, entry->handle); ++ if (zpool_can_sleep_mapped(entry->pool->zpool)) ++ zpool_unmap_handle(entry->pool->zpool, entry->handle); ++ else ++ kfree(tmp); ++ BUG_ON(ret); + + freeentry: +@@ -1279,7 +1320,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); + +- return 0; ++ return ret; + } + + /* frees an entry in zswap */ diff --git a/net/Kconfig b/net/Kconfig -index 3831206977a1..81ae878ae553 100644 +index f4c32d982af6..a4b435f393b3 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -282,7 +282,7 @@ config CGROUP_NET_CLASSID @@ -24243,10 +20160,10 @@ index 3831206977a1..81ae878ae553 100644 
config BQL bool diff --git a/net/core/dev.c b/net/core/dev.c -index 4906b44af850..3063e57529c6 100644 +index 449b45b843d4..d6456c255316 100644 --- a/net/core/dev.c +++ b/net/core/dev.c -@@ -219,14 +219,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) +@@ -221,14 +221,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) static inline void rps_lock(struct softnet_data *sd) { #ifdef CONFIG_RPS @@ -24263,7 +20180,7 @@ index 4906b44af850..3063e57529c6 100644 #endif } -@@ -3034,6 +3034,7 @@ static void __netif_reschedule(struct Qdisc *q) +@@ -3041,6 +3041,7 @@ static void __netif_reschedule(struct Qdisc *q) sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); @@ -24271,7 +20188,7 @@ index 4906b44af850..3063e57529c6 100644 } void __netif_schedule(struct Qdisc *q) -@@ -3096,6 +3097,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) +@@ -3103,6 +3104,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); @@ -24279,7 +20196,7 @@ index 4906b44af850..3063e57529c6 100644 } EXPORT_SYMBOL(__dev_kfree_skb_irq); -@@ -3762,7 +3764,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, +@@ -3775,7 +3777,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. */ @@ -24291,7 +20208,7 @@ index 4906b44af850..3063e57529c6 100644 if (unlikely(contended)) spin_lock(&q->busylock); -@@ -4558,6 +4564,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, +@@ -4570,6 +4576,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, rps_unlock(sd); local_irq_restore(flags); @@ -24299,7 +20216,7 @@ index 4906b44af850..3063e57529c6 100644 atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); -@@ -4773,7 +4780,7 @@ static int netif_rx_internal(struct sk_buff *skb) +@@ -4785,7 +4792,7 @@ static int netif_rx_internal(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -24308,7 +20225,7 @@ index 4906b44af850..3063e57529c6 100644 rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); -@@ -4783,14 +4790,14 @@ static int netif_rx_internal(struct sk_buff *skb) +@@ -4795,14 +4802,14 @@ static int netif_rx_internal(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); @@ -24326,7 +20243,7 @@ index 4906b44af850..3063e57529c6 100644 } return ret; } -@@ -4829,11 +4836,9 @@ int netif_rx_ni(struct sk_buff *skb) +@@ -4841,11 +4848,9 @@ int netif_rx_ni(struct sk_buff *skb) trace_netif_rx_ni_entry(skb); @@ -24340,7 +20257,7 @@ index 4906b44af850..3063e57529c6 100644 trace_netif_rx_ni_exit(err); return err; -@@ -6202,12 +6207,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) +@@ -6288,12 +6293,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) sd->rps_ipi_list = NULL; local_irq_enable(); @@ -24355,7 +20272,7 @@ index 4906b44af850..3063e57529c6 100644 } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) -@@ -6285,6 +6292,7 @@ void __napi_schedule(struct napi_struct *n) +@@ -6371,6 +6378,7 @@ void __napi_schedule(struct napi_struct *n) local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); @@ -24363,7 +20280,7 @@ index 
4906b44af850..3063e57529c6 100644 } EXPORT_SYMBOL(__napi_schedule); -@@ -10711,6 +10719,7 @@ static int dev_cpu_dead(unsigned int oldcpu) +@@ -10938,6 +10946,7 @@ static int dev_cpu_dead(unsigned int oldcpu) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); @@ -24371,7 +20288,7 @@ index 4906b44af850..3063e57529c6 100644 #ifdef CONFIG_RPS remsd = oldsd->rps_ipi_list; -@@ -10724,7 +10733,7 @@ static int dev_cpu_dead(unsigned int oldcpu) +@@ -10951,7 +10960,7 @@ static int dev_cpu_dead(unsigned int oldcpu) netif_rx_ni(skb); input_queue_head_incr(oldsd); } @@ -24380,7 +20297,7 @@ index 4906b44af850..3063e57529c6 100644 netif_rx_ni(skb); input_queue_head_incr(oldsd); } -@@ -11040,7 +11049,7 @@ static int __init net_dev_init(void) +@@ -11267,7 +11276,7 @@ static int __init net_dev_init(void) INIT_WORK(flush, flush_backlog); @@ -24390,7 +20307,7 @@ index 4906b44af850..3063e57529c6 100644 #ifdef CONFIG_XFRM_OFFLOAD skb_queue_head_init(&sd->xfrm_backlog); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c -index 80dbf2f4016e..698b02dfeaaf 100644 +index 8e582e29a41e..e51f4854d8b2 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -42,7 +42,7 @@ @@ -24411,7 +20328,7 @@ index 80dbf2f4016e..698b02dfeaaf 100644 struct nlattr *opt) { struct gnet_estimator *parm = nla_data(opt); -@@ -223,7 +223,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, +@@ -226,7 +226,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, @@ -24470,11 +20387,23 @@ index e491b083b348..ef432cea2e10 100644 struct gnet_dump *d, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b) +diff --git a/net/core/skbuff.c b/net/core/skbuff.c +index 785daff48030..e64d0a2e21c3 100644 +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -60,6 +60,7 @@ + #include <linux/prefetch.h> + #include <linux/if_vlan.h> + #include <linux/mpls.h> ++#include <linux/kcov.h> + + #include <net/protocol.h> + #include <net/dst.h> diff --git a/net/core/sock.c b/net/core/sock.c -index 6c5c6b18eff4..dc252f9aaf7e 100644 +index bbcd4b97eddd..dd69aa593639 100644 --- a/net/core/sock.c +++ b/net/core/sock.c -@@ -3049,12 +3049,11 @@ void lock_sock_nested(struct sock *sk, int subclass) +@@ -3050,12 +3050,11 @@ void lock_sock_nested(struct sock *sk, int subclass) if (sk->sk_lock.owned) __lock_sock(sk); sk->sk_lock.owned = 1; @@ -24488,7 +20417,7 @@ index 6c5c6b18eff4..dc252f9aaf7e 100644 } EXPORT_SYMBOL(lock_sock_nested); -@@ -3103,12 +3102,11 @@ bool lock_sock_fast(struct sock *sk) +@@ -3104,13 +3103,12 @@ bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) __lock_sock(sk); sk->sk_lock.owned = 1; @@ -24498,25 +20427,26 @@ index 6c5c6b18eff4..dc252f9aaf7e 100644 * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + __acquire(&sk->sk_lock.slock); - local_bh_enable(); return true; } EXPORT_SYMBOL(lock_sock_fast); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c -index 239e54474b65..fcb105cbb546 100644 +index 45fb450b4522..5fb95030e7c0 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c -@@ -585,7 +585,9 @@ int __inet_hash(struct sock *sk, struct sock *osk) +@@ -635,7 +635,9 @@ int __inet_hash(struct sock *sk, struct sock *osk) int err = 0; if (sk->sk_state != TCP_LISTEN) { + local_bh_disable(); - inet_ehash_nolisten(sk, osk); + inet_ehash_nolisten(sk, osk, 
NULL); + local_bh_enable(); return 0; } WARN_ON(!sk_unhashed(sk)); -@@ -617,11 +619,8 @@ int inet_hash(struct sock *sk) +@@ -667,11 +669,8 @@ int inet_hash(struct sock *sk) { int err = 0; @@ -24529,7 +20459,7 @@ index 239e54474b65..fcb105cbb546 100644 return err; } -@@ -632,17 +631,20 @@ void inet_unhash(struct sock *sk) +@@ -682,17 +681,20 @@ void inet_unhash(struct sock *sk) struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb = NULL; spinlock_t *lock; @@ -24552,7 +20482,7 @@ index 239e54474b65..fcb105cbb546 100644 if (sk_unhashed(sk)) goto unlock; -@@ -655,7 +657,10 @@ void inet_unhash(struct sock *sk) +@@ -705,7 +707,10 @@ void inet_unhash(struct sock *sk) __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); unlock: @@ -24565,10 +20495,10 @@ index 239e54474b65..fcb105cbb546 100644 EXPORT_SYMBOL_GPL(inet_unhash); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c -index 2d3add9e6116..50fd17cbf3ec 100644 +index 55c290d55605..9bad345cba9a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c -@@ -335,11 +335,8 @@ int inet6_hash(struct sock *sk) +@@ -333,11 +333,8 @@ int inet6_hash(struct sock *sk) { int err = 0; @@ -24581,11 +20511,35 @@ index 2d3add9e6116..50fd17cbf3ec 100644 return err; } +diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c +index b31417f40bd5..39943c33abbf 100644 +--- a/net/mac80211/iface.c ++++ b/net/mac80211/iface.c +@@ -15,6 +15,7 @@ + #include <linux/if_arp.h> + #include <linux/netdevice.h> + #include <linux/rtnetlink.h> ++#include <linux/kcov.h> + #include <net/mac80211.h> + #include <net/ieee80211_radiotap.h> + #include "ieee80211_i.h" +diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c +index 972895e9f22d..3527b17f235a 100644 +--- a/net/mac80211/rx.c ++++ b/net/mac80211/rx.c +@@ -17,6 +17,7 @@ + #include <linux/etherdevice.h> + #include <linux/rcupdate.h> + #include <linux/export.h> ++#include <linux/kcov.h> + #include <linux/bitops.h> + #include <net/mac80211.h> + #include <net/ieee80211_radiotap.h> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c -index 2a76a2f5ed88..1542f1a5a31c 100644 +index 6fe4e5cc807c..880d109a1b2d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c -@@ -1257,7 +1257,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, +@@ -1258,7 +1258,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, rcu_assign_pointer(sch->stab, stab); } if (tca[TCA_RATE]) { @@ -24595,7 +20549,7 @@ index 2a76a2f5ed88..1542f1a5a31c 100644 err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c -index 54c417244642..7ce1abfd68a6 100644 +index 49eae93d1489..512a39d6edec 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -553,7 +553,11 @@ struct Qdisc noop_qdisc = { @@ -24610,7 +20564,7 @@ index 54c417244642..7ce1abfd68a6 100644 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, -@@ -858,9 +862,15 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, +@@ -845,9 +849,15 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); @@ -24627,7 +20581,7 @@ index 54c417244642..7ce1abfd68a6 100644 sch->ops = ops; sch->flags = ops->static_flags; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c -index 43cf8dbde898..d5516102491b 100644 +index dcc50ae54550..e4a0dc8f8e40 100644 --- 
a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -422,7 +422,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) @@ -24649,7 +20603,7 @@ index 43cf8dbde898..d5516102491b 100644 } EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c -index efc89a92961d..4e1216d04441 100644 +index d01ca1a18418..14059a9051b8 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -44,7 +44,7 @@ static void xfrm_state_gc_task(struct work_struct *work); @@ -24673,7 +20627,7 @@ index efc89a92961d..4e1216d04441 100644 spin_lock_bh(&net->xfrm.xfrm_state_lock); write_seqcount_begin(&xfrm_state_hash_generation); -@@ -2589,6 +2594,8 @@ int __net_init xfrm_state_init(struct net *net) +@@ -2666,6 +2671,8 @@ int __net_init xfrm_state_init(struct net *net) net->xfrm.state_num = 0; INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); spin_lock_init(&net->xfrm.xfrm_state_lock); @@ -24682,200 +20636,6 @@ index efc89a92961d..4e1216d04441 100644 return 0; out_byspi: -diff --git a/scripts/gdb/linux/dmesg.py b/scripts/gdb/linux/dmesg.py -index 2fa7bb83885f..a92c55bd8de5 100644 ---- a/scripts/gdb/linux/dmesg.py -+++ b/scripts/gdb/linux/dmesg.py -@@ -16,8 +16,13 @@ import sys - - from linux import utils - --printk_log_type = utils.CachedType("struct printk_log") -- -+printk_info_type = utils.CachedType("struct printk_info") -+prb_data_blk_lpos_type = utils.CachedType("struct prb_data_blk_lpos") -+prb_desc_type = utils.CachedType("struct prb_desc") -+prb_desc_ring_type = utils.CachedType("struct prb_desc_ring") -+prb_data_ring_type = utils.CachedType("struct prb_data_ring") -+printk_ringbuffer_type = utils.CachedType("struct printk_ringbuffer") -+atomic_long_type = utils.CachedType("atomic_long_t") - - class LxDmesg(gdb.Command): - """Print Linux kernel log buffer.""" -@@ -26,44 +31,110 @@ class LxDmesg(gdb.Command): - super(LxDmesg, self).__init__("lx-dmesg", gdb.COMMAND_DATA) - - def invoke(self, arg, from_tty): -- log_buf_addr = int(str(gdb.parse_and_eval( -- "(void *)'printk.c'::log_buf")).split()[0], 16) -- log_first_idx = int(gdb.parse_and_eval("'printk.c'::log_first_idx")) -- log_next_idx = int(gdb.parse_and_eval("'printk.c'::log_next_idx")) -- log_buf_len = int(gdb.parse_and_eval("'printk.c'::log_buf_len")) -- - inf = gdb.inferiors()[0] -- start = log_buf_addr + log_first_idx -- if log_first_idx < log_next_idx: -- log_buf_2nd_half = -1 -- length = log_next_idx - log_first_idx -- log_buf = utils.read_memoryview(inf, start, length).tobytes() -- else: -- log_buf_2nd_half = log_buf_len - log_first_idx -- a = utils.read_memoryview(inf, start, log_buf_2nd_half) -- b = utils.read_memoryview(inf, log_buf_addr, log_next_idx) -- log_buf = a.tobytes() + b.tobytes() -- -- length_offset = printk_log_type.get_type()['len'].bitpos // 8 -- text_len_offset = printk_log_type.get_type()['text_len'].bitpos // 8 -- time_stamp_offset = printk_log_type.get_type()['ts_nsec'].bitpos // 8 -- text_offset = printk_log_type.get_type().sizeof -- -- pos = 0 -- while pos < log_buf.__len__(): -- length = utils.read_u16(log_buf, pos + length_offset) -- if length == 0: -- if log_buf_2nd_half == -1: -- gdb.write("Corrupted log buffer!\n") -+ -+ # read in prb structure -+ prb_addr = int(str(gdb.parse_and_eval("(void *)'printk.c'::prb")).split()[0], 16) -+ sz = printk_ringbuffer_type.get_type().sizeof -+ prb = utils.read_memoryview(inf, prb_addr, sz).tobytes() -+ -+ # read in descriptor ring structure -+ off = printk_ringbuffer_type.get_type()['desc_ring'].bitpos // 8 -+ addr = prb_addr + off 
-+ sz = prb_desc_ring_type.get_type().sizeof -+ desc_ring = utils.read_memoryview(inf, addr, sz).tobytes() -+ -+ # read in descriptor array -+ off = prb_desc_ring_type.get_type()['count_bits'].bitpos // 8 -+ desc_ring_count = 1 << utils.read_u32(desc_ring, off) -+ desc_sz = prb_desc_type.get_type().sizeof -+ off = prb_desc_ring_type.get_type()['descs'].bitpos // 8 -+ addr = utils.read_ulong(desc_ring, off) -+ descs = utils.read_memoryview(inf, addr, desc_sz * desc_ring_count).tobytes() -+ -+ # read in info array -+ info_sz = printk_info_type.get_type().sizeof -+ off = prb_desc_ring_type.get_type()['infos'].bitpos // 8 -+ addr = utils.read_ulong(desc_ring, off) -+ infos = utils.read_memoryview(inf, addr, info_sz * desc_ring_count).tobytes() -+ -+ # read in text data ring structure -+ off = printk_ringbuffer_type.get_type()['text_data_ring'].bitpos // 8 -+ addr = prb_addr + off -+ sz = prb_data_ring_type.get_type().sizeof -+ text_data_ring = utils.read_memoryview(inf, addr, sz).tobytes() -+ -+ # read in text data -+ off = prb_data_ring_type.get_type()['size_bits'].bitpos // 8 -+ text_data_sz = 1 << utils.read_u32(text_data_ring, off) -+ off = prb_data_ring_type.get_type()['data'].bitpos // 8 -+ addr = utils.read_ulong(text_data_ring, off) -+ text_data = utils.read_memoryview(inf, addr, text_data_sz).tobytes() -+ -+ counter_off = atomic_long_type.get_type()['counter'].bitpos // 8 -+ -+ sv_off = prb_desc_type.get_type()['state_var'].bitpos // 8 -+ -+ off = prb_desc_type.get_type()['text_blk_lpos'].bitpos // 8 -+ begin_off = off + (prb_data_blk_lpos_type.get_type()['begin'].bitpos // 8) -+ next_off = off + (prb_data_blk_lpos_type.get_type()['next'].bitpos // 8) -+ -+ ts_off = printk_info_type.get_type()['ts_nsec'].bitpos // 8 -+ len_off = printk_info_type.get_type()['text_len'].bitpos // 8 -+ -+ # definitions from kernel/printk/printk_ringbuffer.h -+ desc_committed = 1 -+ desc_finalized = 2 -+ desc_sv_bits = utils.get_long_type().sizeof * 8 -+ desc_flags_shift = desc_sv_bits - 2 -+ desc_flags_mask = 3 << desc_flags_shift -+ desc_id_mask = ~desc_flags_mask -+ -+ # read in tail and head descriptor ids -+ off = prb_desc_ring_type.get_type()['tail_id'].bitpos // 8 -+ tail_id = utils.read_u64(desc_ring, off + counter_off) -+ off = prb_desc_ring_type.get_type()['head_id'].bitpos // 8 -+ head_id = utils.read_u64(desc_ring, off + counter_off) -+ -+ did = tail_id -+ while True: -+ ind = did % desc_ring_count -+ desc_off = desc_sz * ind -+ info_off = info_sz * ind -+ -+ # skip non-committed record -+ state = 3 & (utils.read_u64(descs, desc_off + sv_off + -+ counter_off) >> desc_flags_shift) -+ if state != desc_committed and state != desc_finalized: -+ if did == head_id: - break -- pos = log_buf_2nd_half -+ did = (did + 1) & desc_id_mask - continue - -- text_len = utils.read_u16(log_buf, pos + text_len_offset) -- text_start = pos + text_offset -- text = log_buf[text_start:text_start + text_len].decode( -- encoding='utf8', errors='replace') -- time_stamp = utils.read_u64(log_buf, pos + time_stamp_offset) -+ begin = utils.read_ulong(descs, desc_off + begin_off) % text_data_sz -+ end = utils.read_ulong(descs, desc_off + next_off) % text_data_sz -+ -+ # handle data-less record -+ if begin & 1 == 1: -+ text = "" -+ else: -+ # handle wrapping data block -+ if begin > end: -+ begin = 0 -+ -+ # skip over descriptor id -+ text_start = begin + utils.get_long_type().sizeof -+ -+ text_len = utils.read_u16(infos, info_off + len_off) -+ -+ # handle truncated message -+ if end - text_start < text_len: -+ text_len = end 
- text_start -+ -+ text = text_data[text_start:text_start + text_len].decode( -+ encoding='utf8', errors='replace') -+ -+ time_stamp = utils.read_u64(infos, info_off + ts_off) - - for line in text.splitlines(): - msg = u"[{time:12.6f}] {line}\n".format( -@@ -75,7 +146,9 @@ class LxDmesg(gdb.Command): - msg = msg.encode(encoding='utf8', errors='replace') - gdb.write(msg) - -- pos += length -+ if did == head_id: -+ break -+ did = (did + 1) & desc_id_mask - - - LxDmesg() -diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py -index ea94221dbd39..ff7c1799d588 100644 ---- a/scripts/gdb/linux/utils.py -+++ b/scripts/gdb/linux/utils.py -@@ -123,6 +123,13 @@ def read_u64(buffer, offset): - return read_u32(buffer, offset + 4) + (read_u32(buffer, offset) << 32) - - -+def read_ulong(buffer, offset): -+ if get_long_type().sizeof == 8: -+ return read_u64(buffer, offset) -+ else: -+ return read_u32(buffer, offset) -+ -+ - target_arch = None - - -- -2.28.0 +2.30.1 diff --git a/patches/soc/ti/beagleboard_dtbs/0001-Add-BeagleBoard.org-DTBS-v5.12.x.patch b/patches/soc/ti/beagleboard_dtbs/0001-Add-BeagleBoard.org-DTBS-v5.12.x.patch index 3e77de024..a1b2d7446 100644 --- a/patches/soc/ti/beagleboard_dtbs/0001-Add-BeagleBoard.org-DTBS-v5.12.x.patch +++ b/patches/soc/ti/beagleboard_dtbs/0001-Add-BeagleBoard.org-DTBS-v5.12.x.patch @@ -1,6 +1,6 @@ -From 70f279224356c652d4024010a1c7312094fd393a Mon Sep 17 00:00:00 2001 +From 1b1649f9a498cfb80ef7cda8e0c08edd1f1b9224 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Thu, 4 Mar 2021 10:44:43 -0600 +Date: Mon, 8 Mar 2021 12:05:04 -0600 Subject: [PATCH] Add BeagleBoard.org DTBS: v5.12.x https://github.com/beagleboard/BeagleBoard-DeviceTrees/tree/v5.12.x diff --git a/patches/wpanusb/0001-merge-wpanusb-https-github.com-statropy-wpanusb.patch b/patches/wpanusb/0001-merge-wpanusb-https-github.com-statropy-wpanusb.patch index e954f2bb2..680f89877 100644 --- a/patches/wpanusb/0001-merge-wpanusb-https-github.com-statropy-wpanusb.patch +++ b/patches/wpanusb/0001-merge-wpanusb-https-github.com-statropy-wpanusb.patch @@ -1,6 +1,6 @@ -From ff9110136d3bcdcff32055e24bd85679a97152c2 Mon Sep 17 00:00:00 2001 +From b1b70b85665c47c8402a7769556352e064c8ef19 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Thu, 4 Mar 2021 10:43:24 -0600 +Date: Mon, 8 Mar 2021 12:04:46 -0600 Subject: [PATCH] merge: wpanusb: https://github.com/statropy/wpanusb https://github.com/statropy/wpanusb/commit/7ba5f3d24d95f804e80b2d8d28e35b34c15219c2 diff --git a/version.sh b/version.sh index 9e54057a9..19f6b50f4 100644 --- a/version.sh +++ b/version.sh @@ -42,10 +42,10 @@ toolchain="gcc_arm_gnueabihf_10" #Kernel KERNEL_REL=5.12 -KERNEL_TAG=${KERNEL_REL}-rc1 +KERNEL_TAG=${KERNEL_REL}-rc2 kernel_rt=".x-rty" #Kernel Build -BUILD=${build_prefix}1 +BUILD=${build_prefix}1.1 #v5.X-rcX + upto SHA #prev_KERNEL_SHA="" -- GitLab