From ffcf8fde8a658d34c191ed225ee9e5af4b5d2917 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> Date: Tue, 25 Jan 2022 09:46:51 -0600 Subject: [PATCH] kernel v5.16.2 rebase with: device-tree/etc AUFS: https://github.com/sfjro/aufs5-standalone/commit/41291d0ae20e0a803a7d9c6cccd1bb3525f6439b BBDTBS: https://github.com/beagleboard/BeagleBoard-DeviceTrees/commit/6c03c4a088a5c84808617a169b598185dc975e0b TI_AMX3_CM3: http://git.ti.com/gitweb/?p=processor-firmware/ti-amx3-cm3-pm-firmware.git;a=commit;h=fb484c5e54f2e31cf0a338d2927a06a2870bcc2c WPANUSB: https://github.com/statropy/wpanusb/commit/251f0167545bf2dcaa3cad991a59dbf5ab05490a BCFSERIAL: https://github.com/statropy/bcfserial/commit/aded88429a8a00143596b41f4c1f50d9ae3d4069 WIRELESS_REGDB: https://git.kernel.org/pub/scm/linux/kernel/git/sforshee/wireless-regdb.git/commit/?id=2ce78ed90f71955f7b223c17b5cda6c8a7708efe Signed-off-by: Robert Nelson <robertcnelson@gmail.com> --- patch.sh | 22 +- patches/aufs/0001-merge-aufs-kbuild.patch | 6 +- patches/aufs/0002-merge-aufs-base.patch | 46 +- patches/aufs/0003-merge-aufs-mmap.patch | 58 +- patches/aufs/0004-merge-aufs-standalone.patch | 55 +- patches/aufs/0005-merge-aufs.patch | 41 +- ...-https-github.com-statropy-bcfserial.patch | 4 +- patches/defconfig | 19 +- patches/git/AUFS | 2 +- patches/ref_omap2plus_defconfig | 3 +- ...01-merge-CONFIG_PREEMPT_RT-Patch-Set.patch | 12331 ++++++++-------- ...-BeagleBoard.org-Device-Tree-Changes.patch | 4 +- ...eless-regdb-regulatory-database-file.patch | 4 +- ...sb-https-github.com-statropy-wpanusb.patch | 4 +- version.sh | 6 +- 15 files changed, 6242 insertions(+), 6363 deletions(-) diff --git a/patch.sh b/patch.sh index acd44c1ca..b45705de7 100644 --- a/patch.sh +++ b/patch.sh @@ -166,18 +166,18 @@ aufs () { ${git_bin} add . ${git_bin} commit -a -m 'merge: aufs' -m "https://github.com/sfjro/${aufs_prefix}standalone/commit/${aufs_hash}" -s - wget https://raw.githubusercontent.com/sfjro/${aufs_prefix}standalone/aufs${KERNEL_REL}/rt.patch - patch -p1 < rt.patch || aufs_fail - rm -rf rt.patch - ${git_bin} add . - ${git_bin} commit -a -m 'merge: aufs-rt' -s + #wget https://raw.githubusercontent.com/sfjro/${aufs_prefix}standalone/aufs${KERNEL_REL}/rt.patch + #patch -p1 < rt.patch || aufs_fail + #rm -rf rt.patch + #${git_bin} add . + #${git_bin} commit -a -m 'merge: aufs-rt' -s - ${git_bin} format-patch -6 -o ../patches/aufs/ + ${git_bin} format-patch -5 -o ../patches/aufs/ echo "AUFS: https://github.com/sfjro/${aufs_prefix}standalone/commit/${aufs_hash}" > ../patches/git/AUFS rm -rf ../${aufs_prefix}standalone/ || true - ${git_bin} reset --hard HEAD~6 + ${git_bin} reset --hard HEAD~5 start_cleanup @@ -186,10 +186,10 @@ aufs () { ${git} "${DIR}/patches/aufs/0003-merge-aufs-mmap.patch" ${git} "${DIR}/patches/aufs/0004-merge-aufs-standalone.patch" ${git} "${DIR}/patches/aufs/0005-merge-aufs.patch" - ${git} "${DIR}/patches/aufs/0006-merge-aufs-rt.patch" + #${git} "${DIR}/patches/aufs/0006-merge-aufs-rt.patch" wdir="aufs" - number=6 + number=5 cleanup fi @@ -480,7 +480,7 @@ local_patch () { } #external_git -#aufs +aufs wpanusb bcfserial #rt @@ -597,7 +597,7 @@ fixes packaging () { #do_backport="enable" if [ "x${do_backport}" = "xenable" ] ; then - backport_tag="v5.15.13" + backport_tag="v5.15.15" subsystem="bindeb-pkg" #regenerate="enable" diff --git a/patches/aufs/0001-merge-aufs-kbuild.patch b/patches/aufs/0001-merge-aufs-kbuild.patch index 852a1f7bb..ae7312dd5 100644 --- a/patches/aufs/0001-merge-aufs-kbuild.patch +++ b/patches/aufs/0001-merge-aufs-kbuild.patch @@ -1,7 +1,7 @@ -From 0912d4b0e400dd3059333e399a2eba7ad1d8421b Mon Sep 17 00:00:00 2001 +From e4b4265c962801701b45435be2602cd8371dc881 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Fri, 31 Dec 2021 10:14:07 -0600 -Subject: [PATCH 1/6] merge: aufs-kbuild +Date: Tue, 25 Jan 2022 09:29:34 -0600 +Subject: [PATCH 1/5] merge: aufs-kbuild Signed-off-by: Robert Nelson <robertcnelson@gmail.com> --- diff --git a/patches/aufs/0002-merge-aufs-base.patch b/patches/aufs/0002-merge-aufs-base.patch index 1c152c415..a760e260f 100644 --- a/patches/aufs/0002-merge-aufs-base.patch +++ b/patches/aufs/0002-merge-aufs-base.patch @@ -1,7 +1,7 @@ -From b736001ccc09929d1f2221394ad6543be0890362 Mon Sep 17 00:00:00 2001 +From c20b7877a692cb446f61bf3c0fdbbc4f92e82fc2 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Fri, 31 Dec 2021 10:14:07 -0600 -Subject: [PATCH 2/6] merge: aufs-base +Date: Tue, 25 Jan 2022 09:29:35 -0600 +Subject: [PATCH 2/5] merge: aufs-base Signed-off-by: Robert Nelson <robertcnelson@gmail.com> --- @@ -11,19 +11,18 @@ Signed-off-by: Robert Nelson <robertcnelson@gmail.com> fs/fcntl.c | 4 +++- fs/namespace.c | 6 ++++++ fs/splice.c | 10 +++++----- - fs/sync.c | 2 +- include/linux/fs.h | 4 ++++ include/linux/lockdep.h | 2 ++ include/linux/mnt_namespace.h | 3 +++ include/linux/splice.h | 6 ++++++ kernel/locking/lockdep.c | 3 ++- - 12 files changed, 64 insertions(+), 9 deletions(-) + 11 files changed, 63 insertions(+), 8 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS -index 3b79fd441dde..3fb0a57f61b0 100644 +index dd36acc87ce6..0cfff91bfa5e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -3122,6 +3122,19 @@ F: include/uapi/linux/audit.h +@@ -3179,6 +3179,19 @@ F: include/uapi/linux/audit.h F: kernel/audit* F: lib/*audit.c @@ -44,10 +43,10 @@ index 3b79fd441dde..3fb0a57f61b0 100644 M: Miguel Ojeda <ojeda@kernel.org> S: Maintained diff --git a/drivers/block/loop.c b/drivers/block/loop.c -index c00ae30fde89..667e1307806f 100644 +index c3a36cfaa855..4bcea5a2fc25 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c -@@ -798,6 +798,24 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, +@@ -635,6 +635,24 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, return error; } @@ -108,7 +107,7 @@ index 9c6c6a3e2de5..02382fa9bd34 100644 return error; diff --git a/fs/namespace.c b/fs/namespace.c -index 659a8f39c61a..128367073782 100644 +index b696543adab8..c45740054bc7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -808,6 +808,12 @@ static inline int check_mnt(struct mount *mnt) @@ -152,24 +151,11 @@ index 5dbce4dcc1a7..3e6ba363b777 100644 { unsigned int p_space; int ret; -diff --git a/fs/sync.c b/fs/sync.c -index 1373a610dc78..b7b5a0a0df6f 100644 ---- a/fs/sync.c -+++ b/fs/sync.c -@@ -28,7 +28,7 @@ - * wait == 1 case since in that case write_inode() functions do - * sync_dirty_buffer() and thus effectively write one block at a time. - */ --static int __sync_filesystem(struct super_block *sb, int wait) -+int __sync_filesystem(struct super_block *sb, int wait) - { - if (wait) - sync_inodes_sb(sb); diff --git a/include/linux/fs.h b/include/linux/fs.h -index 56eba723477e..e60d8ad85400 100644 +index bbf812ce89a8..ac21259fbeda 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h -@@ -1381,6 +1381,7 @@ extern void fasync_free(struct fasync_struct *); +@@ -1378,6 +1378,7 @@ extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); @@ -177,7 +163,7 @@ index 56eba723477e..e60d8ad85400 100644 extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, unsigned long arg, int force); extern void f_delown(struct file *filp); -@@ -2092,6 +2093,7 @@ struct file_operations { +@@ -2091,6 +2092,7 @@ struct file_operations { ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); @@ -185,7 +171,7 @@ index 56eba723477e..e60d8ad85400 100644 int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); -@@ -2615,6 +2617,7 @@ extern int current_umask(void); +@@ -2613,6 +2615,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); extern int generic_update_time(struct inode *, struct timespec64 *, int); @@ -193,7 +179,7 @@ index 56eba723477e..e60d8ad85400 100644 /* /sys/fs */ extern struct kobject *fs_kobj; -@@ -2778,6 +2781,7 @@ static inline bool sb_is_blkdev_sb(struct super_block *sb) +@@ -2776,6 +2779,7 @@ static inline bool sb_is_blkdev_sb(struct super_block *sb) } void emergency_thaw_all(void); @@ -202,7 +188,7 @@ index 56eba723477e..e60d8ad85400 100644 extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 9fe165beb0f9..e47f7e15eeaf 100644 +index 467b94257105..dab20282c82e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -248,6 +248,8 @@ static inline int lockdep_match_key(struct lockdep_map *lock, @@ -250,7 +236,7 @@ index a55179fd60fc..8e21c53cf883 100644 + unsigned int flags); #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index d624231eab2b..eff2ca26e6c5 100644 +index 2270ec68f10a..fbb8e650c174 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -189,7 +189,7 @@ static diff --git a/patches/aufs/0003-merge-aufs-mmap.patch b/patches/aufs/0003-merge-aufs-mmap.patch index 3a1127481..0de213cf9 100644 --- a/patches/aufs/0003-merge-aufs-mmap.patch +++ b/patches/aufs/0003-merge-aufs-mmap.patch @@ -1,7 +1,7 @@ -From 866625ed09e77b9fc63f4c2c0fb56333408519ed Mon Sep 17 00:00:00 2001 +From c24226294a36a11a1c6a6ffba175ef89fce5cddd Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Fri, 31 Dec 2021 10:14:08 -0600 -Subject: [PATCH 3/6] merge: aufs-mmap +Date: Tue, 25 Jan 2022 09:29:35 -0600 +Subject: [PATCH 3/5] merge: aufs-mmap Signed-off-by: Robert Nelson <robertcnelson@gmail.com> --- @@ -21,10 +21,10 @@ Signed-off-by: Robert Nelson <robertcnelson@gmail.com> create mode 100644 mm/prfile.c diff --git a/fs/proc/base.c b/fs/proc/base.c -index 1f394095eb88..93f2479ef319 100644 +index 13eda8de2998..24fd5e986cb7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c -@@ -2189,7 +2189,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) +@@ -2191,7 +2191,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { @@ -50,7 +50,7 @@ index 13452b32e2bd..38acccfef9d4 100644 ino = inode->i_ino; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c -index cf25be3e0321..70af1c25ffa2 100644 +index ad667dbc96f5..d50423613c8e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -280,7 +280,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) @@ -65,7 +65,7 @@ index cf25be3e0321..70af1c25ffa2 100644 dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; -@@ -1865,7 +1868,7 @@ static int show_numa_map(struct seq_file *m, void *v) +@@ -1869,7 +1872,7 @@ static int show_numa_map(struct seq_file *m, void *v) struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; @@ -91,10 +91,10 @@ index a6d21fc0033c..02c2de31196e 100644 ino = inode->i_ino; pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; diff --git a/include/linux/mm.h b/include/linux/mm.h -index 73a52aba448f..5dd42acf0707 100644 +index a7e4a9e7d807..30699240b45d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h -@@ -1806,6 +1806,28 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, +@@ -1926,6 +1926,28 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, unmap_mapping_range(mapping, holebegin, holelen, 0); } @@ -124,10 +124,10 @@ index 73a52aba448f..5dd42acf0707 100644 void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 7f8ee09c711f..3a9a798a4ae1 100644 +index c3a6e6209600..45f5754d51cd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h -@@ -294,6 +294,7 @@ struct vm_region { +@@ -370,6 +370,7 @@ struct vm_region { unsigned long vm_top; /* region allocated to here */ unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ @@ -135,7 +135,7 @@ index 7f8ee09c711f..3a9a798a4ae1 100644 int vm_usage; /* region usage count (access under nommu_region_sem) */ bool vm_icache_flushed : 1; /* true if the icache has been flushed for -@@ -373,6 +374,7 @@ struct vm_area_struct { +@@ -449,6 +450,7 @@ struct vm_area_struct { unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE units */ struct file * vm_file; /* File we map to (can be NULL). */ @@ -144,10 +144,10 @@ index 7f8ee09c711f..3a9a798a4ae1 100644 #ifdef CONFIG_SWAP diff --git a/kernel/fork.c b/kernel/fork.c -index 10885c649ca4..f3a9cd12011b 100644 +index 3244cc56b697..ae63cce182fd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c -@@ -573,7 +573,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, +@@ -572,7 +572,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (file) { struct address_space *mapping = file->f_mapping; @@ -157,7 +157,7 @@ index 10885c649ca4..f3a9cd12011b 100644 if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); diff --git a/mm/Makefile b/mm/Makefile -index fc60a40ce954..c715b0138237 100644 +index d6c0042e3aa0..1f11c655a632 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ @@ -170,10 +170,10 @@ index fc60a40ce954..c715b0138237 100644 # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/filemap.c b/mm/filemap.c -index 82a17c35eb96..348e22067abd 100644 +index 39c4c46c6133..c9491cd04c85 100644 --- a/mm/filemap.c +++ b/mm/filemap.c -@@ -3349,7 +3349,7 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) +@@ -3356,7 +3356,7 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); @@ -183,7 +183,7 @@ index 82a17c35eb96..348e22067abd 100644 if (page->mapping != mapping) { unlock_page(page); diff --git a/mm/mmap.c b/mm/mmap.c -index 88dcc5c25225..6c276614ca96 100644 +index bfb0ea164a90..6bbc17d4733d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -183,7 +183,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) @@ -204,7 +204,7 @@ index 88dcc5c25225..6c276614ca96 100644 } if (next->anon_vma) anon_vma_merge(vma, next); -@@ -1873,7 +1873,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, +@@ -1872,7 +1872,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return addr; unmap_and_free_vma: @@ -213,7 +213,7 @@ index 88dcc5c25225..6c276614ca96 100644 vma->vm_file = NULL; /* Undo any partial mapping done by a device driver. */ -@@ -2731,7 +2731,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +@@ -2730,7 +2730,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, goto out_free_mpol; if (new->vm_file) @@ -222,7 +222,7 @@ index 88dcc5c25225..6c276614ca96 100644 if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); -@@ -2750,7 +2750,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +@@ -2749,7 +2749,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); if (new->vm_file) @@ -231,7 +231,7 @@ index 88dcc5c25225..6c276614ca96 100644 unlink_anon_vmas(new); out_free_mpol: mpol_put(vma_policy(new)); -@@ -2945,7 +2945,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, +@@ -2944,7 +2944,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, struct vm_area_struct *vma; unsigned long populate = 0; unsigned long ret = -EINVAL; @@ -240,7 +240,7 @@ index 88dcc5c25225..6c276614ca96 100644 pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n", current->comm, current->pid); -@@ -3001,10 +3001,27 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, +@@ -3000,10 +3000,27 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; @@ -269,7 +269,7 @@ index 88dcc5c25225..6c276614ca96 100644 out: mmap_write_unlock(mm); if (populate) -@@ -3285,7 +3302,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, +@@ -3284,7 +3301,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) @@ -279,10 +279,10 @@ index 88dcc5c25225..6c276614ca96 100644 new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); diff --git a/mm/nommu.c b/mm/nommu.c -index 02d2427b8f9e..a7419302ab4e 100644 +index 55a9e48a7a02..8dc77ce96335 100644 --- a/mm/nommu.c +++ b/mm/nommu.c -@@ -523,7 +523,7 @@ static void __put_nommu_region(struct vm_region *region) +@@ -522,7 +522,7 @@ static void __put_nommu_region(struct vm_region *region) up_write(&nommu_region_sem); if (region->vm_file) @@ -291,7 +291,7 @@ index 02d2427b8f9e..a7419302ab4e 100644 /* IO memory and memory shared directly out of the pagecache * from ramfs/tmpfs mustn't be released here */ -@@ -655,7 +655,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) +@@ -654,7 +654,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) @@ -300,7 +300,7 @@ index 02d2427b8f9e..a7419302ab4e 100644 put_nommu_region(vma->vm_region); vm_area_free(vma); } -@@ -1175,7 +1175,7 @@ unsigned long do_mmap(struct file *file, +@@ -1174,7 +1174,7 @@ unsigned long do_mmap(struct file *file, goto error_just_free; } } @@ -309,7 +309,7 @@ index 02d2427b8f9e..a7419302ab4e 100644 kmem_cache_free(vm_region_jar, region); region = pregion; result = start; -@@ -1252,10 +1252,10 @@ unsigned long do_mmap(struct file *file, +@@ -1251,10 +1251,10 @@ unsigned long do_mmap(struct file *file, up_write(&nommu_region_sem); error: if (region->vm_file) diff --git a/patches/aufs/0004-merge-aufs-standalone.patch b/patches/aufs/0004-merge-aufs-standalone.patch index c24430fcc..fefb19738 100644 --- a/patches/aufs/0004-merge-aufs-standalone.patch +++ b/patches/aufs/0004-merge-aufs-standalone.patch @@ -1,7 +1,7 @@ -From 5cac58abf4f5e33f7b0159eb3472b040273291df Mon Sep 17 00:00:00 2001 +From aba885d6b8de7c3944753990927cb4c2d9e3dfc1 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Fri, 31 Dec 2021 10:14:08 -0600 -Subject: [PATCH 4/6] merge: aufs-standalone +Date: Tue, 25 Jan 2022 09:29:35 -0600 +Subject: [PATCH 4/5] merge: aufs-standalone Signed-off-by: Robert Nelson <robertcnelson@gmail.com> --- @@ -14,12 +14,11 @@ Signed-off-by: Robert Nelson <robertcnelson@gmail.com> fs/open.c | 1 + fs/read_write.c | 2 ++ fs/splice.c | 2 ++ - fs/sync.c | 1 + fs/xattr.c | 1 + kernel/locking/lockdep.c | 1 + kernel/task_work.c | 1 + security/security.c | 8 ++++++++ - 14 files changed, 27 insertions(+) + 13 files changed, 26 insertions(+) diff --git a/fs/dcache.c b/fs/dcache.c index bc5095b734f5..9508bd57a3bc 100644 @@ -42,7 +41,7 @@ index bc5095b734f5..9508bd57a3bc 100644 /** * d_ancestor - search for an ancestor diff --git a/fs/exec.c b/fs/exec.c -index ac7b51b51f38..52a8be4ebc1e 100644 +index 537d92c41105..0ab811e0fdaa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -111,6 +111,7 @@ bool path_noexec(const struct path *path) @@ -86,7 +85,7 @@ index 45437f8e1003..786af52904fc 100644 void __init files_init(void) { diff --git a/fs/namespace.c b/fs/namespace.c -index 128367073782..db9936562011 100644 +index c45740054bc7..d3d750635610 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -439,6 +439,7 @@ void __mnt_drop_write(struct vfsmount *mnt) @@ -114,7 +113,7 @@ index 128367073782..db9936562011 100644 static void lock_mnt_tree(struct mount *mnt) { diff --git a/fs/notify/group.c b/fs/notify/group.c -index fb89c351295d..460ad19c2570 100644 +index 6a297efc4788..ed394ccb10e0 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -100,6 +100,7 @@ void fsnotify_get_group(struct fsnotify_group *group) @@ -126,7 +125,7 @@ index fb89c351295d..460ad19c2570 100644 /* * Drop a reference to a group. Free it if it's through. diff --git a/fs/open.c b/fs/open.c -index e0df1536eb69..81b2d7c83add 100644 +index f732fb94600c..ca33d86e73fd 100644 --- a/fs/open.c +++ b/fs/open.c @@ -65,6 +65,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, @@ -138,10 +137,10 @@ index e0df1536eb69..81b2d7c83add 100644 long vfs_truncate(const struct path *path, loff_t length) { diff --git a/fs/read_write.c b/fs/read_write.c -index af057c57bdc6..76017f8331fb 100644 +index 0074afa7ecb3..612cf04d9f6b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c -@@ -492,6 +492,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) +@@ -488,6 +488,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) inc_syscr(current); return ret; } @@ -149,7 +148,7 @@ index af057c57bdc6..76017f8331fb 100644 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { -@@ -602,6 +603,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ +@@ -598,6 +599,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ file_end_write(file); return ret; } @@ -177,18 +176,6 @@ index 3e6ba363b777..7c1be373eb7c 100644 /** * splice_direct_to_actor - splices data directly between two non-pipes -diff --git a/fs/sync.c b/fs/sync.c -index b7b5a0a0df6f..fa5c7fba7f1b 100644 ---- a/fs/sync.c -+++ b/fs/sync.c -@@ -39,6 +39,7 @@ int __sync_filesystem(struct super_block *sb, int wait) - sb->s_op->sync_fs(sb, wait); - return __sync_blockdev(sb->s_bdev, wait); - } -+EXPORT_SYMBOL_GPL(__sync_filesystem); - - /* - * Write out and wait upon all dirty data associated with this diff --git a/fs/xattr.c b/fs/xattr.c index 5c8c5175b385..ff7e9ff774b7 100644 --- a/fs/xattr.c @@ -202,7 +189,7 @@ index 5c8c5175b385..ff7e9ff774b7 100644 ssize_t __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index eff2ca26e6c5..0627584f7872 100644 +index fbb8e650c174..49c4d11b0893 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -210,6 +210,7 @@ inline struct lock_class *lockdep_hlock_class(struct held_lock *hlock) @@ -223,10 +210,10 @@ index 1698fbe6f0e1..081b05acadf8 100644 } +EXPORT_SYMBOL_GPL(task_work_run); diff --git a/security/security.c b/security/security.c -index 67264cb08fb3..0d78065d71fe 100644 +index c88167a414b4..125724525d5c 100644 --- a/security/security.c +++ b/security/security.c -@@ -1147,6 +1147,7 @@ int security_path_rmdir(const struct path *dir, struct dentry *dentry) +@@ -1148,6 +1148,7 @@ int security_path_rmdir(const struct path *dir, struct dentry *dentry) return 0; return call_int_hook(path_rmdir, 0, dir, dentry); } @@ -234,7 +221,7 @@ index 67264cb08fb3..0d78065d71fe 100644 int security_path_unlink(const struct path *dir, struct dentry *dentry) { -@@ -1163,6 +1164,7 @@ int security_path_symlink(const struct path *dir, struct dentry *dentry, +@@ -1164,6 +1165,7 @@ int security_path_symlink(const struct path *dir, struct dentry *dentry, return 0; return call_int_hook(path_symlink, 0, dir, dentry, old_name); } @@ -242,7 +229,7 @@ index 67264cb08fb3..0d78065d71fe 100644 int security_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) -@@ -1171,6 +1173,7 @@ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, +@@ -1172,6 +1174,7 @@ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, return 0; return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry); } @@ -250,7 +237,7 @@ index 67264cb08fb3..0d78065d71fe 100644 int security_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, -@@ -1198,6 +1201,7 @@ int security_path_truncate(const struct path *path) +@@ -1199,6 +1202,7 @@ int security_path_truncate(const struct path *path) return 0; return call_int_hook(path_truncate, 0, path); } @@ -258,7 +245,7 @@ index 67264cb08fb3..0d78065d71fe 100644 int security_path_chmod(const struct path *path, umode_t mode) { -@@ -1205,6 +1209,7 @@ int security_path_chmod(const struct path *path, umode_t mode) +@@ -1206,6 +1210,7 @@ int security_path_chmod(const struct path *path, umode_t mode) return 0; return call_int_hook(path_chmod, 0, path, mode); } @@ -266,7 +253,7 @@ index 67264cb08fb3..0d78065d71fe 100644 int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { -@@ -1212,6 +1217,7 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) +@@ -1213,6 +1218,7 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) return 0; return call_int_hook(path_chown, 0, path, uid, gid); } @@ -274,7 +261,7 @@ index 67264cb08fb3..0d78065d71fe 100644 int security_path_chroot(const struct path *path) { -@@ -1312,6 +1318,7 @@ int security_inode_permission(struct inode *inode, int mask) +@@ -1313,6 +1319,7 @@ int security_inode_permission(struct inode *inode, int mask) return 0; return call_int_hook(inode_permission, 0, inode, mask); } @@ -282,7 +269,7 @@ index 67264cb08fb3..0d78065d71fe 100644 int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { -@@ -1509,6 +1516,7 @@ int security_file_permission(struct file *file, int mask) +@@ -1510,6 +1517,7 @@ int security_file_permission(struct file *file, int mask) return fsnotify_perm(file, mask); } diff --git a/patches/aufs/0005-merge-aufs.patch b/patches/aufs/0005-merge-aufs.patch index 2761d996d..0897846ba 100644 --- a/patches/aufs/0005-merge-aufs.patch +++ b/patches/aufs/0005-merge-aufs.patch @@ -1,9 +1,9 @@ -From c52a4ad22612ae69f204abee2e092cb0433ea4e2 Mon Sep 17 00:00:00 2001 +From ac9a8076d11453424fa21a10d845eee81cd00178 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Fri, 31 Dec 2021 10:14:09 -0600 -Subject: [PATCH 5/6] merge: aufs +Date: Tue, 25 Jan 2022 09:29:36 -0600 +Subject: [PATCH 5/5] merge: aufs -https://github.com/sfjro/aufs5-standalone/commit/fcc56866b84d43fd03d9e1d91d52f40e8a9d5335 +https://github.com/sfjro/aufs5-standalone/commit/41291d0ae20e0a803a7d9c6cccd1bb3525f6439b Signed-off-by: Robert Nelson <robertcnelson@gmail.com> --- Documentation/ABI/testing/debugfs-aufs | 55 + @@ -54,7 +54,7 @@ Signed-off-by: Robert Nelson <robertcnelson@gmail.com> fs/aufs/finfo.c | 149 ++ fs/aufs/fstype.h | 401 ++++ fs/aufs/hbl.h | 65 + - fs/aufs/hfsnotify.c | 288 +++ + fs/aufs/hfsnotify.c | 289 +++ fs/aufs/hfsplus.c | 60 + fs/aufs/hnotify.c | 715 ++++++ fs/aufs/i_op.c | 1513 +++++++++++++ @@ -98,7 +98,7 @@ Signed-off-by: Robert Nelson <robertcnelson@gmail.com> fs/aufs/xattr.c | 368 ++++ fs/aufs/xino.c | 1926 +++++++++++++++++ include/uapi/linux/aufs_type.h | 452 ++++ - 92 files changed, 37847 insertions(+) + 92 files changed, 37848 insertions(+) create mode 100644 Documentation/ABI/testing/debugfs-aufs create mode 100644 Documentation/ABI/testing/sysfs-aufs create mode 100644 Documentation/filesystems/aufs/README @@ -292,7 +292,7 @@ index 000000000000..48500c0569e6 + will be empty. About XINO files, see the aufs manual. diff --git a/Documentation/filesystems/aufs/README b/Documentation/filesystems/aufs/README new file mode 100644 -index 000000000000..d08a56444e22 +index 000000000000..9c05340b9dda --- /dev/null +++ b/Documentation/filesystems/aufs/README @@ -0,0 +1,396 @@ @@ -667,7 +667,7 @@ index 000000000000..d08a56444e22 +The Parted Magic Project made a donation (2013/9 and 11). +Pavel Barta made a donation (2013/10). +Nikolay Pertsev made a donation (2014/5). -+James B made a donation (2014/7 and 2015/7). ++James B made a donation (2014/7, 2015/7, and 2021/12). +Stefano Di Biase made a donation (2014/8). +Daniel Epellei made a donation (2015/1). +OmegaPhil made a donation (2016/1, 2018/4). @@ -16266,10 +16266,10 @@ index 000000000000..33b6f7da81eb +#endif /* __AUFS_HBL_H__ */ diff --git a/fs/aufs/hfsnotify.c b/fs/aufs/hfsnotify.c new file mode 100644 -index 000000000000..b029fa2085a8 +index 000000000000..597d045a48b4 --- /dev/null +++ b/fs/aufs/hfsnotify.c -@@ -0,0 +1,288 @@ +@@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2005-2021 Junjiro R. Okajima @@ -16442,7 +16442,8 @@ index 000000000000..b029fa2085a8 + struct inode *h_dir, *h_inode; + struct fsnotify_mark *inode_mark; + -+ AuDebugOn(data_type != FSNOTIFY_EVENT_INODE); ++ AuDebugOn(!(data_type == FSNOTIFY_EVENT_INODE ++ || data_type == FSNOTIFY_EVENT_DENTRY)); + + err = 0; + /* if FS_UNMOUNT happens, there must be another bug */ @@ -28767,7 +28768,7 @@ index 000000000000..91e62d79e099 +} diff --git a/fs/aufs/super.c b/fs/aufs/super.c new file mode 100644 -index 000000000000..666e1837ffbb +index 000000000000..e67ffdb3797e --- /dev/null +++ b/fs/aufs/super.c @@ -0,0 +1,1050 @@ @@ -29226,7 +29227,7 @@ index 000000000000..666e1837ffbb + continue; + + h_sb = au_sbr_sb(sb, bindex); -+ e = vfsub_sync_filesystem(h_sb, wait); ++ e = vfsub_sync_filesystem(h_sb); + if (unlikely(e && !err)) + err = e; + /* go on even if an error happens */ @@ -32060,7 +32061,7 @@ index 000000000000..a3f709ee7475 +} diff --git a/fs/aufs/vfsub.c b/fs/aufs/vfsub.c new file mode 100644 -index 000000000000..f6e27fbcf584 +index 000000000000..955c8cea646b --- /dev/null +++ b/fs/aufs/vfsub.c @@ -0,0 +1,919 @@ @@ -32103,13 +32104,13 @@ index 000000000000..f6e27fbcf584 +} +#endif + -+int vfsub_sync_filesystem(struct super_block *h_sb, int wait) ++int vfsub_sync_filesystem(struct super_block *h_sb) +{ + int err; + + lockdep_off(); + down_read(&h_sb->s_umount); -+ err = __sync_filesystem(h_sb, wait); ++ err = sync_filesystem(h_sb); + up_read(&h_sb->s_umount); + lockdep_on(); + @@ -32985,7 +32986,7 @@ index 000000000000..f6e27fbcf584 +} diff --git a/fs/aufs/vfsub.h b/fs/aufs/vfsub.h new file mode 100644 -index 000000000000..0c76f421b95f +index 000000000000..dca38d84b626 --- /dev/null +++ b/fs/aufs/vfsub.h @@ -0,0 +1,358 @@ @@ -33075,7 +33076,7 @@ index 000000000000..0c76f421b95f +AuStubInt0(vfsub_test_mntns, struct vfsmount *mnt, struct super_block *h_sb); +#endif + -+int vfsub_sync_filesystem(struct super_block *h_sb, int wait); ++int vfsub_sync_filesystem(struct super_block *h_sb); + +/* ---------------------------------------------------------------------- */ + @@ -38135,7 +38136,7 @@ index 000000000000..e6683663885b +} diff --git a/include/uapi/linux/aufs_type.h b/include/uapi/linux/aufs_type.h new file mode 100644 -index 000000000000..5219a4d117b3 +index 000000000000..d4c1fabad7a7 --- /dev/null +++ b/include/uapi/linux/aufs_type.h @@ -0,0 +1,452 @@ @@ -38181,7 +38182,7 @@ index 000000000000..5219a4d117b3 +#include <limits.h> +#endif /* __KERNEL__ */ + -+#define AUFS_VERSION "5.15.5-20211129" ++#define AUFS_VERSION "5.16-20220117" + +/* todo? move this to linux-2.6.19/include/magic.h */ +#define AUFS_SUPER_MAGIC ('a' << 24 | 'u' << 16 | 'f' << 8 | 's') diff --git a/patches/bcfserial/0001-merge-bcfserial-https-github.com-statropy-bcfserial.patch b/patches/bcfserial/0001-merge-bcfserial-https-github.com-statropy-bcfserial.patch index 875a31c98..c99bfb8d2 100644 --- a/patches/bcfserial/0001-merge-bcfserial-https-github.com-statropy-bcfserial.patch +++ b/patches/bcfserial/0001-merge-bcfserial-https-github.com-statropy-bcfserial.patch @@ -1,6 +1,6 @@ -From e4502c2b1db738244393c92dcede021a504bdac1 Mon Sep 17 00:00:00 2001 +From 48ea340f91a8c31ae5601a1e31d0ed8c2f711084 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Mon, 10 Jan 2022 15:16:45 -0600 +Date: Tue, 25 Jan 2022 09:30:40 -0600 Subject: [PATCH] merge: bcfserial: https://github.com/statropy/bcfserial https://github.com/statropy/bcfserial/commit/aded88429a8a00143596b41f4c1f50d9ae3d4069 diff --git a/patches/defconfig b/patches/defconfig index 6eed1a195..716042520 100644 --- a/patches/defconfig +++ b/patches/defconfig @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/arm 5.16.0 Kernel Configuration +# Linux/arm 5.16.2 Kernel Configuration # CONFIG_CC_VERSION_TEXT="arm-linux-gnueabi-gcc (GCC) 11.1.0" CONFIG_CC_IS_GCC=y @@ -7163,6 +7163,23 @@ CONFIG_ROMFS_ON_MTD=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set # CONFIG_EROFS_FS is not set +CONFIG_AUFS_FS=m +CONFIG_AUFS_BRANCH_MAX_127=y +# CONFIG_AUFS_BRANCH_MAX_511 is not set +# CONFIG_AUFS_BRANCH_MAX_1023 is not set +# CONFIG_AUFS_BRANCH_MAX_32767 is not set +CONFIG_AUFS_SBILIST=y +# CONFIG_AUFS_HNOTIFY is not set +CONFIG_AUFS_EXPORT=y +CONFIG_AUFS_XATTR=y +# CONFIG_AUFS_FHSM is not set +# CONFIG_AUFS_RDU is not set +# CONFIG_AUFS_DIRREN is not set +# CONFIG_AUFS_SHWH is not set +# CONFIG_AUFS_BR_RAMFS is not set +# CONFIG_AUFS_BR_FUSE is not set +CONFIG_AUFS_BDEV_LOOP=y +# CONFIG_AUFS_DEBUG is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V2=y diff --git a/patches/git/AUFS b/patches/git/AUFS index f4a5297b6..3f4560b27 100644 --- a/patches/git/AUFS +++ b/patches/git/AUFS @@ -1 +1 @@ -AUFS: https://github.com/sfjro/aufs5-standalone/commit/03a3ee3199ae847b9e5ac7596900c64fc17cfd5a +AUFS: https://github.com/sfjro/aufs5-standalone/commit/41291d0ae20e0a803a7d9c6cccd1bb3525f6439b diff --git a/patches/ref_omap2plus_defconfig b/patches/ref_omap2plus_defconfig index 8a2149ab2..cf18ac017 100644 --- a/patches/ref_omap2plus_defconfig +++ b/patches/ref_omap2plus_defconfig @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/arm 5.16.0 Kernel Configuration +# Linux/arm 5.16.2 Kernel Configuration # CONFIG_CC_VERSION_TEXT="arm-linux-gnueabi-gcc (GCC) 11.1.0" CONFIG_CC_IS_GCC=y @@ -6565,6 +6565,7 @@ CONFIG_CRAMFS_BLOCKDEV=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set # CONFIG_EROFS_FS is not set +# CONFIG_AUFS_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V2=y diff --git a/patches/rt/0001-merge-CONFIG_PREEMPT_RT-Patch-Set.patch b/patches/rt/0001-merge-CONFIG_PREEMPT_RT-Patch-Set.patch index fa54e4183..e30ec554b 100644 --- a/patches/rt/0001-merge-CONFIG_PREEMPT_RT-Patch-Set.patch +++ b/patches/rt/0001-merge-CONFIG_PREEMPT_RT-Patch-Set.patch @@ -1,285 +1,216 @@ -From ad6fffd861ddb8f59a92bef3121b4aacc3363656 Mon Sep 17 00:00:00 2001 +From 612e116498198779d63b7bdbb1d7f94c8aae9b1b Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Fri, 31 Dec 2021 10:15:06 -0600 +Date: Tue, 25 Jan 2022 09:30:58 -0600 Subject: [PATCH] merge: CONFIG_PREEMPT_RT Patch Set -patch-5.15.10-rt24.patch.xz +patch-5.16.2-rt19.patch.xz Signed-off-by: Robert Nelson <robertcnelson@gmail.com> --- - Documentation/dev-tools/kcov.rst | 5 + - arch/alpha/include/asm/spinlock_types.h | 2 +- - arch/arm/Kconfig | 5 +- - arch/arm/include/asm/spinlock_types.h | 2 +- - arch/arm/include/asm/thread_info.h | 6 +- - arch/arm/kernel/asm-offsets.c | 1 + - arch/arm/kernel/entry-armv.S | 19 +- - arch/arm/kernel/signal.c | 3 +- - arch/arm/kernel/smp.c | 2 - - arch/arm/mm/fault.c | 6 + - arch/arm64/Kconfig | 3 + - arch/arm64/include/asm/pgtable.h | 2 +- - arch/arm64/include/asm/preempt.h | 25 +- - arch/arm64/include/asm/signal.h | 4 + - arch/arm64/include/asm/spinlock_types.h | 2 +- - arch/arm64/include/asm/thread_info.h | 8 +- - arch/arm64/kernel/asm-offsets.c | 1 + - arch/arm64/kernel/fpsimd.c | 23 +- - arch/arm64/kernel/signal.c | 10 +- - arch/arm64/kvm/arm.c | 6 +- - arch/csky/include/asm/spinlock_types.h | 2 +- - arch/hexagon/include/asm/spinlock_types.h | 2 +- - arch/ia64/include/asm/spinlock_types.h | 2 +- - arch/powerpc/Kconfig | 3 + - .../include/asm/simple_spinlock_types.h | 2 +- - arch/powerpc/include/asm/smp.h | 1 + - arch/powerpc/include/asm/spinlock_types.h | 2 +- - arch/powerpc/include/asm/stackprotector.h | 4 + - arch/powerpc/include/asm/thread_info.h | 7 + - arch/powerpc/kernel/interrupt.c | 8 +- - arch/powerpc/kernel/irq.c | 4 + - arch/powerpc/kernel/kgdb.c | 10 +- - arch/powerpc/kernel/smp.c | 5 + - arch/powerpc/kernel/traps.c | 7 +- - arch/powerpc/kexec/crash.c | 3 - - arch/powerpc/kvm/Kconfig | 1 + - arch/powerpc/platforms/pseries/iommu.c | 31 +- - arch/riscv/include/asm/spinlock_types.h | 2 +- - arch/s390/include/asm/spinlock_types.h | 2 +- - arch/sh/include/asm/spinlock_types.h | 2 +- - arch/sh/kernel/irq.c | 2 + - arch/sparc/kernel/irq_64.c | 2 + - arch/x86/Kconfig | 2 + - arch/x86/include/asm/irq_stack.h | 3 + - arch/x86/include/asm/preempt.h | 33 +- - arch/x86/include/asm/signal.h | 13 + - arch/x86/include/asm/stackprotector.h | 8 +- - arch/x86/include/asm/thread_info.h | 5 + - arch/x86/kernel/cpu/mshyperv.c | 3 +- - arch/x86/kernel/dumpstack_32.c | 2 +- - arch/x86/kernel/dumpstack_64.c | 3 +- - arch/x86/kernel/i8259.c | 3 +- - arch/x86/kernel/irq_32.c | 2 + - arch/x86/kernel/kgdb.c | 9 +- - arch/x86/kernel/unwind_frame.c | 16 +- - arch/x86/kernel/unwind_orc.c | 2 +- - arch/x86/kvm/x86.c | 8 + - arch/xtensa/include/asm/spinlock_types.h | 2 +- - block/blk-mq.c | 6 +- - crypto/cryptd.c | 19 +- - crypto/testmgr.c | 4 +- - drivers/block/zram/zram_drv.c | 36 + - drivers/block/zram/zram_drv.h | 1 + - drivers/char/random.c | 16 +- - drivers/char/tpm/tpm_tis.c | 29 +- - drivers/firmware/efi/efi.c | 5 +- - drivers/gpu/drm/i915/display/intel_crtc.c | 15 +- - drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 5 +- - drivers/gpu/drm/i915/gt/intel_context.h | 3 +- - drivers/gpu/drm/i915/gt/intel_context_types.h | 1 + - drivers/gpu/drm/i915/gt/intel_engine_pm.c | 38 +- - .../drm/i915/gt/intel_execlists_submission.c | 17 +- - drivers/gpu/drm/i915/i915_irq.c | 6 +- - drivers/gpu/drm/i915/i915_request.c | 2 - - drivers/gpu/drm/i915/i915_request.h | 3 +- - drivers/gpu/drm/i915/i915_trace.h | 6 +- - drivers/gpu/drm/i915/i915_utils.h | 2 +- - drivers/hv/hyperv_vmbus.h | 1 + - drivers/hv/vmbus_drv.c | 5 +- - drivers/leds/trigger/Kconfig | 1 + - drivers/md/raid5.c | 7 +- - drivers/md/raid5.h | 1 + - .../net/ethernet/netronome/nfp/abm/qdisc.c | 2 +- - drivers/scsi/fcoe/fcoe.c | 16 +- - drivers/scsi/fcoe/fcoe_ctlr.c | 4 +- - drivers/scsi/libfc/fc_exch.c | 4 +- - drivers/tty/serial/8250/8250.h | 47 +- - drivers/tty/serial/8250/8250_core.c | 17 +- - drivers/tty/serial/8250/8250_fsl.c | 9 + - drivers/tty/serial/8250/8250_ingenic.c | 7 + - drivers/tty/serial/8250/8250_mtk.c | 29 +- - drivers/tty/serial/8250/8250_port.c | 92 +- - drivers/tty/serial/amba-pl011.c | 17 +- - drivers/tty/serial/omap-serial.c | 12 +- - drivers/virt/acrn/irqfd.c | 1 - - fs/afs/dir_silly.c | 2 +- - fs/cifs/readdir.c | 2 +- - fs/dcache.c | 37 +- - fs/fscache/internal.h | 1 - - fs/fscache/main.c | 6 - - fs/fscache/object.c | 13 +- - fs/fuse/readdir.c | 2 +- - fs/namei.c | 4 +- - fs/namespace.c | 20 +- - fs/nfs/dir.c | 4 +- - fs/nfs/unlink.c | 4 +- - fs/proc/base.c | 3 +- - fs/proc/proc_sysctl.c | 2 +- - include/asm-generic/softirq_stack.h | 2 +- - include/linux/console.h | 19 + - include/linux/dcache.h | 4 +- - include/linux/entry-common.h | 8 +- - include/linux/irq_work.h | 8 + - include/linux/irqdesc.h | 1 + - include/linux/irqflags.h | 23 +- - include/linux/kernel.h | 13 +- - include/linux/kgdb.h | 3 + - include/linux/mm_types.h | 4 + - include/linux/netdevice.h | 4 - - include/linux/nfs_xdr.h | 2 +- - include/linux/preempt.h | 68 +- - include/linux/printk.h | 88 +- - include/linux/random.h | 2 +- - include/linux/ratelimit_types.h | 2 +- - include/linux/rcupdate.h | 7 + - include/linux/rtmutex.h | 9 + - include/linux/sched.h | 169 ++- - include/linux/sched/mm.h | 20 + - include/linux/serial_8250.h | 5 + - include/linux/skbuff.h | 7 + - include/linux/smp.h | 3 + - include/linux/spinlock_types_up.h | 2 +- - include/linux/suspend.h | 10 +- - include/linux/thread_info.h | 12 +- - include/linux/trace_events.h | 5 +- - include/linux/u64_stats_sync.h | 52 +- - include/net/act_api.h | 10 +- - include/net/gen_stats.h | 59 +- - include/net/netfilter/xt_rateest.h | 2 +- - include/net/pkt_cls.h | 4 +- - include/net/sch_generic.h | 78 +- - init/Kconfig | 5 +- - init/main.c | 1 + - kernel/Kconfig.preempt | 6 + - kernel/cgroup/rstat.c | 5 +- - kernel/debug/debug_core.c | 45 +- - kernel/debug/kdb/kdb_io.c | 18 +- - kernel/entry/common.c | 12 +- - kernel/exit.c | 7 + - kernel/fork.c | 18 +- - kernel/irq/handle.c | 10 +- - kernel/irq/manage.c | 12 +- - kernel/irq/spurious.c | 8 + - kernel/irq_work.c | 130 ++- - kernel/kcov.c | 36 +- - kernel/kprobes.c | 8 +- - kernel/ksysfs.c | 12 + - kernel/kthread.c | 16 +- - kernel/locking/lockdep.c | 2 + - kernel/locking/rtmutex.c | 20 +- - kernel/locking/rtmutex_api.c | 30 +- - kernel/locking/spinlock_rt.c | 23 +- - kernel/panic.c | 30 +- - kernel/power/main.c | 10 +- - kernel/printk/Makefile | 1 - - kernel/printk/internal.h | 36 - - kernel/printk/printk.c | 969 +++++++++--------- - kernel/printk/printk_safe.c | 52 - - kernel/ptrace.c | 38 +- - kernel/rcu/tasks.h | 9 +- - kernel/rcu/tree.c | 7 +- - kernel/sched/core.c | 180 +++- - kernel/sched/deadline.c | 2 +- - kernel/sched/fair.c | 21 +- - kernel/sched/features.h | 8 + - kernel/sched/psi.c | 14 +- - kernel/sched/rt.c | 2 +- - kernel/sched/sched.h | 9 + - kernel/sched/swait.c | 1 + - kernel/sched/topology.c | 2 +- - kernel/signal.c | 36 +- - kernel/smp.c | 14 +- - kernel/time/clockevents.c | 9 +- - kernel/time/ntp.c | 14 +- - kernel/time/timekeeping.c | 30 +- - kernel/time/timekeeping_debug.c | 2 +- - kernel/trace/trace.c | 46 +- - kernel/trace/trace_events.c | 1 + - kernel/trace/trace_output.c | 14 +- - kernel/workqueue.c | 4 - - lib/bug.c | 1 + - lib/dump_stack.c | 4 +- - lib/irq_poll.c | 2 + - lib/locking-selftest.c | 170 ++- - lib/nmi_backtrace.c | 4 +- - lib/ratelimit.c | 4 +- - lib/scatterlist.c | 11 +- - mm/Kconfig | 2 +- - mm/memory.c | 2 +- - mm/page_alloc.c | 4 +- - mm/vmalloc.c | 10 +- - mm/workingset.c | 5 +- - mm/zsmalloc.c | 84 +- - net/Kconfig | 2 +- - net/core/dev.c | 33 +- - net/core/gen_estimator.c | 52 +- - net/core/gen_stats.c | 186 ++-- - net/ipv4/inet_hashtables.c | 19 +- - net/ipv6/inet6_hashtables.c | 5 +- - net/netfilter/xt_RATEEST.c | 7 +- - net/sched/act_api.c | 21 +- - net/sched/act_bpf.c | 2 +- - net/sched/act_ife.c | 4 +- - net/sched/act_mpls.c | 2 +- - net/sched/act_police.c | 4 +- - net/sched/act_sample.c | 2 +- - net/sched/act_simple.c | 3 +- - net/sched/act_skbedit.c | 2 +- - net/sched/act_skbmod.c | 2 +- - net/sched/sch_api.c | 18 +- - net/sched/sch_atm.c | 6 +- - net/sched/sch_cbq.c | 15 +- - net/sched/sch_drr.c | 13 +- - net/sched/sch_ets.c | 17 +- - net/sched/sch_generic.c | 13 +- - net/sched/sch_gred.c | 65 +- - net/sched/sch_hfsc.c | 11 +- - net/sched/sch_htb.c | 43 +- - net/sched/sch_mq.c | 30 +- - net/sched/sch_mqprio.c | 63 +- - net/sched/sch_multiq.c | 3 +- - net/sched/sch_prio.c | 4 +- - net/sched/sch_qfq.c | 13 +- - net/sched/sch_taprio.c | 2 +- - net/sunrpc/svc_xprt.c | 4 +- - samples/kfifo/bytestream-example.c | 12 +- - samples/kfifo/inttype-example.c | 12 +- - samples/kfifo/record-example.c | 12 +- - security/smack/smack_lsm.c | 9 +- - sound/soc/mediatek/common/mtk-afe-fe-dai.c | 1 - - 240 files changed, 2915 insertions(+), 1729 deletions(-) - delete mode 100644 kernel/printk/printk_safe.c + .../admin-guide/cgroup-v1/memory.rst | 2 + + arch/alpha/include/asm/spinlock_types.h | 2 +- + arch/arm/Kconfig | 5 +- + arch/arm/include/asm/spinlock_types.h | 2 +- + arch/arm/include/asm/thread_info.h | 6 +- + arch/arm/kernel/asm-offsets.c | 1 + + arch/arm/kernel/entry-armv.S | 19 +- + arch/arm/kernel/signal.c | 3 +- + arch/arm/mm/fault.c | 6 + + arch/arm64/Kconfig | 2 + + arch/arm64/include/asm/pgtable.h | 2 +- + arch/arm64/include/asm/preempt.h | 25 +- + arch/arm64/include/asm/signal.h | 4 + + arch/arm64/include/asm/spinlock_types.h | 2 +- + arch/arm64/include/asm/thread_info.h | 8 +- + arch/arm64/kernel/asm-offsets.c | 1 + + arch/arm64/kernel/fpsimd.c | 25 +- + arch/arm64/kernel/signal.c | 10 +- + arch/arm64/kvm/arm.c | 6 +- + arch/csky/include/asm/spinlock_types.h | 2 +- + arch/hexagon/include/asm/spinlock_types.h | 2 +- + arch/ia64/include/asm/spinlock_types.h | 2 +- + arch/ia64/include/asm/thread_info.h | 6 +- + arch/powerpc/Kconfig | 3 + + .../include/asm/simple_spinlock_types.h | 2 +- + arch/powerpc/include/asm/spinlock_types.h | 2 +- + arch/powerpc/include/asm/stackprotector.h | 4 + + arch/powerpc/include/asm/thread_info.h | 8 + + arch/powerpc/kernel/interrupt.c | 8 +- + arch/powerpc/kernel/irq.c | 4 + + arch/powerpc/kernel/traps.c | 7 +- + arch/powerpc/kvm/Kconfig | 1 + + arch/powerpc/platforms/pseries/iommu.c | 31 +- + arch/riscv/include/asm/spinlock_types.h | 2 +- + arch/s390/include/asm/spinlock_types.h | 2 +- + arch/sh/include/asm/spinlock_types.h | 2 +- + arch/sh/kernel/irq.c | 2 + + arch/sparc/kernel/irq_64.c | 2 + + arch/x86/Kconfig | 2 + + arch/x86/include/asm/pgtable.h | 1 + + arch/x86/include/asm/preempt.h | 33 +- + arch/x86/include/asm/signal.h | 13 + + arch/x86/include/asm/thread_info.h | 5 + + arch/x86/kernel/cpu/mshyperv.c | 2 +- + arch/x86/kvm/x86.c | 6 + + arch/xtensa/include/asm/spinlock_types.h | 2 +- + block/blk-mq.c | 6 +- + crypto/cryptd.c | 19 +- + drivers/block/zram/zram_drv.c | 36 + + drivers/block/zram/zram_drv.h | 1 + + drivers/char/random.c | 91 +- + drivers/char/tpm/tpm_tis.c | 29 +- + drivers/gpu/drm/i915/display/intel_crtc.c | 15 +- + drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 5 +- + drivers/gpu/drm/i915/gt/intel_context.h | 3 +- + drivers/gpu/drm/i915/gt/intel_context_types.h | 1 + + drivers/gpu/drm/i915/gt/intel_engine_pm.c | 38 +- + .../drm/i915/gt/intel_execlists_submission.c | 17 +- + drivers/gpu/drm/i915/i915_irq.c | 6 +- + drivers/gpu/drm/i915/i915_request.c | 2 - + drivers/gpu/drm/i915/i915_request.h | 3 +- + drivers/gpu/drm/i915/i915_trace.h | 6 +- + drivers/gpu/drm/i915/i915_utils.h | 2 +- + drivers/hv/vmbus_drv.c | 2 +- + drivers/i2c/busses/i2c-cht-wc.c | 11 +- + drivers/i2c/i2c-core-base.c | 2 +- + drivers/md/raid5.c | 7 +- + drivers/md/raid5.h | 1 + + drivers/mfd/ezx-pcap.c | 4 +- + drivers/misc/hi6421v600-irq.c | 6 +- + drivers/mmc/core/block.c | 22 +- + drivers/net/usb/lan78xx.c | 7 +- + drivers/scsi/fcoe/fcoe.c | 16 +- + drivers/scsi/fcoe/fcoe_ctlr.c | 4 +- + drivers/scsi/libfc/fc_exch.c | 4 +- + drivers/staging/greybus/gpio.c | 5 +- + drivers/tty/serial/8250/8250.h | 47 +- + drivers/tty/serial/8250/8250_core.c | 17 +- + drivers/tty/serial/8250/8250_fsl.c | 9 + + drivers/tty/serial/8250/8250_ingenic.c | 7 + + drivers/tty/serial/8250/8250_mtk.c | 29 +- + drivers/tty/serial/8250/8250_port.c | 92 +- + drivers/tty/serial/8250/Kconfig | 1 + + drivers/tty/serial/amba-pl011.c | 17 +- + drivers/tty/serial/omap-serial.c | 12 +- + drivers/tty/vt/vt.c | 2 +- + drivers/virt/acrn/irqfd.c | 1 - + fs/afs/dir_silly.c | 2 +- + fs/cifs/readdir.c | 2 +- + fs/dcache.c | 37 +- + fs/fscache/internal.h | 1 - + fs/fscache/main.c | 6 - + fs/fscache/object.c | 13 +- + fs/fuse/readdir.c | 2 +- + fs/namei.c | 4 +- + fs/namespace.c | 20 +- + fs/nfs/dir.c | 4 +- + fs/nfs/unlink.c | 4 +- + fs/proc/base.c | 3 +- + fs/proc/proc_sysctl.c | 2 +- + include/asm-generic/softirq_stack.h | 2 +- + include/linux/blk-mq.h | 11 + + include/linux/console.h | 36 +- + include/linux/dcache.h | 4 +- + include/linux/entry-common.h | 8 +- + include/linux/interrupt.h | 16 + + include/linux/irqdesc.h | 1 + + include/linux/irqflags.h | 23 +- + include/linux/local_lock_internal.h | 6 +- + include/linux/netdevice.h | 13 +- + include/linux/nfs_xdr.h | 2 +- + include/linux/preempt.h | 63 +- + include/linux/printk.h | 59 +- + include/linux/random.h | 3 +- + include/linux/ratelimit_types.h | 2 +- + include/linux/rcupdate.h | 7 + + include/linux/rtmutex.h | 9 + + include/linux/rwlock.h | 6 + + include/linux/rwlock_api_smp.h | 8 + + include/linux/rwlock_rt.h | 10 + + include/linux/sched.h | 130 +- + include/linux/sched/task_stack.h | 10 + + include/linux/serial_8250.h | 5 + + include/linux/smp.h | 3 + + include/linux/spinlock_api_up.h | 1 + + include/linux/spinlock_types_up.h | 2 +- + include/linux/thread_info.h | 12 +- + include/linux/trace_events.h | 5 +- + include/linux/u64_stats_sync.h | 42 +- + include/trace/events/net.h | 14 - + init/Kconfig | 4 + + init/main.c | 1 + + kernel/Kconfig.preempt | 6 + + kernel/cgroup/rstat.c | 5 +- + kernel/entry/common.c | 12 +- + kernel/exit.c | 2 + + kernel/fork.c | 262 ++-- + kernel/irq/chip.c | 4 +- + kernel/irq/handle.c | 11 +- + kernel/irq/internals.h | 2 +- + kernel/irq/irqdesc.c | 21 + + kernel/irq/manage.c | 3 + + kernel/ksysfs.c | 12 + + kernel/locking/lockdep.c | 2 + + kernel/locking/rtmutex.c | 5 +- + kernel/locking/rtmutex_api.c | 30 +- + kernel/locking/spinlock.c | 10 + + kernel/locking/spinlock_rt.c | 18 +- + kernel/locking/ww_rt_mutex.c | 2 +- + kernel/panic.c | 25 +- + kernel/printk/printk.c | 1088 +++++++++++---- + kernel/ptrace.c | 38 +- + kernel/rcu/tasks.h | 9 +- + kernel/rcu/tree.c | 7 +- + kernel/sched/core.c | 93 +- + kernel/sched/fair.c | 16 +- + kernel/sched/features.h | 3 + + kernel/sched/sched.h | 9 + + kernel/sched/swait.c | 1 + + kernel/signal.c | 36 +- + kernel/smp.c | 14 +- + kernel/softirq.c | 76 + + kernel/time/hrtimer.c | 4 +- + kernel/time/timer.c | 2 +- + kernel/trace/trace.c | 46 +- + kernel/trace/trace_events.c | 1 + + kernel/trace/trace_output.c | 14 +- + lib/dump_stack.c | 4 +- + lib/irq_poll.c | 2 + + lib/locking-selftest.c | 172 ++- + lib/nmi_backtrace.c | 4 +- + mm/memcontrol.c | 1241 +++++++++-------- + mm/vmalloc.c | 10 +- + mm/workingset.c | 5 +- + mm/zsmalloc.c | 529 +++---- + net/core/dev.c | 139 +- + net/core/link_watch.c | 4 +- + net/core/rtnetlink.c | 8 +- + net/hsr/hsr_device.c | 6 +- + net/ipv4/inet_hashtables.c | 53 +- + net/ipv6/inet6_hashtables.c | 5 +- + net/sunrpc/svc_xprt.c | 4 +- + 182 files changed, 3652 insertions(+), 1933 deletions(-) -diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst -index d2c4c27e1702..d83c9ab49427 100644 ---- a/Documentation/dev-tools/kcov.rst -+++ b/Documentation/dev-tools/kcov.rst -@@ -50,6 +50,7 @@ program using kcov: - #include <sys/mman.h> - #include <unistd.h> - #include <fcntl.h> -+ #include <linux/types.h> - - #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) - #define KCOV_ENABLE _IO('c', 100) -@@ -177,6 +178,8 @@ Comparison operands collection is similar to coverage collection: - /* Read number of comparisons collected. */ - n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); - for (i = 0; i < n; i++) { -+ uint64_t ip; -+ - type = cover[i * KCOV_WORDS_PER_CMP + 1]; - /* arg1 and arg2 - operands of the comparison. */ - arg1 = cover[i * KCOV_WORDS_PER_CMP + 2]; -@@ -251,6 +254,8 @@ selectively from different subsystems. - - .. code-block:: c - -+ /* Same includes and defines as above. */ -+ - struct kcov_remote_arg { - __u32 trace_mode; - __u32 area_size; +diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst +index faac50149a22..2cc502a75ef6 100644 +--- a/Documentation/admin-guide/cgroup-v1/memory.rst ++++ b/Documentation/admin-guide/cgroup-v1/memory.rst +@@ -64,6 +64,7 @@ Brief summary of control files. + threads + cgroup.procs show list of processes + cgroup.event_control an interface for event_fd() ++ This knob is not available on CONFIG_PREEMPT_RT systems. + memory.usage_in_bytes show current usage for memory + (See 5.5 for details) + memory.memsw.usage_in_bytes show current usage for memory+Swap +@@ -75,6 +76,7 @@ Brief summary of control files. + memory.max_usage_in_bytes show max memory usage recorded + memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded + memory.soft_limit_in_bytes set/show soft limit of memory usage ++ This knob is not available on CONFIG_PREEMPT_RT systems. + memory.stat show various statistics + memory.use_hierarchy set/show hierarchical account enabled + This knob is deprecated and shouldn't be diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h index 1d5716bc060b..2526fd3be5fd 100644 --- a/arch/alpha/include/asm/spinlock_types.h @@ -294,10 +225,10 @@ index 1d5716bc060b..2526fd3be5fd 100644 #endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig -index 4ebd512043be..5ac2009727bd 100644 +index c2724d986fa0..7496417526be 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig -@@ -32,6 +32,7 @@ config ARM +@@ -33,6 +33,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE @@ -329,7 +260,7 @@ index 4ebd512043be..5ac2009727bd 100644 + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select RTC_LIB select SYS_SUPPORTS_APM_EMULATION - select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M + select THREAD_INFO_IN_TASK if CURRENT_POINTER_IN_TPIDRURO diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h index 5976958647fe..0c14b36ef101 100644 --- a/arch/arm/include/asm/spinlock_types.h @@ -344,18 +275,18 @@ index 5976958647fe..0c14b36ef101 100644 #endif diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h -index 9a18da3e10cc..2fa63d96a4f0 100644 +index 164e15f26485..666da94ed9b7 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h -@@ -52,6 +52,7 @@ struct cpu_context_save { +@@ -54,6 +54,7 @@ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + #ifndef CONFIG_THREAD_INFO_IN_TASK struct task_struct *task; /* main task structure */ - __u32 cpu; /* cpu */ - __u32 cpu_domain; /* cpu domain */ -@@ -134,6 +135,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #endif +@@ -152,6 +153,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ @@ -363,7 +294,7 @@ index 9a18da3e10cc..2fa63d96a4f0 100644 #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ -@@ -148,6 +150,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, +@@ -166,6 +168,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) @@ -371,7 +302,7 @@ index 9a18da3e10cc..2fa63d96a4f0 100644 #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) /* Checks for any syscall work in entry-common.S */ -@@ -157,7 +160,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, +@@ -175,7 +178,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, /* * Change these and you break ASM code in entry-common.S */ @@ -382,7 +313,7 @@ index 9a18da3e10cc..2fa63d96a4f0 100644 _TIF_NOTIFY_SIGNAL) diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c -index a646a3f6440f..beb09d74684f 100644 +index 645845e4982a..73e321c6d152 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -43,6 +43,7 @@ int main(void) @@ -390,14 +321,14 @@ index a646a3f6440f..beb09d74684f 100644 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); + #ifndef CONFIG_THREAD_INFO_IN_TASK DEFINE(TI_TASK, offsetof(struct thread_info, task)); - DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); - DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); + #endif diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S -index 241b73d64df7..f3a9dd2e98c6 100644 +index 5cd057859fe9..4db90d80d175 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S -@@ -206,11 +206,18 @@ __irq_svc: +@@ -203,11 +203,18 @@ __irq_svc: #ifdef CONFIG_PREEMPTION ldr r8, [tsk, #TI_PREEMPT] @ get preempt count @@ -418,7 +349,7 @@ index 241b73d64df7..f3a9dd2e98c6 100644 #endif svc_exit r5, irq = 1 @ return from exception -@@ -225,8 +232,14 @@ svc_preempt: +@@ -222,8 +229,14 @@ svc_preempt: 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED @@ -448,25 +379,11 @@ index a41e27ace391..1e29cec7716f 100644 schedule(); } else { if (unlikely(!user_mode(regs))) -diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c -index 842427ff2b3c..b943e2df9540 100644 ---- a/arch/arm/kernel/smp.c -+++ b/arch/arm/kernel/smp.c -@@ -667,9 +667,7 @@ static void do_handle_IPI(int ipinr) - break; - - case IPI_CPU_BACKTRACE: -- printk_deferred_enter(); - nmi_cpu_backtrace(get_irq_regs()); -- printk_deferred_exit(); - break; - - default: diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c -index efa402025031..59487ee9fd61 100644 +index bc8779d54a64..12dba4284b21 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c -@@ -400,6 +400,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, +@@ -407,6 +407,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); @@ -476,7 +393,7 @@ index efa402025031..59487ee9fd61 100644 if (user_mode(regs)) goto bad_area; -@@ -470,6 +473,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, +@@ -477,6 +480,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -487,14 +404,14 @@ index efa402025031..59487ee9fd61 100644 return 0; } diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index fee914c716aa..aeaa2c2ecc23 100644 +index c4207cf9bb17..260866cf53c9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig -@@ -88,6 +88,7 @@ config ARM64 +@@ -89,6 +89,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING -+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK ++ select ARCH_SUPPORTS_RT select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT @@ -503,19 +420,11 @@ index fee914c716aa..aeaa2c2ecc23 100644 select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_PREEMPT_LAZY + select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUTEX_CMPXCHG if FUTEX - select MMU_GATHER_RCU_TABLE_FREE -@@ -213,6 +215,7 @@ config ARM64 - select PCI_DOMAINS_GENERIC if PCI - select PCI_ECAM if (ACPI && PCI) - select PCI_SYSCALL if PCI -+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM - select POWER_RESET - select POWER_SUPPLY - select SPARSE_IRQ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h -index 72f95c6a7051..30fe8c324ce6 100644 +index c4ba047a82d2..7c83a6655d1c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1001,7 +1001,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, @@ -596,7 +505,7 @@ index 18782f0c4721..11ab1c077697 100644 #endif diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h -index 6623c99f0984..c55ccec33a5a 100644 +index e1317b7c4525..861594d9662d 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -26,6 +26,7 @@ struct thread_info { @@ -607,7 +516,7 @@ index 6623c99f0984..c55ccec33a5a 100644 union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { -@@ -67,6 +68,7 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_struct *dst, #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ @@ -615,7 +524,7 @@ index 6623c99f0984..c55ccec33a5a 100644 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ -@@ -97,8 +99,10 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -98,8 +100,10 @@ int arch_dup_task_struct(struct task_struct *dst, #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) @@ -627,7 +536,7 @@ index 6623c99f0984..c55ccec33a5a 100644 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ _TIF_NOTIFY_SIGNAL) -@@ -107,6 +111,8 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -108,6 +112,8 @@ int arch_dup_task_struct(struct task_struct *dst, _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) @@ -637,11 +546,11 @@ index 6623c99f0984..c55ccec33a5a 100644 #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c -index 551427ae8cc5..96a4f6c9eb78 100644 +index 6d0c3afd36b8..9b11f996b8ea 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c -@@ -31,6 +31,7 @@ int main(void) - BLANK(); +@@ -32,6 +32,7 @@ int main(void) + DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); @@ -649,10 +558,10 @@ index 551427ae8cc5..96a4f6c9eb78 100644 DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c -index ff4962750b3d..99484e8bbade 100644 +index fa244c426f61..38ebf3382002 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c -@@ -179,10 +179,19 @@ static void __get_cpu_fpsimd_context(void) +@@ -201,10 +201,19 @@ static void __get_cpu_fpsimd_context(void) * * The double-underscore version must only be called if you know the task * can't be preempted. @@ -673,7 +582,7 @@ index ff4962750b3d..99484e8bbade 100644 __get_cpu_fpsimd_context(); } -@@ -203,7 +212,10 @@ static void __put_cpu_fpsimd_context(void) +@@ -225,7 +234,10 @@ static void __put_cpu_fpsimd_context(void) static void put_cpu_fpsimd_context(void) { __put_cpu_fpsimd_context(); @@ -685,15 +594,16 @@ index ff4962750b3d..99484e8bbade 100644 } static bool have_cpu_fpsimd_context(void) -@@ -1033,6 +1045,7 @@ void fpsimd_thread_switch(struct task_struct *next) +@@ -1125,6 +1137,8 @@ static void fpsimd_flush_thread_vl(enum vec_type type) + void fpsimd_flush_thread(void) { - int vl, supported_vl; + void *sve_state = NULL; - ++ if (!system_supports_fpsimd()) return; -@@ -1045,7 +1058,10 @@ void fpsimd_flush_thread(void) + +@@ -1136,11 +1150,16 @@ void fpsimd_flush_thread(void) if (system_supports_sve()) { clear_thread_flag(TIF_SVE); @@ -702,10 +612,8 @@ index ff4962750b3d..99484e8bbade 100644 + /* Defer kfree() while in atomic context */ + sve_state = current->thread.sve_state; + current->thread.sve_state = NULL; - - /* - * Reset the task vector length as required. -@@ -1079,6 +1095,7 @@ void fpsimd_flush_thread(void) ++ + fpsimd_flush_thread_vl(ARM64_VEC_SVE); } put_cpu_fpsimd_context(); @@ -714,7 +622,7 @@ index ff4962750b3d..99484e8bbade 100644 /* diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c -index c287b9407f28..1d65f2801e13 100644 +index 8f6372b44b65..ab23598fdeb0 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -920,7 +920,7 @@ static void do_signal(struct pt_regs *regs) @@ -742,10 +650,10 @@ index c287b9407f28..1d65f2801e13 100644 uprobe_notify_resume(regs); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c -index 9b328bb05596..12b29d851012 100644 +index e4727dc771bf..08508fc5fa17 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c -@@ -811,7 +811,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +@@ -820,7 +820,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * involves poking the GIC, which must be done in a * non-preemptible context. */ @@ -754,7 +662,7 @@ index 9b328bb05596..12b29d851012 100644 kvm_pmu_flush_hwstate(vcpu); -@@ -835,7 +835,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +@@ -844,7 +844,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); @@ -763,7 +671,7 @@ index 9b328bb05596..12b29d851012 100644 continue; } -@@ -907,7 +907,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +@@ -916,7 +916,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, ret); @@ -811,11 +719,34 @@ index 6e345fefcdca..14b8a161c165 100644 # error "please don't include this file directly" #endif +diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h +index 51d20cb37706..1684716f0820 100644 +--- a/arch/ia64/include/asm/thread_info.h ++++ b/arch/ia64/include/asm/thread_info.h +@@ -55,15 +55,15 @@ struct thread_info { + #ifndef ASM_OFFSETS_C + /* how to get the thread information struct from C */ + #define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE)) +-#define alloc_thread_stack_node(tsk, node) \ ++#define arch_alloc_thread_stack_node(tsk, node) \ + ((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE)) + #define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) + #else + #define current_thread_info() ((struct thread_info *) 0) +-#define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) ++#define arch_alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) + #define task_thread_info(tsk) ((struct thread_info *) 0) + #endif +-#define free_thread_stack(tsk) /* nothing */ ++#define arch_free_thread_stack(tsk) /* nothing */ + #define task_stack_page(tsk) ((void *)(tsk)) + + #define __HAVE_THREAD_FUNCTIONS diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig -index 6b9f523882c5..3eec2a6395e4 100644 +index dea74d7717c0..6255e4d37539 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig -@@ -151,6 +151,7 @@ config PPC +@@ -153,6 +153,7 @@ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x @@ -823,7 +754,7 @@ index 6b9f523882c5..3eec2a6395e4 100644 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST -@@ -219,6 +220,7 @@ config PPC +@@ -221,6 +222,7 @@ config PPC select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_IRQ_TIME_ACCOUNTING @@ -831,7 +762,7 @@ index 6b9f523882c5..3eec2a6395e4 100644 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE select HAVE_KERNEL_LZO if DEFAULT_UIMAGE -@@ -235,6 +237,7 @@ config PPC +@@ -237,6 +239,7 @@ config PPC select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP @@ -852,18 +783,6 @@ index 0f3cdd8faa95..08243338069d 100644 # error "please don't include this file directly" #endif -diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h -index 7ef1cd8168a0..f9e63cacd220 100644 ---- a/arch/powerpc/include/asm/smp.h -+++ b/arch/powerpc/include/asm/smp.h -@@ -62,6 +62,7 @@ struct smp_ops_t { - - extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); - extern int smp_send_safe_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); -+extern void smp_send_debugger_break_cpu(unsigned int cpu); - extern void smp_send_debugger_break(void); - extern void start_secondary_resume(void); - extern void smp_generic_give_timebase(void); diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index c5d742f18021..d5f8a74ed2e8 100644 --- a/arch/powerpc/include/asm/spinlock_types.h @@ -894,19 +813,27 @@ index 1c8460e23583..b1653c160bab 100644 canary ^= LINUX_VERSION_CODE; canary &= CANARY_MASK; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h -index b4ec6c7dd72e..07df83231ec2 100644 +index 5725029aaa29..829315ee9c56 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -47,6 +47,8 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ -+ int preempt_lazy_count; /* 0 => preemptable, ++ int preempt_lazy_count; /* 0 => preemptable, + <0 => BUG */ - unsigned long local_flags; /* private flags for thread */ - #ifdef CONFIG_LIVEPATCH - unsigned long *livepatch_sp; -@@ -93,6 +95,7 @@ void arch_setup_new_exec(void); + #ifdef CONFIG_SMP + unsigned int cpu; + #endif +@@ -71,6 +73,7 @@ struct thread_info { + #define INIT_THREAD_INFO(tsk) \ + { \ + .preempt_count = INIT_PREEMPT_COUNT, \ ++ .preempt_lazy_count = 0, \ + .flags = 0, \ + } + +@@ -96,6 +99,7 @@ void arch_setup_new_exec(void); #define TIF_PATCH_PENDING 6 /* pending live patching update */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SINGLESTEP 8 /* singlestepping active */ @@ -914,7 +841,7 @@ index b4ec6c7dd72e..07df83231ec2 100644 #define TIF_SECCOMP 10 /* secure computing */ #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ #define TIF_NOERROR 12 /* Force successful syscall return */ -@@ -108,6 +111,7 @@ void arch_setup_new_exec(void); +@@ -111,6 +115,7 @@ void arch_setup_new_exec(void); #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_32BIT 20 /* 32 bit binary */ @@ -922,7 +849,7 @@ index b4ec6c7dd72e..07df83231ec2 100644 /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) -@@ -119,6 +123,7 @@ void arch_setup_new_exec(void); +@@ -122,6 +127,7 @@ void arch_setup_new_exec(void); #define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) @@ -930,7 +857,7 @@ index b4ec6c7dd72e..07df83231ec2 100644 #define _TIF_SECCOMP (1<<TIF_SECCOMP) #define _TIF_RESTOREALL (1<<TIF_RESTOREALL) #define _TIF_NOERROR (1<<TIF_NOERROR) -@@ -132,10 +137,12 @@ void arch_setup_new_exec(void); +@@ -135,10 +141,12 @@ void arch_setup_new_exec(void); _TIF_SYSCALL_EMU) #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ @@ -1006,47 +933,6 @@ index c4f1d6b7d992..02e17a57da83 100644 irq_hw_number_t virq_to_hw(unsigned int virq) { -diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c -index bdee7262c080..d57d37497862 100644 ---- a/arch/powerpc/kernel/kgdb.c -+++ b/arch/powerpc/kernel/kgdb.c -@@ -120,11 +120,19 @@ int kgdb_skipexception(int exception, struct pt_regs *regs) - - static int kgdb_debugger_ipi(struct pt_regs *regs) - { -- kgdb_nmicallback(raw_smp_processor_id(), regs); -+ int cpu = raw_smp_processor_id(); -+ -+ if (!kgdb_roundup_delay(cpu)) -+ kgdb_nmicallback(cpu, regs); - return 0; - } - - #ifdef CONFIG_SMP -+void kgdb_roundup_cpu(unsigned int cpu) -+{ -+ smp_send_debugger_break_cpu(cpu); -+} -+ - void kgdb_roundup_cpus(void) - { - smp_send_debugger_break(); -diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c -index 605bab448f84..26c2179cf64a 100644 ---- a/arch/powerpc/kernel/smp.c -+++ b/arch/powerpc/kernel/smp.c -@@ -589,6 +589,11 @@ static void debugger_ipi_callback(struct pt_regs *regs) - debugger_ipi(regs); - } - -+void smp_send_debugger_break_cpu(unsigned int cpu) -+{ -+ smp_send_nmi_ipi(cpu, debugger_ipi_callback, 1000000); -+} -+ - void smp_send_debugger_break(void) - { - smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 11741703d26e..7e4e1f489f56 100644 --- a/arch/powerpc/kernel/traps.c @@ -1070,20 +956,6 @@ index 11741703d26e..7e4e1f489f56 100644 IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", -diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c -index 22ceeeb705ab..d5359701f787 100644 ---- a/arch/powerpc/kexec/crash.c -+++ b/arch/powerpc/kexec/crash.c -@@ -312,9 +312,6 @@ void default_machine_crash_shutdown(struct pt_regs *regs) - unsigned int i; - int (*old_handler)(struct pt_regs *regs); - -- /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */ -- printk_deferred_enter(); -- - /* - * This function is only called after the system - * has panicked or is otherwise in a critical state. diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index ff581d70f20c..e5c84d55bdfb 100644 --- a/arch/powerpc/kvm/Kconfig @@ -1097,7 +969,7 @@ index ff581d70f20c..e5c84d55bdfb 100644 select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c -index 8322ca86d5ac..f524145d7dd3 100644 +index 8f998e55735b..637b015d6900 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -24,6 +24,7 @@ @@ -1271,10 +1143,10 @@ index c8848bb681a1..41fa1be980a3 100644 #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 1f96809606ac..3493d2b94530 100644 +index 5c2ccb85f2ef..34ce7f969e28 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig -@@ -107,6 +107,7 @@ config X86 +@@ -108,6 +108,7 @@ config X86 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN @@ -1282,7 +1154,7 @@ index 1f96809606ac..3493d2b94530 100644 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS -@@ -230,6 +231,7 @@ config X86 +@@ -234,6 +235,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP @@ -1290,27 +1162,18 @@ index 1f96809606ac..3493d2b94530 100644 select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API -diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h -index 8d55bd11848c..f755c217f67b 100644 ---- a/arch/x86/include/asm/irq_stack.h -+++ b/arch/x86/include/asm/irq_stack.h -@@ -201,6 +201,7 @@ - IRQ_CONSTRAINTS, regs, vector); \ - } - -+#ifndef CONFIG_PREEMPT_RT - /* - * Macro to invoke __do_softirq on the irq stack. This is only called from - * task context when bottom halves are about to be reenabled and soft -@@ -214,6 +215,8 @@ - __this_cpu_write(hardirq_stack_inuse, false); \ - } +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 448cd01eb3ec..a34430b7af4a 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -22,6 +22,7 @@ + #define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) -+#endif -+ - #else /* CONFIG_X86_64 */ - /* System vector handlers always run on the stack they interrupted. */ - #define run_sysvec_on_irqstack_cond(func, regs) \ + #ifndef __ASSEMBLY__ ++#include <linux/spinlock.h> + #include <asm/x86_init.h> + #include <asm/pkru.h> + #include <asm/fpu/api.h> diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index fe5efbcba824..ab8cb5fc2329 100644 --- a/arch/x86/include/asm/preempt.h @@ -1389,46 +1252,20 @@ index 2dfb5fea13af..fc03f4f7ed84 100644 #ifndef CONFIG_COMPAT #define compat_sigset_t compat_sigset_t typedef sigset_t compat_sigset_t; -diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h -index 24a8d6c4fb18..2fc22c27df18 100644 ---- a/arch/x86/include/asm/stackprotector.h -+++ b/arch/x86/include/asm/stackprotector.h -@@ -50,7 +50,7 @@ - */ - static __always_inline void boot_init_stack_canary(void) - { -- u64 canary; -+ u64 canary = 0; - u64 tsc; - - #ifdef CONFIG_X86_64 -@@ -61,8 +61,14 @@ static __always_inline void boot_init_stack_canary(void) - * of randomness. The TSC only matters for very early init, - * there it already has some randomness on most systems. Later - * on during the bootup the random pool has true entropy too. -+ * For preempt-rt we need to weaken the randomness a bit, as -+ * we can't call into the random generator from atomic context -+ * due to locking constraints. We just leave canary -+ * uninitialized and use the TSC based randomness on top of it. - */ -+#ifndef CONFIG_PREEMPT_RT - get_random_bytes(&canary, sizeof(canary)); -+#endif - tsc = rdtsc(); - canary += tsc + (tsc << 32UL); - canary &= CANARY_MASK; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h -index cf132663c219..75dc786e6365 100644 +index ebec69c35e95..39005bff5b8f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h -@@ -57,11 +57,14 @@ struct thread_info { +@@ -57,6 +57,8 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ + int preempt_lazy_count; /* 0 => lazy preemptable -+ <0 => BUG */ - }; - ++ <0 => BUG */ + #ifdef CONFIG_SMP + u32 cpu; /* current CPU */ + #endif +@@ -65,6 +67,7 @@ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .flags = 0, \ @@ -1436,7 +1273,7 @@ index cf132663c219..75dc786e6365 100644 } #else /* !__ASSEMBLY__ */ -@@ -90,6 +93,7 @@ struct thread_info { +@@ -93,6 +96,7 @@ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ #define TIF_SLD 18 /* Restore split lock detection on context switch */ @@ -1444,7 +1281,7 @@ index cf132663c219..75dc786e6365 100644 #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ -@@ -114,6 +118,7 @@ struct thread_info { +@@ -117,6 +121,7 @@ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SLD (1 << TIF_SLD) @@ -1453,185 +1290,35 @@ index cf132663c219..75dc786e6365 100644 #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c -index ef6316fef99f..86974cd60942 100644 +index ff55df60228f..2a0f83678911 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c -@@ -75,11 +75,12 @@ void hv_remove_vmbus_handler(void) - DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) - { - struct pt_regs *old_regs = set_irq_regs(regs); -+ u64 ip = regs ? instruction_pointer(regs) : 0; - +@@ -79,7 +79,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); - add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); -+ add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0, ip); ++ add_interrupt_randomness(HYPERV_STIMER0_VECTOR); ack_APIC_irq(); set_irq_regs(old_regs); -diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c -index 722fd712e1cf..82cc3a7be6bd 100644 ---- a/arch/x86/kernel/dumpstack_32.c -+++ b/arch/x86/kernel/dumpstack_32.c -@@ -141,7 +141,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, - */ - if (visit_mask) { - if (*visit_mask & (1UL << info->type)) { -- printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); -+ pr_warn_once("WARNING: stack recursion on stack type %d\n", info->type); - goto unknown; - } - *visit_mask |= 1UL << info->type; -diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c -index 6c5defd6569a..5f725b0ceb29 100644 ---- a/arch/x86/kernel/dumpstack_64.c -+++ b/arch/x86/kernel/dumpstack_64.c -@@ -207,7 +207,8 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, - if (visit_mask) { - if (*visit_mask & (1UL << info->type)) { - if (task == current) -- printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); -+ pr_warn_once("WARNING: stack recursion on stack type %d\n", -+ info->type); - goto unknown; - } - *visit_mask |= 1UL << info->type; -diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c -index 15aefa3f3e18..52af9a89ad47 100644 ---- a/arch/x86/kernel/i8259.c -+++ b/arch/x86/kernel/i8259.c -@@ -207,8 +207,7 @@ static void mask_and_ack_8259A(struct irq_data *data) - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { -- printk_deferred(KERN_DEBUG -- "spurious 8259A interrupt: IRQ%d.\n", irq); -+ printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); -diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c -index 044902d5a3c4..e5dd6da78713 100644 ---- a/arch/x86/kernel/irq_32.c -+++ b/arch/x86/kernel/irq_32.c -@@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) - return 0; - } - -+#ifndef CONFIG_PREEMPT_RT - void do_softirq_own_stack(void) - { - struct irq_stack *irqstk; -@@ -148,6 +149,7 @@ void do_softirq_own_stack(void) - - call_on_stack(__do_softirq, isp); - } -+#endif - - void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) - { -diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c -index 3a43a2dee658..37bd37cdf2b6 100644 ---- a/arch/x86/kernel/kgdb.c -+++ b/arch/x86/kernel/kgdb.c -@@ -502,9 +502,12 @@ static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) - if (atomic_read(&kgdb_active) != -1) { - /* KGDB CPU roundup */ - cpu = raw_smp_processor_id(); -- kgdb_nmicallback(cpu, regs); -- set_bit(cpu, was_in_debug_nmi); -- touch_nmi_watchdog(); -+ -+ if (!kgdb_roundup_delay(cpu)) { -+ kgdb_nmicallback(cpu, regs); -+ set_bit(cpu, was_in_debug_nmi); -+ touch_nmi_watchdog(); -+ } - - return NMI_HANDLED; - } -diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c -index d7c44b257f7f..2d0361cd304f 100644 ---- a/arch/x86/kernel/unwind_frame.c -+++ b/arch/x86/kernel/unwind_frame.c -@@ -41,9 +41,9 @@ static void unwind_dump(struct unwind_state *state) - - dumped_before = true; - -- printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", -- state->stack_info.type, state->stack_info.next_sp, -- state->stack_mask, state->graph_idx); -+ printk("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", -+ state->stack_info.type, state->stack_info.next_sp, -+ state->stack_mask, state->graph_idx); - - for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp; - sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { -@@ -59,13 +59,11 @@ static void unwind_dump(struct unwind_state *state) - - if (zero) { - if (!prev_zero) -- printk_deferred("%p: %0*x ...\n", -- sp, BITS_PER_LONG/4, 0); -+ printk("%p: %0*x ...\n", sp, BITS_PER_LONG/4, 0); - continue; - } - -- printk_deferred("%p: %0*lx (%pB)\n", -- sp, BITS_PER_LONG/4, word, (void *)word); -+ printk("%p: %0*lx (%pB)\n", sp, BITS_PER_LONG/4, word, (void *)word); - } - } - } -@@ -342,13 +340,13 @@ bool unwind_next_frame(struct unwind_state *state) - goto the_end; - - if (state->regs) { -- printk_deferred_once(KERN_WARNING -+ pr_warn_once( - "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", - state->regs, state->task->comm, - state->task->pid, next_bp); - unwind_dump(state); - } else { -- printk_deferred_once(KERN_WARNING -+ pr_warn_once( - "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", - state->bp, state->task->comm, - state->task->pid, next_bp); -diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c -index a1202536fc57..a26a7c3849f5 100644 ---- a/arch/x86/kernel/unwind_orc.c -+++ b/arch/x86/kernel/unwind_orc.c -@@ -9,7 +9,7 @@ - #include <asm/orc_lookup.h> - - #define orc_warn(fmt, ...) \ -- printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) -+ pr_warn_once("WARNING: " fmt, ##__VA_ARGS__) - - #define orc_warn_current(args...) \ - ({ \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c -index eff065ce6f8e..d47927a4e8c3 100644 +index 0b5c61bb24a1..9dbf870229bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c -@@ -8511,6 +8511,14 @@ int kvm_arch_init(void *opaque) +@@ -8655,6 +8655,12 @@ int kvm_arch_init(void *opaque) goto out; } -+#ifdef CONFIG_PREEMPT_RT -+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); + r = -EOPNOTSUPP; + goto out; + } -+#endif + r = -ENOMEM; - x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), - __alignof__(struct fpu), SLAB_ACCOUNT, + + x86_emulator_cache = kvm_alloc_emulator_cache(); diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h index 64c9389254f1..797aed7df3dd 100644 --- a/arch/xtensa/include/asm/spinlock_types.h @@ -1646,10 +1333,10 @@ index 64c9389254f1..797aed7df3dd 100644 #endif diff --git a/block/blk-mq.c b/block/blk-mq.c -index 82de39926a9f..330b6274bf6b 100644 +index 8874a63ae952..1f7569d135fa 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c -@@ -1563,14 +1563,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, +@@ -1857,14 +1857,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { @@ -1728,29 +1415,8 @@ index a1bea0f4baa8..5f8ca8c1f59c 100644 if (!req) return; -diff --git a/crypto/testmgr.c b/crypto/testmgr.c -index 70f69f0910c9..58eee8eab4bf 100644 ---- a/crypto/testmgr.c -+++ b/crypto/testmgr.c -@@ -1061,14 +1061,14 @@ static void generate_random_testvec_config(struct testvec_config *cfg, - - static void crypto_disable_simd_for_test(void) - { -- preempt_disable(); -+ migrate_disable(); - __this_cpu_write(crypto_simd_disabled_for_test, true); - } - - static void crypto_reenable_simd_for_test(void) - { - __this_cpu_write(crypto_simd_disabled_for_test, false); -- preempt_enable(); -+ migrate_enable(); - } - - /* diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c -index 6383c81ac5b3..abb695f5f5e4 100644 +index 25071126995b..6ff2bcfb9d0e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -59,6 +59,40 @@ static void zram_free_page(struct zram *zram, size_t index); @@ -1802,7 +1468,7 @@ index 6383c81ac5b3..abb695f5f5e4 100644 static inline bool init_done(struct zram *zram) { -@@ -1169,6 +1204,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) +@@ -1199,6 +1234,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); @@ -1823,53 +1489,126 @@ index 80c3b43b4828..d8f6d880f915 100644 ktime_t ac_time; #endif diff --git a/drivers/char/random.c b/drivers/char/random.c -index 605969ed0f96..56b2d5a7e2a0 100644 +index 7470ee24db2f..4b93ca6ecef9 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c -@@ -1242,26 +1242,25 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs) +@@ -200,7 +200,7 @@ + * void add_device_randomness(const void *buf, unsigned int size); + * void add_input_randomness(unsigned int type, unsigned int code, + * unsigned int value); +- * void add_interrupt_randomness(int irq, int irq_flags); ++ * void add_interrupt_randomness(int irq); + * void add_disk_randomness(struct gendisk *disk); + * + * add_device_randomness() is for adding data to the random pool that +@@ -1260,9 +1260,65 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs) return *ptr; } -void add_interrupt_randomness(int irq, int irq_flags) -+void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) ++static bool process_interrupt_randomness_pool(struct fast_pool *fast_pool) { struct entropy_store *r; ++ ++ if (unlikely(crng_init == 0)) { ++ bool pool_reset = false; ++ ++ if ((fast_pool->count >= 64) && ++ crng_fast_load((char *) fast_pool->pool, ++ sizeof(fast_pool->pool))) ++ pool_reset = true; ++ ++ return pool_reset; ++ } ++ ++ if ((fast_pool->count < 64) && ++ !time_after(jiffies, fast_pool->last + HZ)) ++ return false; ++ ++ r = &input_pool; ++ if (!spin_trylock(&r->lock)) ++ return false; ++ ++ __mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool)); ++ spin_unlock(&r->lock); ++ ++ /* award one bit for the contents of the fast pool */ ++ credit_entropy_bits(r, 1); ++ return true; ++} ++ ++#ifdef CONFIG_PREEMPT_RT ++void process_interrupt_randomness(void) ++{ ++ struct fast_pool *cpu_pool; ++ struct fast_pool fast_pool; ++ ++ lockdep_assert_irqs_enabled(); ++ ++ migrate_disable(); ++ cpu_pool = this_cpu_ptr(&irq_randomness); ++ ++ local_irq_disable(); ++ memcpy(&fast_pool, cpu_pool, sizeof(fast_pool)); ++ local_irq_enable(); ++ ++ if (process_interrupt_randomness_pool(&fast_pool)) { ++ local_irq_disable(); ++ cpu_pool->last = jiffies; ++ cpu_pool->count = 0; ++ local_irq_enable(); ++ } ++ memzero_explicit(&fast_pool, sizeof(fast_pool)); ++ migrate_enable(); ++} ++#endif ++ ++void add_interrupt_randomness(int irq) ++{ struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness); -- struct pt_regs *regs = get_irq_regs(); + struct pt_regs *regs = get_irq_regs(); unsigned long now = jiffies; - cycles_t cycles = random_get_entropy(); - __u32 c_high, j_high; -- __u64 ip; - - if (cycles == 0) -- cycles = get_reg(fast_pool, regs); -+ cycles = get_reg(fast_pool, NULL); - c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0; - j_high = (sizeof(now) > 4) ? now >> 32 : 0; - fast_pool->pool[0] ^= cycles ^ j_high ^ irq; - fast_pool->pool[1] ^= now ^ c_high; -- ip = regs ? instruction_pointer(regs) : _RET_IP_; -+ if (!ip) -+ ip = _RET_IP_; - fast_pool->pool[2] ^= ip; - fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 : -- get_reg(fast_pool, regs); -+ get_reg(fast_pool, NULL); - +@@ -1284,32 +1340,17 @@ void add_interrupt_randomness(int irq, int irq_flags) fast_mix(fast_pool); add_interrupt_bench(cycles); -@@ -1507,9 +1506,8 @@ static void _warn_unseeded_randomness(const char *func_name, void *caller, - print_once = true; - #endif - if (__ratelimit(&unseeded_warning)) -- printk_deferred(KERN_NOTICE "random: %s called from %pS " -- "with crng_init=%d\n", func_name, caller, -- crng_init); -+ pr_notice("random: %s called from %pS with crng_init=%d\n", -+ func_name, caller, crng_init); + +- if (unlikely(crng_init == 0)) { +- if ((fast_pool->count >= 64) && +- crng_fast_load((char *) fast_pool->pool, +- sizeof(fast_pool->pool))) { +- fast_pool->count = 0; ++ /* ++ * On PREEMPT_RT the entropy can not be fed into the input_pool because ++ * it needs to acquire sleeping locks with disabled interrupts. ++ * This is deferred to the threaded handler. ++ */ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ if (process_interrupt_randomness_pool(fast_pool)) { + fast_pool->last = now; ++ fast_pool->count = 0; + } +- return; + } +- +- if ((fast_pool->count < 64) && +- !time_after(now, fast_pool->last + HZ)) +- return; +- +- r = &input_pool; +- if (!spin_trylock(&r->lock)) +- return; +- +- fast_pool->last = now; +- __mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool)); +- spin_unlock(&r->lock); +- +- fast_pool->count = 0; +- +- /* award one bit for the contents of the fast pool */ +- credit_entropy_bits(r, 1); } + EXPORT_SYMBOL_GPL(add_interrupt_randomness); - /* diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index d3f2e5364c27..9c4a99757afd 100644 --- a/drivers/char/tpm/tpm_tis.c @@ -1924,29 +1663,6 @@ index d3f2e5364c27..9c4a99757afd 100644 return 0; } -diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c -index 847f33ffc4ae..ae79c3300129 100644 ---- a/drivers/firmware/efi/efi.c -+++ b/drivers/firmware/efi/efi.c -@@ -66,7 +66,7 @@ struct mm_struct efi_mm = { - - struct workqueue_struct *efi_rts_wq; - --static bool disable_runtime; -+static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT); - static int __init setup_noefi(char *arg) - { - disable_runtime = true; -@@ -97,6 +97,9 @@ static int __init parse_efi_cmdline(char *str) - if (parse_option_str(str, "noruntime")) - disable_runtime = true; - -+ if (parse_option_str(str, "runtime")) -+ disable_runtime = false; -+ - if (parse_option_str(str, "nosoftreserve")) - set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); - diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c index 254e67141a77..7a39029b083f 100644 --- a/drivers/gpu/drm/i915/display/intel_crtc.c @@ -2015,33 +1731,33 @@ index 209cf265bf74..6e1b9068d944 100644 } diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h -index c41098950746..601274ba86e4 100644 +index 246c37d72cd7..d8c74bbf9aae 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h -@@ -163,7 +163,8 @@ static inline void intel_context_enter(struct intel_context *ce) +@@ -211,7 +211,8 @@ static inline void intel_context_enter(struct intel_context *ce) static inline void intel_context_mark_active(struct intel_context *ce) { - lockdep_assert_held(&ce->timeline->mutex); + lockdep_assert(lockdep_is_held(&ce->timeline->mutex) || -+ test_bit(CONTEXT_IS_PARKED, &ce->flags)); ++ test_bit(CONTEXT_IS_PARKING, &ce->flags)); ++ce->active_count; } diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h -index e54351a170e2..1022be795e68 100644 +index 9e0177dc5484..30cd81ad8911 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h -@@ -112,6 +112,7 @@ struct intel_context { - #define CONTEXT_FORCE_SINGLE_SUBMISSION 7 - #define CONTEXT_NOPREEMPT 8 +@@ -118,6 +118,7 @@ struct intel_context { #define CONTEXT_LRCA_DIRTY 9 -+#define CONTEXT_IS_PARKED 10 + #define CONTEXT_GUC_INIT 10 + #define CONTEXT_PERMA_PIN 11 ++#define CONTEXT_IS_PARKING 12 struct { u64 timeout_us; diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c -index 1f07ac4e0672..e84f03a276d1 100644 +index a1334b48dde7..a8a2ad44b7e3 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -80,39 +80,6 @@ static int __engine_unpark(struct intel_wakeref *wf) @@ -2091,30 +1807,30 @@ index 1f07ac4e0672..e84f03a276d1 100644 - unsigned long flags; bool result = true; - /* GPU is pointing to the void, as good as in the kernel context. */ -@@ -201,7 +167,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) + /* +@@ -214,7 +180,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) * engine->wakeref.count, we may see the request completion and retire * it causing an underflow of the engine->wakeref. */ - flags = __timeline_mark_lock(ce); -+ set_bit(CONTEXT_IS_PARKED, &ce->flags); ++ set_bit(CONTEXT_IS_PARKING, &ce->flags); GEM_BUG_ON(atomic_read(&ce->timeline->active_count) < 0); rq = __i915_request_create(ce, GFP_NOWAIT); -@@ -233,7 +199,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) +@@ -246,7 +212,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) result = false; out_unlock: - __timeline_mark_unlock(ce, flags); -+ clear_bit(CONTEXT_IS_PARKED, &ce->flags); ++ clear_bit(CONTEXT_IS_PARKING, &ce->flags); return result; } diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -index cafb0608ffb4..07156996fc82 100644 +index bedb80057046..1dbcac05f44e 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -@@ -1283,7 +1283,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1284,7 +1284,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * and context switches) submission. */ @@ -2123,7 +1839,7 @@ index cafb0608ffb4..07156996fc82 100644 /* * If the queue is higher priority than the last -@@ -1383,7 +1383,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1384,7 +1384,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. */ @@ -2132,7 +1848,7 @@ index cafb0608ffb4..07156996fc82 100644 return; } } -@@ -1409,7 +1409,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1410,7 +1410,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.sched_engine->lock); @@ -2141,7 +1857,7 @@ index cafb0608ffb4..07156996fc82 100644 return; /* leave this for another sibling */ } -@@ -1571,7 +1571,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1572,7 +1572,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) */ sched_engine->queue_priority_hint = queue_prio(sched_engine); i915_sched_engine_reset_on_empty(sched_engine); @@ -2150,7 +1866,7 @@ index cafb0608ffb4..07156996fc82 100644 /* * We can skip poking the HW if we ended up with exactly the same set -@@ -1597,13 +1597,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1598,13 +1598,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) } } @@ -2164,7 +1880,7 @@ index cafb0608ffb4..07156996fc82 100644 static void clear_ports(struct i915_request **ports, int count) { memset_p((void **)ports, NULL, count); -@@ -2423,7 +2416,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) +@@ -2424,7 +2417,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) } if (!engine->execlists.pending[0]) { @@ -2174,10 +1890,10 @@ index cafb0608ffb4..07156996fc82 100644 } diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c -index 9bc4f4a8e12e..547347241a47 100644 +index 77680bca46ee..be8faaaa6022 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c -@@ -886,7 +886,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +@@ -916,7 +916,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, */ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); @@ -2187,7 +1903,7 @@ index 9bc4f4a8e12e..547347241a47 100644 /* Get optional system timestamp before query. */ if (stime) -@@ -950,7 +951,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +@@ -980,7 +981,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, if (etime) *etime = ktime_get(); @@ -2198,10 +1914,10 @@ index 9bc4f4a8e12e..547347241a47 100644 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c -index 79da5eca60af..b9dd6100c6d1 100644 +index 89cccefeea63..4665a4d4924e 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c -@@ -559,7 +559,6 @@ bool __i915_request_submit(struct i915_request *request) +@@ -560,7 +560,6 @@ bool __i915_request_submit(struct i915_request *request) RQ_TRACE(request, "\n"); @@ -2209,7 +1925,7 @@ index 79da5eca60af..b9dd6100c6d1 100644 lockdep_assert_held(&engine->sched_engine->lock); /* -@@ -668,7 +667,6 @@ void __i915_request_unsubmit(struct i915_request *request) +@@ -669,7 +668,6 @@ void __i915_request_unsubmit(struct i915_request *request) */ RQ_TRACE(request, "\n"); @@ -2218,21 +1934,21 @@ index 79da5eca60af..b9dd6100c6d1 100644 /* diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h -index 1bc1349ba3c2..a2f713b4ac2f 100644 +index dc359242d1ae..b7fe67405fd3 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h -@@ -609,7 +609,8 @@ i915_request_timeline(const struct i915_request *rq) +@@ -642,7 +642,8 @@ i915_request_timeline(const struct i915_request *rq) { /* Valid only while the request is being constructed (or retired). */ return rcu_dereference_protected(rq->timeline, - lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex)); + lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex) || -+ test_bit(CONTEXT_IS_PARKED, &rq->context->flags)); ++ test_bit(CONTEXT_IS_PARKING, &rq->context->flags)); } static inline struct i915_gem_context * diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h -index 63fec1c3c132..f345a0f12bf6 100644 +index 8104981a6604..89a4089bc4ba 100644 --- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -2,6 +2,10 @@ @@ -2256,10 +1972,10 @@ index 63fec1c3c132..f345a0f12bf6 100644 TP_PROTO(struct i915_request *rq), TP_ARGS(rq) diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h -index 5259edacde38..b36b27c09049 100644 +index 7a5925072466..b7b56fb1e2fc 100644 --- a/drivers/gpu/drm/i915/i915_utils.h +++ b/drivers/gpu/drm/i915/i915_utils.h -@@ -343,7 +343,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) +@@ -344,7 +344,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */ @@ -2268,62 +1984,56 @@ index 5259edacde38..b36b27c09049 100644 # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) #else # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) -diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h -index d030577ad6a2..ef1db3367df7 100644 ---- a/drivers/hv/hyperv_vmbus.h -+++ b/drivers/hv/hyperv_vmbus.h -@@ -19,6 +19,7 @@ - #include <linux/atomic.h> - #include <linux/hyperv.h> - #include <linux/interrupt.h> -+#include <linux/irq.h> - - #include "hv_trace.h" - diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c -index 392c1ac4f819..c5e9725fb5ff 100644 +index 392c1ac4f819..7ae04ccb1043 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c -@@ -22,6 +22,7 @@ - #include <linux/clockchips.h> - #include <linux/cpu.h> - #include <linux/sched/task_stack.h> -+#include <linux/irq.h> - - #include <linux/delay.h> - #include <linux/notifier.h> -@@ -1337,6 +1338,8 @@ static void vmbus_isr(void) - void *page_addr = hv_cpu->synic_event_page; - struct hv_message *msg; - union hv_synic_event_flags *event; -+ struct pt_regs *regs = get_irq_regs(); -+ u64 ip = regs ? instruction_pointer(regs) : 0; - bool handled = false; - - if (unlikely(page_addr == NULL)) -@@ -1381,7 +1384,7 @@ static void vmbus_isr(void) +@@ -1381,7 +1381,7 @@ static void vmbus_isr(void) tasklet_schedule(&hv_cpu->msg_dpc); } - add_interrupt_randomness(vmbus_interrupt, 0); -+ add_interrupt_randomness(vmbus_interrupt, 0, ip); ++ add_interrupt_randomness(vmbus_interrupt); } static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) -diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig -index 1f1d57288085..dc6816d36d06 100644 ---- a/drivers/leds/trigger/Kconfig -+++ b/drivers/leds/trigger/Kconfig -@@ -64,6 +64,7 @@ config LEDS_TRIGGER_BACKLIGHT - - config LEDS_TRIGGER_CPU - bool "LED CPU Trigger" -+ depends on !PREEMPT_RT - help - This allows LEDs to be controlled by active CPUs. This shows - the active CPUs across an array of LEDs so you can see which +diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c +index 1cf68f85b2e1..8ccf0c928bb4 100644 +--- a/drivers/i2c/busses/i2c-cht-wc.c ++++ b/drivers/i2c/busses/i2c-cht-wc.c +@@ -99,15 +99,8 @@ static irqreturn_t cht_wc_i2c_adap_thread_handler(int id, void *data) + * interrupt handler as well, so running the client irq handler from + * this thread will cause things to lock up. + */ +- if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) { +- /* +- * generic_handle_irq expects local IRQs to be disabled +- * as normally it is called from interrupt context. +- */ +- local_irq_disable(); +- generic_handle_irq(adap->client_irq); +- local_irq_enable(); +- } ++ if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) ++ generic_handle_irq_safe(adap->client_irq); + + return IRQ_HANDLED; + } +diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c +index 73253e667de1..698f3e928fcf 100644 +--- a/drivers/i2c/i2c-core-base.c ++++ b/drivers/i2c/i2c-core-base.c +@@ -1423,7 +1423,7 @@ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr) + if (irq <= 0) + return -ENXIO; + +- generic_handle_irq(irq); ++ generic_handle_irq_safe(irq); + + return 0; + } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c -index 02ed53b20654..e459744c7a0d 100644 +index 9c1a5877cf9f..e748c0e33349 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2217,8 +2217,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) @@ -2367,21 +2077,138 @@ index 5c05acf20e1f..665fe138ab4f 100644 struct page *spare_page; /* Used when checking P/Q in raid6 */ void *scribble; /* space for constructing buffer * lists and performing address -diff --git a/drivers/net/ethernet/netronome/nfp/abm/qdisc.c b/drivers/net/ethernet/netronome/nfp/abm/qdisc.c -index 2473fb5f75e5..2a5cc64227e9 100644 ---- a/drivers/net/ethernet/netronome/nfp/abm/qdisc.c -+++ b/drivers/net/ethernet/netronome/nfp/abm/qdisc.c -@@ -458,7 +458,7 @@ nfp_abm_qdisc_graft(struct nfp_abm_link *alink, u32 handle, u32 child_handle, - static void - nfp_abm_stats_calculate(struct nfp_alink_stats *new, - struct nfp_alink_stats *old, -- struct gnet_stats_basic_packed *bstats, -+ struct gnet_stats_basic_sync *bstats, - struct gnet_stats_queue *qstats) +diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c +index 70fa18b04ad2..b14d3f98e1eb 100644 +--- a/drivers/mfd/ezx-pcap.c ++++ b/drivers/mfd/ezx-pcap.c +@@ -193,13 +193,11 @@ static void pcap_isr_work(struct work_struct *work) + ezx_pcap_write(pcap, PCAP_REG_MSR, isr | msr); + ezx_pcap_write(pcap, PCAP_REG_ISR, isr); + +- local_irq_disable(); + service = isr & ~msr; + for (irq = pcap->irq_base; service; service >>= 1, irq++) { + if (service & 1) +- generic_handle_irq(irq); ++ generic_handle_irq_safe(irq); + } +- local_irq_enable(); + ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr); + } while (gpio_get_value(pdata->gpio)); + } +diff --git a/drivers/misc/hi6421v600-irq.c b/drivers/misc/hi6421v600-irq.c +index 1c763796cf1f..caa3de37698b 100644 +--- a/drivers/misc/hi6421v600-irq.c ++++ b/drivers/misc/hi6421v600-irq.c +@@ -117,8 +117,8 @@ static irqreturn_t hi6421v600_irq_handler(int irq, void *__priv) + * If both powerkey down and up IRQs are received, + * handle them at the right order + */ +- generic_handle_irq(priv->irqs[POWERKEY_DOWN]); +- generic_handle_irq(priv->irqs[POWERKEY_UP]); ++ generic_handle_irq_safe(priv->irqs[POWERKEY_DOWN]); ++ generic_handle_irq_safe(priv->irqs[POWERKEY_UP]); + pending &= ~HISI_IRQ_POWERKEY_UP_DOWN; + } + +@@ -126,7 +126,7 @@ static irqreturn_t hi6421v600_irq_handler(int irq, void *__priv) + continue; + + for_each_set_bit(offset, &pending, BITS_PER_BYTE) { +- generic_handle_irq(priv->irqs[offset + i * BITS_PER_BYTE]); ++ generic_handle_irq_safe(priv->irqs[offset + i * BITS_PER_BYTE]); + } + } + +diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c +index 90e1bcd03b46..52309b84be88 100644 +--- a/drivers/mmc/core/block.c ++++ b/drivers/mmc/core/block.c +@@ -2051,7 +2051,8 @@ static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req) + mmc_put_card(mq->card, &mq->ctx); + } + +-static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) ++static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req, ++ bool can_sleep) { - _bstats_update(bstats, new->tx_bytes - old->tx_bytes, + struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req); + struct mmc_request *mrq = &mqrq->brq.mrq; +@@ -2063,10 +2064,14 @@ static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) + * Block layer timeouts race with completions which means the normal + * completion path cannot be used during recovery. + */ +- if (mq->in_recovery) ++ if (mq->in_recovery) { + mmc_blk_mq_complete_rq(mq, req); +- else if (likely(!blk_should_fake_timeout(req->q))) +- blk_mq_complete_request(req); ++ } else if (likely(!blk_should_fake_timeout(req->q))) { ++ if (can_sleep) ++ blk_mq_complete_request_direct(req, mmc_blk_mq_complete); ++ else ++ blk_mq_complete_request(req); ++ } + + mmc_blk_mq_dec_in_flight(mq, req); + } +@@ -2087,7 +2092,7 @@ void mmc_blk_mq_recovery(struct mmc_queue *mq) + + mmc_blk_urgent_bkops(mq, mqrq); + +- mmc_blk_mq_post_req(mq, req); ++ mmc_blk_mq_post_req(mq, req, true); + } + + static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq, +@@ -2106,7 +2111,7 @@ static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq, + if (prev_req) + *prev_req = mq->complete_req; + else +- mmc_blk_mq_post_req(mq, mq->complete_req); ++ mmc_blk_mq_post_req(mq, mq->complete_req, true); + + mq->complete_req = NULL; + +@@ -2178,7 +2183,8 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) + mq->rw_wait = false; + wake_up(&mq->wait); + +- mmc_blk_mq_post_req(mq, req); ++ /* context unknown */ ++ mmc_blk_mq_post_req(mq, req, false); + } + + static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) +@@ -2238,7 +2244,7 @@ static int mmc_blk_mq_issue_rw_rq(struct mmc_queue *mq, + err = mmc_start_request(host, &mqrq->brq.mrq); + + if (prev_req) +- mmc_blk_mq_post_req(mq, prev_req); ++ mmc_blk_mq_post_req(mq, prev_req, true); + + if (err) + mq->rw_wait = false; +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index 075f8abde5cd..6cf28f688190 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -1367,11 +1367,8 @@ static void lan78xx_status(struct lan78xx_net *dev, struct urb *urb) + netif_dbg(dev, link, dev->net, "PHY INTR: 0x%08x\n", intdata); + lan78xx_defer_kevent(dev, EVENT_LINK_RESET); + +- if (dev->domain_data.phyirq > 0) { +- local_irq_disable(); +- generic_handle_irq(dev->domain_data.phyirq); +- local_irq_enable(); +- } ++ if (dev->domain_data.phyirq > 0) ++ generic_handle_irq_safe(dev->domain_data.phyirq); + } else { + netdev_warn(dev->net, + "unexpected interrupt: 0x%08x\n", intdata); diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c -index 5ae6c207d3ac..660908027dc5 100644 +index 6415f88738ad..556284ea978b 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1450,11 +1450,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev, @@ -2477,8 +2304,24 @@ index 841000445b9a..26d661ddc950 100644 /* peek cache of free slot */ if (pool->left != FC_XID_UNKNOWN) { +diff --git a/drivers/staging/greybus/gpio.c b/drivers/staging/greybus/gpio.c +index 7e6347fe93f9..8a7cf1d0e968 100644 +--- a/drivers/staging/greybus/gpio.c ++++ b/drivers/staging/greybus/gpio.c +@@ -391,10 +391,7 @@ static int gb_gpio_request_handler(struct gb_operation *op) + return -EINVAL; + } + +- local_irq_disable(); +- ret = generic_handle_irq(irq); +- local_irq_enable(); +- ++ ret = generic_handle_irq_safe(irq); + if (ret) + dev_err(dev, "failed to invoke irq handler\n"); + diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h -index 6473361525d1..2321d02e9b7a 100644 +index 6473361525d1..7b1a88934d6f 100644 --- a/drivers/tty/serial/8250/8250.h +++ b/drivers/tty/serial/8250/8250.h @@ -132,12 +132,55 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) @@ -2495,12 +2338,12 @@ index 6473361525d1..2321d02e9b7a 100644 + is_console = uart_console(port); + + if (is_console) -+ console_atomic_lock(flags); ++ printk_cpu_sync_get_irqsave(flags); + + serial_out(up, UART_IER, ier); + + if (is_console) -+ console_atomic_unlock(flags); ++ printk_cpu_sync_put_irqrestore(flags); +} + +static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up) @@ -2517,13 +2360,13 @@ index 6473361525d1..2321d02e9b7a 100644 + clearval = UART_IER_UUE; + + if (is_console) -+ console_atomic_lock(flags); ++ printk_cpu_sync_get_irqsave(flags); + + prior = serial_port_in(port, UART_IER); + serial_port_out(port, UART_IER, clearval); + + if (is_console) -+ console_atomic_unlock(flags); ++ printk_cpu_sync_put_irqrestore(flags); + + return prior; +} @@ -2597,10 +2440,10 @@ index 1ce193daea7f..fad00c0414e3 100644 .device = uart_console_device, .setup = univ8250_console_setup, diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c -index fc65a2293ce9..19a92530040f 100644 +index 9c01c531349d..d9b651290e1c 100644 --- a/drivers/tty/serial/8250/8250_fsl.c +++ b/drivers/tty/serial/8250/8250_fsl.c -@@ -60,9 +60,18 @@ int fsl8250_handle_irq(struct uart_port *port) +@@ -56,9 +56,18 @@ int fsl8250_handle_irq(struct uart_port *port) /* Stop processing interrupts on input overrun */ if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { @@ -2611,16 +2454,16 @@ index fc65a2293ce9..19a92530040f 100644 + is_console = uart_console(port); + + if (is_console) -+ console_atomic_lock(flags); ++ printk_cpu_sync_get_irqsave(flags); up->ier = port->serial_in(port, UART_IER); + if (is_console) -+ console_atomic_unlock(flags); ++ printk_cpu_sync_put_irqrestore(flags); + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c -index 65402d05eff9..8122645ab05c 100644 +index 65402d05eff9..061d8e4072c6 100644 --- a/drivers/tty/serial/8250/8250_ingenic.c +++ b/drivers/tty/serial/8250/8250_ingenic.c @@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", @@ -2638,15 +2481,15 @@ index 65402d05eff9..8122645ab05c 100644 */ + is_console = uart_console(p); + if (is_console) -+ console_atomic_lock(flags); ++ printk_cpu_sync_get_irqsave(flags); ier = p->serial_in(p, UART_IER); + if (is_console) -+ console_atomic_unlock(flags); ++ printk_cpu_sync_put_irqrestore(flags); if (ier & UART_IER_MSI) value |= UART_MCR_MDCE | UART_MCR_FCM; diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c -index fb65dc601b23..9af18b5d8296 100644 +index fb65dc601b23..e5032e5abd8e 100644 --- a/drivers/tty/serial/8250/8250_mtk.c +++ b/drivers/tty/serial/8250/8250_mtk.c @@ -218,12 +218,37 @@ static void mtk8250_shutdown(struct uart_port *port) @@ -2662,13 +2505,13 @@ index fb65dc601b23..9af18b5d8296 100644 + is_console = uart_console(port); + + if (is_console) -+ console_atomic_lock(flags); ++ printk_cpu_sync_get_irqsave(flags); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier & (~mask)); + + if (is_console) -+ console_atomic_unlock(flags); ++ printk_cpu_sync_put_irqrestore(flags); } static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) @@ -2679,18 +2522,18 @@ index fb65dc601b23..9af18b5d8296 100644 + unsigned int ier; + + if (uart_console(port)) -+ console_atomic_lock(flags); ++ printk_cpu_sync_get_irqsave(flags); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier | mask); + + if (uart_console(port)) -+ console_atomic_unlock(flags); ++ printk_cpu_sync_put_irqrestore(flags); } static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c -index ec88b706e882..7774c63ce53d 100644 +index 46e2079ad1aa..49883a0a58a5 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -762,7 +762,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) @@ -2801,9 +2644,9 @@ index ec88b706e882..7774c63ce53d 100644 + + wait_for_xmitr(up, UART_LSR_THRE); + -+ console_atomic_lock(flags); ++ printk_cpu_sync_get_irqsave(flags); + serial8250_console_putchar_locked(port, ch); -+ console_atomic_unlock(flags); ++ printk_cpu_sync_put_irqrestore(flags); +} + /* @@ -2820,7 +2663,7 @@ index ec88b706e882..7774c63ce53d 100644 + unsigned long flags; + unsigned int ier; + -+ console_atomic_lock(flags); ++ printk_cpu_sync_get_irqsave(flags); + + touch_nmi_watchdog(); + @@ -2836,7 +2679,7 @@ index ec88b706e882..7774c63ce53d 100644 + wait_for_xmitr(up, BOTH_EMPTY); + serial8250_set_IER(up, ier); + -+ console_atomic_unlock(flags); ++ printk_cpu_sync_put_irqrestore(flags); +} + /* @@ -2916,6 +2759,18 @@ index ec88b706e882..7774c63ce53d 100644 if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) +diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig +index 8cd11aa63ed5..9b6695bdafc9 100644 +--- a/drivers/tty/serial/8250/Kconfig ++++ b/drivers/tty/serial/8250/Kconfig +@@ -9,6 +9,7 @@ config SERIAL_8250 + depends on !S390 + select SERIAL_CORE + select SERIAL_MCTRL_GPIO if GPIOLIB ++ select HAVE_ATOMIC_CONSOLE + help + This selects whether you want to include the driver for the standard + serial ports. The standard answer is Y. People who might say N diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 52518a606c06..1ca270b9857a 100644 --- a/drivers/tty/serial/amba-pl011.c @@ -2990,6 +2845,19 @@ index 0862941862c8..10970632f0e4 100644 } static int __init +diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c +index 7359c3e80d63..ab4712cc9327 100644 +--- a/drivers/tty/vt/vt.c ++++ b/drivers/tty/vt/vt.c +@@ -3161,7 +3161,7 @@ static struct console vt_console_driver = { + .write = vt_console_print, + .device = vt_console_device, + .unblank = unblank_screen, +- .flags = CON_PRINTBUFFER, ++ .flags = CON_PRINTBUFFER|CON_MIGHT_SLEEP, + .index = -1, + }; + #endif diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c index df5184979b28..d4ad211dce7a 100644 --- a/drivers/virt/acrn/irqfd.c @@ -3192,7 +3060,7 @@ index 6a675652129b..7a972d144b54 100644 return fscache_object_congested(); } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c -index bc267832310c..3176913fae6c 100644 +index b4e565711045..5ef0c106fb9d 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -158,7 +158,7 @@ static int fuse_direntplus_link(struct file *file, @@ -3205,7 +3073,7 @@ index bc267832310c..3176913fae6c 100644 if (!o->nodeid) { /* diff --git a/fs/namei.c b/fs/namei.c -index 1946d9667790..d89890a17f1b 100644 +index 1f9d2187c765..49552c066ce5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1633,7 +1633,7 @@ static struct dentry *__lookup_slow(const struct qstr *name, @@ -3217,7 +3085,7 @@ index 1946d9667790..d89890a17f1b 100644 /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) -@@ -3194,7 +3194,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, +@@ -3192,7 +3192,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; @@ -3227,7 +3095,7 @@ index 1946d9667790..d89890a17f1b 100644 if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); diff --git a/fs/namespace.c b/fs/namespace.c -index db9936562011..8a9c40376d94 100644 +index d3d750635610..92bbf3d86e00 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -343,8 +343,24 @@ int __mnt_want_write(struct vfsmount *m) @@ -3258,10 +3126,10 @@ index db9936562011..8a9c40376d94 100644 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c -index 5b68c44848ca..85a1006e0a85 100644 +index 731d31015b6a..d7c2571391b7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c -@@ -636,7 +636,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, +@@ -638,7 +638,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); @@ -3270,7 +3138,7 @@ index 5b68c44848ca..85a1006e0a85 100644 struct dentry *dentry; struct dentry *alias; struct inode *inode; -@@ -1875,7 +1875,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, +@@ -1860,7 +1860,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode) { @@ -3302,7 +3170,7 @@ index 5fa11e1aca4c..984f26eb888c 100644 status = -EBUSY; spin_lock(&dentry->d_lock); diff --git a/fs/proc/base.c b/fs/proc/base.c -index 93f2479ef319..3e5c11507c91 100644 +index 24fd5e986cb7..a9a3dd989e3d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -96,6 +96,7 @@ @@ -3313,7 +3181,7 @@ index 93f2479ef319..3e5c11507c91 100644 #include <linux/cn_proc.h> #include <trace/events/oom.h> #include "internal.h" -@@ -2043,7 +2044,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, +@@ -2045,7 +2046,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, child = d_hash_and_lookup(dir, &qname); if (!child) { @@ -3348,54 +3216,102 @@ index eceeecf6a5bd..d3e2d81656e0 100644 void do_softirq_own_stack(void); #else static inline void do_softirq_own_stack(void) +diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h +index 2949d9ac7484..131b45dfec67 100644 +--- a/include/linux/blk-mq.h ++++ b/include/linux/blk-mq.h +@@ -752,6 +752,17 @@ static inline void blk_mq_set_request_complete(struct request *rq) + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + } + ++/* ++ * Complete the request directly instead of deferring it to softirq or ++ * completing it another CPU. Useful in preemptible instead of an interrupt. ++ */ ++static inline void blk_mq_complete_request_direct(struct request *rq, ++ void (*complete)(struct request *rq)) ++{ ++ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); ++ complete(rq); ++} ++ + void blk_mq_start_request(struct request *rq); + void blk_mq_end_request(struct request *rq, blk_status_t error); + void __blk_mq_end_request(struct request *rq, blk_status_t error); diff --git a/include/linux/console.h b/include/linux/console.h -index a97f277cfdfa..487a4266ab2c 100644 +index a97f277cfdfa..15432b6e11a4 100644 --- a/include/linux/console.h +++ b/include/linux/console.h -@@ -16,6 +16,13 @@ +@@ -16,6 +16,7 @@ #include <linux/atomic.h> #include <linux/types.h> -+#include <linux/printk.h> -+#include <linux/seqlock.h> -+ -+struct latched_seq { -+ seqcount_latch_t latch; -+ u64 val[2]; -+}; ++#include <linux/mutex.h> struct vc_data; struct console_font_op; -@@ -136,10 +143,12 @@ static inline int con_debug_leave(void) - #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ +@@ -133,13 +134,25 @@ static inline int con_debug_leave(void) + #define CON_CONSDEV (2) /* Preferred console, /dev/console */ + #define CON_ENABLED (4) + #define CON_BOOT (8) +-#define CON_ANYTIME (16) /* Safe to call when cpu is offline */ ++#define CON_ANYTIME (16) /* Safe to call before per-cpu resources ready */ #define CON_BRL (32) /* Used for a braille device */ #define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ -+#define CON_HANDOVER (128) /* Device was previously a boot console. */ ++#define CON_PAUSED (128) /* Sleep while console is locked */ ++#define CON_MIGHT_SLEEP (256) /* Can only be called from sleepable context */ ++ ++#ifdef CONFIG_HAVE_ATOMIC_CONSOLE ++struct console_atomic_data { ++ u64 seq; ++ char *text; ++ char *ext_text; ++ char *dropped_text; ++}; ++#endif struct console { char name[16]; void (*write)(struct console *, const char *, unsigned); -+ void (*write_atomic)(struct console *co, const char *s, unsigned int count); ++ void (*write_atomic)(struct console *, const char *, unsigned); int (*read)(struct console *, char *, unsigned); struct tty_driver *(*device)(struct console *, int *); void (*unblank)(void); -@@ -149,6 +158,16 @@ struct console { - short flags; - short index; +@@ -151,6 +164,26 @@ struct console { int cflag; -+#ifdef CONFIG_PRINTK -+ char sync_buf[CONSOLE_LOG_MAX]; -+ struct latched_seq printk_seq; -+ struct latched_seq printk_sync_seq; -+#ifdef CONFIG_HAVE_NMI -+ struct latched_seq printk_sync_nmi_seq; -+#endif -+#endif /* CONFIG_PRINTK */ -+ -+ struct task_struct *thread; uint ispeed; uint ospeed; ++ u64 seq; ++ atomic_long_t dropped; ++#ifdef CONFIG_HAVE_ATOMIC_CONSOLE ++ struct console_atomic_data *atomic_data; ++#endif ++ struct task_struct *thread; ++ ++ /* ++ * The per-console lock is used by printing kthreads to synchronize ++ * this console with callers of console_lock(). This is necessary in ++ * order to allow printing kthreads to run in parallel to each other, ++ * while each safely accessing their own @flags and synchronizing ++ * against direct printing via console_lock/console_unlock. ++ * ++ * Note: For synchronizing against direct printing via ++ * console_trylock/console_unlock, see the static global ++ * variable @console_lock_count. ++ */ ++ struct mutex lock; ++ void *data; + struct console *next; + }; +@@ -165,6 +198,7 @@ extern int console_set_on_cmdline; + extern struct console *early_console; + + enum con_flush_mode { ++ CONSOLE_ATOMIC_FLUSH_PENDING, + CONSOLE_FLUSH_PENDING, + CONSOLE_REPLAY_ALL, + }; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 9e23d33bb6f1..9f89d4887e35 100644 --- a/include/linux/dcache.h @@ -3439,56 +3355,45 @@ index 2e2b8d6140ed..71064a2c2caf 100644 ARCH_EXIT_TO_USER_MODE_WORK) /** -diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h -index ec2a47a81e42..8cd11a223260 100644 ---- a/include/linux/irq_work.h -+++ b/include/linux/irq_work.h -@@ -3,6 +3,7 @@ - #define _LINUX_IRQ_WORK_H - - #include <linux/smp_types.h> -+#include <linux/rcuwait.h> - - /* - * An entry can be in one of four states: -@@ -16,11 +17,13 @@ - struct irq_work { - struct __call_single_node node; - void (*func)(struct irq_work *); -+ struct rcuwait irqwait; - }; - - #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ - .node = { .u_flags = (_flags), }, \ - .func = (_func), \ -+ .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ - } +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index 1f22a30c0963..9c35024be942 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -554,6 +554,22 @@ extern void __raise_softirq_irqoff(unsigned int nr); + extern void raise_softirq_irqoff(unsigned int nr); + extern void raise_softirq(unsigned int nr); - #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) -@@ -46,6 +49,11 @@ static inline bool irq_work_is_busy(struct irq_work *work) - return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; - } - -+static inline bool irq_work_is_hard(struct irq_work *work) ++#ifdef CONFIG_PREEMPT_RT ++extern void raise_timer_softirq(void); ++extern void raise_hrtimer_softirq(void); ++ ++#else ++static inline void raise_timer_softirq(void) +{ -+ return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ; ++ raise_softirq(TIMER_SOFTIRQ); +} + - bool irq_work_queue(struct irq_work *work); - bool irq_work_queue_on(struct irq_work *work, int cpu); ++static inline void raise_hrtimer_softirq(void) ++{ ++ raise_softirq_irqoff(HRTIMER_SOFTIRQ); ++} ++#endif ++ + DECLARE_PER_CPU(struct task_struct *, ksoftirqd); + static inline struct task_struct *this_cpu_ksoftirqd(void) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h -index 59aea39785bf..ab70314af3d5 100644 +index 93d270ca0c56..a77584593f7d 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h -@@ -68,6 +68,7 @@ struct irq_desc { - unsigned int irqs_unhandled; - atomic_t threads_handled; - int threads_handled_last; -+ u64 random_ip; - raw_spinlock_t lock; - struct cpumask *percpu_enabled; - const struct cpumask *percpu_affinity; +@@ -160,6 +160,7 @@ static inline void generic_handle_irq_desc(struct irq_desc *desc) + + int handle_irq_desc(struct irq_desc *desc); + int generic_handle_irq(unsigned int irq); ++int generic_handle_irq_safe(unsigned int irq); + + #ifdef CONFIG_IRQ_DOMAIN + /* diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 600c10da321a..4b140938b03e 100644 --- a/include/linux/irqflags.h @@ -3530,125 +3435,52 @@ index 600c10da321a..4b140938b03e 100644 #if defined(CONFIG_IRQSOFF_TRACER) || \ defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); -diff --git a/include/linux/kernel.h b/include/linux/kernel.h -index 2776423a587e..e8696e4a45aa 100644 ---- a/include/linux/kernel.h -+++ b/include/linux/kernel.h -@@ -111,8 +111,8 @@ static __always_inline void might_resched(void) - #endif /* CONFIG_PREEMPT_* */ - - #ifdef CONFIG_DEBUG_ATOMIC_SLEEP --extern void ___might_sleep(const char *file, int line, int preempt_offset); --extern void __might_sleep(const char *file, int line, int preempt_offset); -+extern void __might_resched(const char *file, int line, unsigned int offsets); -+extern void __might_sleep(const char *file, int line); - extern void __cant_sleep(const char *file, int line, int preempt_offset); - extern void __cant_migrate(const char *file, int line); - -@@ -129,7 +129,7 @@ extern void __cant_migrate(const char *file, int line); - * supposed to. - */ - # define might_sleep() \ -- do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) -+ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) - /** - * cant_sleep - annotation for functions that cannot sleep - * -@@ -168,10 +168,9 @@ extern void __cant_migrate(const char *file, int line); - */ - # define non_block_end() WARN_ON(current->non_block_count-- == 0) - #else -- static inline void ___might_sleep(const char *file, int line, -- int preempt_offset) { } -- static inline void __might_sleep(const char *file, int line, -- int preempt_offset) { } -+ static inline void __might_resched(const char *file, int line, -+ unsigned int offsets) { } -+static inline void __might_sleep(const char *file, int line) { } - # define might_sleep() do { might_resched(); } while (0) - # define cant_sleep() do { } while (0) - # define cant_migrate() do { } while (0) -diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h -index 258cdde8d356..9bca0d98db5a 100644 ---- a/include/linux/kgdb.h -+++ b/include/linux/kgdb.h -@@ -212,6 +212,8 @@ extern void kgdb_call_nmi_hook(void *ignored); - */ - extern void kgdb_roundup_cpus(void); - -+extern void kgdb_roundup_cpu(unsigned int cpu); -+ - /** - * kgdb_arch_set_pc - Generic call back to the program counter - * @regs: Current &struct pt_regs. -@@ -365,5 +367,6 @@ extern void kgdb_free_init_mem(void); - #define dbg_late_init() - static inline void kgdb_panic(const char *msg) {} - static inline void kgdb_free_init_mem(void) { } -+static inline void kgdb_roundup_cpu(unsigned int cpu) {} - #endif /* ! CONFIG_KGDB */ - #endif /* _KGDB_H_ */ -diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 3a9a798a4ae1..3ea692eeb8d3 100644 ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -12,6 +12,7 @@ - #include <linux/completion.h> - #include <linux/cpumask.h> - #include <linux/uprobes.h> -+#include <linux/rcupdate.h> - #include <linux/page-flags-layout.h> - #include <linux/workqueue.h> - #include <linux/seqlock.h> -@@ -574,6 +575,9 @@ struct mm_struct { - bool tlb_flush_batched; - #endif - struct uprobes_state uprobes_state; -+#ifdef CONFIG_PREEMPT_RT -+ struct rcu_head delayed_drop; -+#endif - #ifdef CONFIG_HUGETLB_PAGE - atomic_long_t hugetlb_usage; - #endif +diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h +index 975e33b793a7..6d635e8306d6 100644 +--- a/include/linux/local_lock_internal.h ++++ b/include/linux/local_lock_internal.h +@@ -44,9 +44,9 @@ static inline void local_lock_debug_init(local_lock_t *l) + } + #else /* CONFIG_DEBUG_LOCK_ALLOC */ + # define LOCAL_LOCK_DEBUG_INIT(lockname) +-static inline void local_lock_acquire(local_lock_t *l) { } +-static inline void local_lock_release(local_lock_t *l) { } +-static inline void local_lock_debug_init(local_lock_t *l) { } ++# define local_lock_acquire(__ll) do { typecheck(local_lock_t *, __ll); } while (0) ++# define local_lock_release(__ll) do { typecheck(local_lock_t *, __ll); } while (0) ++# define local_lock_debug_init(__ll) do { typecheck(local_lock_t *, __ll); } while (0) + #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + + #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h -index ce81cc96a98d..4230c0fe2dcb 100644 +index 6aadcc0ecb5b..4b041364ee2b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h -@@ -1916,7 +1916,6 @@ enum netdev_ml_priv_type { - * @sfp_bus: attached &struct sfp_bus structure. - * - * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock -- * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount - * - * @proto_down: protocol port state information can be sent to the - * switch driver and used to set the phys state of the -@@ -2250,7 +2249,6 @@ struct net_device { - struct phy_device *phydev; - struct sfp_bus *sfp_bus; - struct lock_class_key *qdisc_tx_busylock; -- struct lock_class_key *qdisc_running_key; - bool proto_down; - unsigned wol_enabled:1; - unsigned threaded:1; -@@ -2360,13 +2358,11 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, - #define netdev_lockdep_set_classes(dev) \ - { \ - static struct lock_class_key qdisc_tx_busylock_key; \ -- static struct lock_class_key qdisc_running_key; \ - static struct lock_class_key qdisc_xmit_lock_key; \ - static struct lock_class_key dev_addr_list_lock_key; \ - unsigned int i; \ - \ - (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ -- (dev)->qdisc_running_key = &qdisc_running_key; \ - lockdep_set_class(&(dev)->addr_list_lock, \ - &dev_addr_list_lock_key); \ - for (i = 0; i < (dev)->num_tx_queues; i++) \ +@@ -4003,8 +4003,17 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); + int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); + int netif_rx(struct sk_buff *skb); +-int netif_rx_ni(struct sk_buff *skb); +-int netif_rx_any_context(struct sk_buff *skb); ++ ++static inline int netif_rx_ni(struct sk_buff *skb) ++{ ++ return netif_rx(skb); ++} ++ ++static inline int netif_rx_any_context(struct sk_buff *skb) ++{ ++ return netif_rx(skb); ++} ++ + int netif_receive_skb(struct sk_buff *skb); + int netif_receive_skb_core(struct sk_buff *skb); + void netif_receive_skb_list(struct list_head *head); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h -index e9698b6278a5..1c8393c1280c 100644 +index 967a0098f0a9..57979c3dc4a7 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h -@@ -1692,7 +1692,7 @@ struct nfs_unlinkdata { +@@ -1684,7 +1684,7 @@ struct nfs_unlinkdata { struct nfs_removeargs args; struct nfs_removeres res; struct dentry *dentry; @@ -3658,23 +3490,10 @@ index e9698b6278a5..1c8393c1280c 100644 struct nfs_fattr dir_attr; long timeout; diff --git a/include/linux/preempt.h b/include/linux/preempt.h -index 4d244e295e85..3da73c968211 100644 +index b4381f255a5c..c05c5247986f 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h -@@ -122,9 +122,10 @@ - * The preempt_count offset after spin_lock() - */ - #if !defined(CONFIG_PREEMPT_RT) --#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET -+#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET - #else --#define PREEMPT_LOCK_OFFSET 0 -+/* Locks on RT do not disable preemption */ -+#define PREEMPT_LOCK_OFFSET 0 - #endif - - /* -@@ -174,6 +175,20 @@ extern void preempt_count_sub(int val); +@@ -196,6 +196,20 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) @@ -3695,7 +3514,7 @@ index 4d244e295e85..3da73c968211 100644 #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ -@@ -182,13 +197,25 @@ do { \ +@@ -204,13 +218,25 @@ do { \ barrier(); \ } while (0) @@ -3722,7 +3541,7 @@ index 4d244e295e85..3da73c968211 100644 #define preemptible() (preempt_count() == 0 && !irqs_disabled()) -@@ -213,6 +240,18 @@ do { \ +@@ -235,6 +261,18 @@ do { \ __preempt_schedule(); \ } while (0) @@ -3741,7 +3560,7 @@ index 4d244e295e85..3da73c968211 100644 #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ -@@ -220,6 +259,12 @@ do { \ +@@ -242,6 +280,12 @@ do { \ preempt_count_dec(); \ } while (0) @@ -3754,7 +3573,7 @@ index 4d244e295e85..3da73c968211 100644 #define preempt_enable_notrace() \ do { \ barrier(); \ -@@ -258,8 +303,12 @@ do { \ +@@ -280,8 +324,12 @@ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() @@ -3767,7 +3586,7 @@ index 4d244e295e85..3da73c968211 100644 #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE -@@ -278,7 +327,7 @@ do { \ +@@ -300,7 +348,7 @@ do { \ } while (0) #define preempt_fold_need_resched() \ do { \ @@ -3776,7 +3595,7 @@ index 4d244e295e85..3da73c968211 100644 set_preempt_need_resched(); \ } while (0) -@@ -394,8 +443,15 @@ extern void migrate_enable(void); +@@ -416,8 +464,15 @@ extern void migrate_enable(void); #else @@ -3795,174 +3614,113 @@ index 4d244e295e85..3da73c968211 100644 #endif /* CONFIG_SMP */ diff --git a/include/linux/printk.h b/include/linux/printk.h -index 9497f6b98339..f1b9cd8d11d6 100644 +index 9497f6b98339..6596f02d1f05 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h -@@ -47,6 +47,12 @@ static inline const char *printk_skip_headers(const char *buffer) +@@ -170,6 +170,8 @@ extern void __printk_safe_exit(void); + #define printk_deferred_enter __printk_safe_enter + #define printk_deferred_exit __printk_safe_exit - #define CONSOLE_EXT_LOG_MAX 8192 - -+/* -+ * The maximum size of a record formatted for console printing -+ * (i.e. with the prefix prepended to every line). -+ */ -+#define CONSOLE_LOG_MAX 1024 ++extern bool pr_flush(int timeout_ms, bool reset_on_progress); + - /* printk's without a loglevel use this.. */ - #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT - -@@ -155,20 +161,7 @@ int vprintk(const char *fmt, va_list args); - asmlinkage __printf(1, 2) __cold - int _printk(const char *fmt, ...); - --/* -- * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! -- */ --__printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); -- --extern void __printk_safe_enter(void); --extern void __printk_safe_exit(void); --/* -- * The printk_deferred_enter/exit macros are available only as a hack for -- * some code paths that need to defer all printk console printing. Interrupts -- * must be disabled for the deferred duration. -- */ --#define printk_deferred_enter __printk_safe_enter --#define printk_deferred_exit __printk_safe_exit -+bool pr_flush(int timeout_ms, bool reset_on_progress); - /* * Please don't use printk_ratelimit(), because it shares ratelimiting state -@@ -210,18 +203,10 @@ int _printk(const char *s, ...) + * with all other unrelated printk_ratelimit() callsites. Instead use +@@ -224,6 +226,11 @@ static inline void printk_deferred_exit(void) { - return 0; } --static inline __printf(1, 2) __cold --int _printk_deferred(const char *s, ...) --{ -- return 0; --} -- --static inline void printk_deferred_enter(void) --{ --} --static inline void printk_deferred_exit(void) +static inline bool pr_flush(int timeout_ms, bool reset_on_progress) - { ++{ + return true; - } - ++} ++ static inline int printk_ratelimit(void) -@@ -284,17 +269,30 @@ static inline void printk_trigger_flush(void) - extern int __printk_cpu_trylock(void); - extern void __printk_wait_on_cpu_lock(void); - extern void __printk_cpu_unlock(void); -+extern bool kgdb_roundup_delay(unsigned int cpu); + { + return 0; +@@ -281,45 +288,45 @@ static inline void printk_trigger_flush(void) + #endif + + #ifdef CONFIG_SMP +-extern int __printk_cpu_trylock(void); +-extern void __printk_wait_on_cpu_lock(void); +-extern void __printk_cpu_unlock(void); ++extern int __printk_cpu_sync_try_get(void); ++extern void __printk_cpu_sync_wait(void); ++extern void __printk_cpu_sync_put(void); + +#else + -+#define __printk_cpu_trylock() 1 -+#define __printk_wait_on_cpu_lock() -+#define __printk_cpu_unlock() -+ -+static inline bool kgdb_roundup_delay(unsigned int cpu) -+{ -+ return false; -+} ++#define __printk_cpu_sync_try_get() true ++#define __printk_cpu_sync_wait() ++#define __printk_cpu_sync_put() +#endif /* CONFIG_SMP */ /** - * printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning - * lock and disable interrupts. -+ * raw_printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning -+ * lock and disable interrupts. ++ * printk_cpu_sync_get_irqsave() - Disable interrupts and acquire the printk ++ * cpu-reentrant spinning lock. * @flags: Stack-allocated storage for saving local interrupt state, - * to be passed to printk_cpu_unlock_irqrestore(). -+ * to be passed to raw_printk_cpu_unlock_irqrestore(). ++ * to be passed to printk_cpu_sync_put_irqrestore(). * * If the lock is owned by another CPU, spin until it becomes available. * Interrupts are restored while spinning. */ -#define printk_cpu_lock_irqsave(flags) \ -+#define raw_printk_cpu_lock_irqsave(flags) \ - for (;;) { \ - local_irq_save(flags); \ - if (__printk_cpu_trylock()) \ -@@ -304,22 +302,30 @@ extern void __printk_cpu_unlock(void); +- for (;;) { \ +- local_irq_save(flags); \ +- if (__printk_cpu_trylock()) \ +- break; \ +- local_irq_restore(flags); \ +- __printk_wait_on_cpu_lock(); \ ++#define printk_cpu_sync_get_irqsave(flags) \ ++ for (;;) { \ ++ local_irq_save(flags); \ ++ if (__printk_cpu_sync_try_get()) \ ++ break; \ ++ local_irq_restore(flags); \ ++ __printk_cpu_sync_wait(); \ } /** - * printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning - * lock and restore interrupts. - * @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave(). -+ * raw_printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant -+ * spinning lock and restore interrupts. -+ * @flags: Caller's saved interrupt state from raw_printk_cpu_lock_irqsave(). ++ * printk_cpu_sync_put_irqrestore() - Release the printk cpu-reentrant spinning ++ * lock and restore interrupts. ++ * @flags: Caller's saved interrupt state, from printk_cpu_sync_get_irqsave(). */ -#define printk_cpu_unlock_irqrestore(flags) \ -+#define raw_printk_cpu_unlock_irqrestore(flags) \ ++#define printk_cpu_sync_put_irqrestore(flags) \ do { \ - __printk_cpu_unlock(); \ +- __printk_cpu_unlock(); \ ++ __printk_cpu_sync_put(); \ local_irq_restore(flags); \ - } while (0) \ - -#else -+ } while (0) - +- -#define printk_cpu_lock_irqsave(flags) ((void)flags) -#define printk_cpu_unlock_irqrestore(flags) ((void)flags) -+/* -+ * Used to synchronize atomic consoles. -+ * -+ * The same as raw_printk_cpu_lock_irqsave() except that hardware interrupts -+ * are _not_ restored while spinning. -+ */ -+#define console_atomic_lock(flags) \ -+ do { \ -+ local_irq_save(flags); \ -+ while (!__printk_cpu_trylock()) \ -+ cpu_relax(); \ -+ } while (0) - +- -#endif /* CONFIG_SMP */ -+#define console_atomic_unlock raw_printk_cpu_unlock_irqrestore ++ } while (0) extern int kptr_restrict; -@@ -448,8 +454,6 @@ struct pi_entry { - * See the vsnprintf() documentation for format string extensions over C99. - */ - #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__) --#define printk_deferred(fmt, ...) \ -- printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__) - - /** - * pr_emerg - Print an emergency-level message -@@ -587,13 +591,9 @@ struct pi_entry { - #ifdef CONFIG_PRINTK - #define printk_once(fmt, ...) \ - DO_ONCE_LITE(printk, fmt, ##__VA_ARGS__) --#define printk_deferred_once(fmt, ...) \ -- DO_ONCE_LITE(printk_deferred, fmt, ##__VA_ARGS__) - #else - #define printk_once(fmt, ...) \ - no_printk(fmt, ##__VA_ARGS__) --#define printk_deferred_once(fmt, ...) \ -- no_printk(fmt, ##__VA_ARGS__) - #endif - - #define pr_emerg_once(fmt, ...) \ diff --git a/include/linux/random.h b/include/linux/random.h -index f45b8be3e3c4..0e41d0527809 100644 +index f45b8be3e3c4..a02c285a5ee5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h -@@ -35,7 +35,7 @@ static inline void add_latent_entropy(void) {} +@@ -35,7 +35,8 @@ static inline void add_latent_entropy(void) {} extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; -+extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy; ++extern void add_interrupt_randomness(int irq) __latent_entropy; ++extern void process_interrupt_randomness(void); extern void get_random_bytes(void *buf, int nbytes); extern int wait_for_random_bytes(void); @@ -3980,10 +3738,10 @@ index b676aa419eef..c21c7f8103e2 100644 #define DEFAULT_RATELIMIT_INTERVAL (5 * HZ) #define DEFAULT_RATELIMIT_BURST 10 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h -index 434d12fe2d4f..de6d1a21f113 100644 +index 5e0beb5c5659..3c61f246966d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h -@@ -94,6 +94,13 @@ void rcu_init_tasks_generic(void); +@@ -95,6 +95,13 @@ void rcu_init_tasks_generic(void); static inline void rcu_init_tasks_generic(void) { } #endif @@ -4024,8 +3782,79 @@ index 9deedfeec2b1..7d049883a08a 100644 extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); +diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h +index 2c0ad417ce3c..8f416c5e929e 100644 +--- a/include/linux/rwlock.h ++++ b/include/linux/rwlock.h +@@ -55,6 +55,12 @@ do { \ + #define write_lock(lock) _raw_write_lock(lock) + #define read_lock(lock) _raw_read_lock(lock) + ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++#define write_lock_nested(lock, subclass) _raw_write_lock_nested(lock, subclass) ++#else ++#define write_lock_nested(lock, subclass) _raw_write_lock(lock) ++#endif ++ + #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + + #define read_lock_irqsave(lock, flags) \ +diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h +index f1db6f17c4fb..dceb0a59b692 100644 +--- a/include/linux/rwlock_api_smp.h ++++ b/include/linux/rwlock_api_smp.h +@@ -17,6 +17,7 @@ + + void __lockfunc _raw_read_lock(rwlock_t *lock) __acquires(lock); + void __lockfunc _raw_write_lock(rwlock_t *lock) __acquires(lock); ++void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) __acquires(lock); + void __lockfunc _raw_read_lock_bh(rwlock_t *lock) __acquires(lock); + void __lockfunc _raw_write_lock_bh(rwlock_t *lock) __acquires(lock); + void __lockfunc _raw_read_lock_irq(rwlock_t *lock) __acquires(lock); +@@ -209,6 +210,13 @@ static inline void __raw_write_lock(rwlock_t *lock) + LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); + } + ++static inline void __raw_write_lock_nested(rwlock_t *lock, int subclass) ++{ ++ preempt_disable(); ++ rwlock_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); ++} ++ + #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */ + + static inline void __raw_write_unlock(rwlock_t *lock) +diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h +index 49c1f3842ed5..8544ff05e594 100644 +--- a/include/linux/rwlock_rt.h ++++ b/include/linux/rwlock_rt.h +@@ -28,6 +28,7 @@ extern void rt_read_lock(rwlock_t *rwlock); + extern int rt_read_trylock(rwlock_t *rwlock); + extern void rt_read_unlock(rwlock_t *rwlock); + extern void rt_write_lock(rwlock_t *rwlock); ++extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass); + extern int rt_write_trylock(rwlock_t *rwlock); + extern void rt_write_unlock(rwlock_t *rwlock); + +@@ -83,6 +84,15 @@ static __always_inline void write_lock(rwlock_t *rwlock) + rt_write_lock(rwlock); + } + ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++static __always_inline void write_lock_nested(rwlock_t *rwlock, int subclass) ++{ ++ rt_write_lock_nested(rwlock, subclass); ++} ++#else ++#define write_lock_nested(lock, subclass) rt_write_lock(((void)(subclass), (lock))) ++#endif ++ + static __always_inline void write_lock_bh(rwlock_t *rwlock) + { + local_bh_disable(); diff --git a/include/linux/sched.h b/include/linux/sched.h -index c1a927ddec64..4401d0f05cb3 100644 +index 78c351e35fec..971d20337ad3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -118,12 +118,8 @@ struct task_group; @@ -4041,7 +3870,7 @@ index c1a927ddec64..4401d0f05cb3 100644 /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). -@@ -1084,6 +1080,10 @@ struct task_struct { +@@ -1082,6 +1078,10 @@ struct task_struct { /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; @@ -4052,7 +3881,7 @@ index c1a927ddec64..4401d0f05cb3 100644 unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; -@@ -1730,6 +1730,16 @@ static __always_inline bool is_percpu_thread(void) +@@ -1727,6 +1727,16 @@ static __always_inline bool is_percpu_thread(void) #endif } @@ -4069,7 +3898,7 @@ index c1a927ddec64..4401d0f05cb3 100644 /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ -@@ -2005,6 +2015,118 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) +@@ -1999,6 +2009,118 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } @@ -4188,94 +4017,36 @@ index c1a927ddec64..4401d0f05cb3 100644 /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return -@@ -2039,7 +2161,7 @@ static inline int _cond_resched(void) { return 0; } - #endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ +diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h +index d10150587d81..ccd1336aa7f4 100644 +--- a/include/linux/sched/task_stack.h ++++ b/include/linux/sched/task_stack.h +@@ -70,6 +70,7 @@ static inline void *try_get_task_stack(struct task_struct *tsk) + } - #define cond_resched() ({ \ -- ___might_sleep(__FILE__, __LINE__, 0); \ -+ __might_resched(__FILE__, __LINE__, 0); \ - _cond_resched(); \ - }) + extern void put_task_stack(struct task_struct *tsk); ++extern void put_task_stack_sched(struct task_struct *tsk); + #else + static inline void *try_get_task_stack(struct task_struct *tsk) + { +@@ -77,8 +78,17 @@ static inline void *try_get_task_stack(struct task_struct *tsk) + } -@@ -2047,19 +2169,38 @@ extern int __cond_resched_lock(spinlock_t *lock); - extern int __cond_resched_rwlock_read(rwlock_t *lock); - extern int __cond_resched_rwlock_write(rwlock_t *lock); + static inline void put_task_stack(struct task_struct *tsk) {} ++static inline void put_task_stack_sched(struct task_struct *tsk) {} + #endif --#define cond_resched_lock(lock) ({ \ -- ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ -- __cond_resched_lock(lock); \ -+#define MIGHT_RESCHED_RCU_SHIFT 8 -+#define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) -+ -+#ifndef CONFIG_PREEMPT_RT -+/* -+ * Non RT kernels have an elevated preempt count due to the held lock, -+ * but are not allowed to be inside a RCU read side critical section -+ */ -+# define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET ++#ifdef CONFIG_ARCH_THREAD_STACK_ALLOCATOR ++static inline void task_stack_cleanup(struct task_struct *tsk) {} +#else -+/* -+ * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in -+ * cond_resched*lock() has to take that into account because it checks for -+ * preempt_count() and rcu_preempt_depth(). -+ */ -+# define PREEMPT_LOCK_RESCHED_OFFSETS \ -+ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) ++extern void task_stack_cleanup(struct task_struct *tsk); +#endif + -+#define cond_resched_lock(lock) ({ \ -+ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ -+ __cond_resched_lock(lock); \ - }) - --#define cond_resched_rwlock_read(lock) ({ \ -- __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ -- __cond_resched_rwlock_read(lock); \ -+#define cond_resched_rwlock_read(lock) ({ \ -+ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ -+ __cond_resched_rwlock_read(lock); \ - }) - --#define cond_resched_rwlock_write(lock) ({ \ -- __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ -- __cond_resched_rwlock_write(lock); \ -+#define cond_resched_rwlock_write(lock) ({ \ -+ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ -+ __cond_resched_rwlock_write(lock); \ - }) - - static inline void cond_resched_rcu(void) -diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h -index 5561486fddef..8358352428d4 100644 ---- a/include/linux/sched/mm.h -+++ b/include/linux/sched/mm.h -@@ -49,6 +49,26 @@ static inline void mmdrop(struct mm_struct *mm) - __mmdrop(mm); - } - -+#ifdef CONFIG_PREEMPT_RT -+extern void __mmdrop_delayed(struct rcu_head *rhp); -+ -+/* -+ * Invoked from finish_task_switch(). Delegates the heavy lifting on RT -+ * kernels via RCU. -+ */ -+static inline void mmdrop_sched(struct mm_struct *mm) -+{ -+ /* Provides a full memory barrier. See mmdrop() */ -+ if (atomic_dec_and_test(&mm->mm_count)) -+ call_rcu(&mm->delayed_drop, __mmdrop_delayed); -+} -+#else -+static inline void mmdrop_sched(struct mm_struct *mm) -+{ -+ mmdrop(mm); -+} -+#endif ++void exit_task_stack_account(struct task_struct *tsk); + - /** - * mmget() - Pin the address space associated with a &struct mm_struct. - * @mm: The address space to pin. + #define task_stack_end_corrupted(task) \ + (*(end_of_stack(task)) != STACK_END_MAGIC) + diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 5db211f43b29..aa011f668705 100644 --- a/include/linux/serial_8250.h @@ -4306,36 +4077,11 @@ index 5db211f43b29..aa011f668705 100644 int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); -diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h -index b8c273af2910..a66f6ddbdd56 100644 ---- a/include/linux/skbuff.h -+++ b/include/linux/skbuff.h -@@ -297,6 +297,7 @@ struct sk_buff_head { - - __u32 qlen; - spinlock_t lock; -+ raw_spinlock_t raw_lock; - }; - - struct sk_buff; -@@ -1932,6 +1933,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) - __skb_queue_head_init(list); - } - -+static inline void skb_queue_head_init_raw(struct sk_buff_head *list) -+{ -+ raw_spin_lock_init(&list->raw_lock); -+ __skb_queue_head_init(list); -+} -+ - static inline void skb_queue_head_init_class(struct sk_buff_head *list, - struct lock_class_key *class) - { diff --git a/include/linux/smp.h b/include/linux/smp.h -index 510519e8a1eb..7ac9fdb5ad09 100644 +index a80ab58ae3f1..dd3441d8af44 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h -@@ -268,6 +268,9 @@ static inline int get_boot_cpu_id(void) +@@ -267,6 +267,9 @@ static inline int get_boot_cpu_id(void) #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() @@ -4345,6 +4091,18 @@ index 510519e8a1eb..7ac9fdb5ad09 100644 /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: +diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h +index d0d188861ad6..b8ba00ccccde 100644 +--- a/include/linux/spinlock_api_up.h ++++ b/include/linux/spinlock_api_up.h +@@ -59,6 +59,7 @@ + #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) + #define _raw_read_lock(lock) __LOCK(lock) + #define _raw_write_lock(lock) __LOCK(lock) ++#define _raw_write_lock_nested(lock, subclass) __LOCK(lock) + #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) + #define _raw_read_lock_bh(lock) __LOCK_BH(lock) + #define _raw_write_lock_bh(lock) __LOCK_BH(lock) diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h index c09b6407ae1b..7f86a2016ac5 100644 --- a/include/linux/spinlock_types_up.h @@ -4358,38 +4116,8 @@ index c09b6407ae1b..7f86a2016ac5 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/suspend.h b/include/linux/suspend.h -index 8af13ba60c7e..79b6933ef8a0 100644 ---- a/include/linux/suspend.h -+++ b/include/linux/suspend.h -@@ -550,23 +550,17 @@ static inline void unlock_system_sleep(void) {} - #ifdef CONFIG_PM_SLEEP_DEBUG - extern bool pm_print_times_enabled; - extern bool pm_debug_messages_on; --extern __printf(2, 3) void __pm_pr_dbg(bool defer, const char *fmt, ...); -+extern __printf(1, 2) void pm_pr_dbg(const char *fmt, ...); - #else - #define pm_print_times_enabled (false) - #define pm_debug_messages_on (false) - - #include <linux/printk.h> - --#define __pm_pr_dbg(defer, fmt, ...) \ -+#define pm_pr_dbg(fmt, ...) \ - no_printk(KERN_DEBUG fmt, ##__VA_ARGS__) - #endif - --#define pm_pr_dbg(fmt, ...) \ -- __pm_pr_dbg(false, fmt, ##__VA_ARGS__) -- --#define pm_deferred_pr_dbg(fmt, ...) \ -- __pm_pr_dbg(true, fmt, ##__VA_ARGS__) -- - #ifdef CONFIG_PM_AUTOSLEEP - - /* kernel/power/autosleep.c */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h -index 0999f6317978..7af834b7c114 100644 +index ad0c4e041030..3033c8f05298 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -163,7 +163,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) @@ -4412,7 +4140,7 @@ index 0999f6317978..7af834b7c114 100644 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h -index 57113190448c..827725f41149 100644 +index 2d167ac3452c..3f80b9da186e 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -69,6 +69,7 @@ struct trace_entry { @@ -4444,7 +4172,7 @@ index 57113190448c..827725f41149 100644 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h -index e81856c0ba13..81dc1f5e181a 100644 +index e8ec116c916b..6ad4e9032d53 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -66,7 +66,7 @@ @@ -4452,35 +4180,11 @@ index e81856c0ba13..81dc1f5e181a 100644 struct u64_stats_sync { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) -+#if BITS_PER_LONG==32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) seqcount_t seq; #endif }; -@@ -83,6 +83,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p) - return local64_read(&p->v); - } - -+static inline void u64_stats_set(u64_stats_t *p, u64 val) -+{ -+ local64_set(&p->v, val); -+} -+ - static inline void u64_stats_add(u64_stats_t *p, unsigned long val) - { - local64_add(val, &p->v); -@@ -104,6 +109,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p) - return p->v; - } - -+static inline void u64_stats_set(u64_stats_t *p, u64 val) -+{ -+ p->v = val; -+} -+ - static inline void u64_stats_add(u64_stats_t *p, unsigned long val) - { - p->v += val; -@@ -115,7 +125,7 @@ static inline void u64_stats_inc(u64_stats_t *p) +@@ -125,7 +125,7 @@ static inline void u64_stats_inc(u64_stats_t *p) } #endif @@ -4489,7 +4193,7 @@ index e81856c0ba13..81dc1f5e181a 100644 #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) #else static inline void u64_stats_init(struct u64_stats_sync *syncp) -@@ -125,15 +135,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp) +@@ -135,15 +135,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp) static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { @@ -4511,7 +4215,7 @@ index e81856c0ba13..81dc1f5e181a 100644 #endif } -@@ -142,8 +156,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +@@ -152,8 +156,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) { unsigned long flags = 0; @@ -4525,7 +4229,7 @@ index e81856c0ba13..81dc1f5e181a 100644 write_seqcount_begin(&syncp->seq); #endif return flags; -@@ -153,15 +170,18 @@ static inline void +@@ -163,15 +170,18 @@ static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, unsigned long flags) { @@ -4547,7 +4251,7 @@ index e81856c0ba13..81dc1f5e181a 100644 return read_seqcount_begin(&syncp->seq); #else return 0; -@@ -170,7 +190,7 @@ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync * +@@ -180,7 +190,7 @@ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync * static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { @@ -4556,7 +4260,7 @@ index e81856c0ba13..81dc1f5e181a 100644 preempt_disable(); #endif return __u64_stats_fetch_begin(syncp); -@@ -179,7 +199,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy +@@ -189,7 +199,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { @@ -4565,7 +4269,7 @@ index e81856c0ba13..81dc1f5e181a 100644 return read_seqcount_retry(&syncp->seq, start); #else return false; -@@ -189,7 +209,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +@@ -199,7 +209,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { @@ -4574,7 +4278,7 @@ index e81856c0ba13..81dc1f5e181a 100644 preempt_enable(); #endif return __u64_stats_fetch_retry(syncp, start); -@@ -203,7 +223,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +@@ -213,7 +223,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { @@ -4585,7 +4289,7 @@ index e81856c0ba13..81dc1f5e181a 100644 local_irq_disable(); #endif return __u64_stats_fetch_begin(syncp); -@@ -212,7 +234,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync +@@ -222,7 +234,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { @@ -4596,400 +4300,58 @@ index e81856c0ba13..81dc1f5e181a 100644 local_irq_enable(); #endif return __u64_stats_fetch_retry(syncp, start); -diff --git a/include/net/act_api.h b/include/net/act_api.h -index f19f7f4a463c..b5b624c7e488 100644 ---- a/include/net/act_api.h -+++ b/include/net/act_api.h -@@ -30,13 +30,13 @@ struct tc_action { - atomic_t tcfa_bindcnt; - int tcfa_action; - struct tcf_t tcfa_tm; -- struct gnet_stats_basic_packed tcfa_bstats; -- struct gnet_stats_basic_packed tcfa_bstats_hw; -+ struct gnet_stats_basic_sync tcfa_bstats; -+ struct gnet_stats_basic_sync tcfa_bstats_hw; - struct gnet_stats_queue tcfa_qstats; - struct net_rate_estimator __rcu *tcfa_rate_est; - spinlock_t tcfa_lock; -- struct gnet_stats_basic_cpu __percpu *cpu_bstats; -- struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw; -+ struct gnet_stats_basic_sync __percpu *cpu_bstats; -+ struct gnet_stats_basic_sync __percpu *cpu_bstats_hw; - struct gnet_stats_queue __percpu *cpu_qstats; - struct tc_cookie __rcu *act_cookie; - struct tcf_chain __rcu *goto_chain; -@@ -206,7 +206,7 @@ static inline void tcf_action_update_bstats(struct tc_action *a, - struct sk_buff *skb) - { - if (likely(a->cpu_bstats)) { -- bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(a->cpu_bstats), skb); - return; - } - spin_lock(&a->tcfa_lock); -diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h -index 1424e02cef90..7aa2b8e1fb29 100644 ---- a/include/net/gen_stats.h -+++ b/include/net/gen_stats.h -@@ -7,14 +7,17 @@ - #include <linux/rtnetlink.h> - #include <linux/pkt_sched.h> - --/* Note: this used to be in include/uapi/linux/gen_stats.h */ --struct gnet_stats_basic_packed { -- __u64 bytes; -- __u64 packets; --}; -- --struct gnet_stats_basic_cpu { -- struct gnet_stats_basic_packed bstats; -+/* Throughput stats. -+ * Must be initialized beforehand with gnet_stats_basic_sync_init(). -+ * -+ * If no reads can ever occur parallel to writes (e.g. stack-allocated -+ * bstats), then the internal stat values can be written to and read -+ * from directly. Otherwise, use _bstats_set/update() for writes and -+ * gnet_stats_add_basic() for reads. -+ */ -+struct gnet_stats_basic_sync { -+ u64_stats_t bytes; -+ u64_stats_t packets; - struct u64_stats_sync syncp; - } __aligned(2 * sizeof(u64)); - -@@ -34,6 +37,7 @@ struct gnet_dump { - struct tc_stats tc_stats; - }; - -+void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b); - int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, - struct gnet_dump *d, int padattr); - -@@ -42,41 +46,38 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, - spinlock_t *lock, struct gnet_dump *d, - int padattr); - --int gnet_stats_copy_basic(const seqcount_t *running, -- struct gnet_dump *d, -- struct gnet_stats_basic_cpu __percpu *cpu, -- struct gnet_stats_basic_packed *b); --void __gnet_stats_copy_basic(const seqcount_t *running, -- struct gnet_stats_basic_packed *bstats, -- struct gnet_stats_basic_cpu __percpu *cpu, -- struct gnet_stats_basic_packed *b); --int gnet_stats_copy_basic_hw(const seqcount_t *running, -- struct gnet_dump *d, -- struct gnet_stats_basic_cpu __percpu *cpu, -- struct gnet_stats_basic_packed *b); -+int gnet_stats_copy_basic(struct gnet_dump *d, -+ struct gnet_stats_basic_sync __percpu *cpu, -+ struct gnet_stats_basic_sync *b, bool running); -+void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, -+ struct gnet_stats_basic_sync __percpu *cpu, -+ struct gnet_stats_basic_sync *b, bool running); -+int gnet_stats_copy_basic_hw(struct gnet_dump *d, -+ struct gnet_stats_basic_sync __percpu *cpu, -+ struct gnet_stats_basic_sync *b, bool running); - int gnet_stats_copy_rate_est(struct gnet_dump *d, - struct net_rate_estimator __rcu **ptr); - int gnet_stats_copy_queue(struct gnet_dump *d, - struct gnet_stats_queue __percpu *cpu_q, - struct gnet_stats_queue *q, __u32 qlen); --void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, -- const struct gnet_stats_queue __percpu *cpu_q, -- const struct gnet_stats_queue *q, __u32 qlen); -+void gnet_stats_add_queue(struct gnet_stats_queue *qstats, -+ const struct gnet_stats_queue __percpu *cpu_q, -+ const struct gnet_stats_queue *q); - int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); - - int gnet_stats_finish_copy(struct gnet_dump *d); - --int gen_new_estimator(struct gnet_stats_basic_packed *bstats, -- struct gnet_stats_basic_cpu __percpu *cpu_bstats, -+int gen_new_estimator(struct gnet_stats_basic_sync *bstats, -+ struct gnet_stats_basic_sync __percpu *cpu_bstats, - struct net_rate_estimator __rcu **rate_est, - spinlock_t *lock, -- seqcount_t *running, struct nlattr *opt); -+ bool running, struct nlattr *opt); - void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); --int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, -- struct gnet_stats_basic_cpu __percpu *cpu_bstats, -+int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, -+ struct gnet_stats_basic_sync __percpu *cpu_bstats, - struct net_rate_estimator __rcu **ptr, - spinlock_t *lock, -- seqcount_t *running, struct nlattr *opt); -+ bool running, struct nlattr *opt); - bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); - bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, - struct gnet_stats_rate_est64 *sample); -diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h -index 832ab69efda5..4c3809e141f4 100644 ---- a/include/net/netfilter/xt_rateest.h -+++ b/include/net/netfilter/xt_rateest.h -@@ -6,7 +6,7 @@ - - struct xt_rateest { - /* keep lock and bstats on same cache line to speedup xt_rateest_tg() */ -- struct gnet_stats_basic_packed bstats; -+ struct gnet_stats_basic_sync bstats; - spinlock_t lock; - - -diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h -index 83a6d0792180..4a5833108083 100644 ---- a/include/net/pkt_cls.h -+++ b/include/net/pkt_cls.h -@@ -765,7 +765,7 @@ struct tc_cookie { - }; - - struct tc_qopt_offload_stats { -- struct gnet_stats_basic_packed *bstats; -+ struct gnet_stats_basic_sync *bstats; - struct gnet_stats_queue *qstats; - }; - -@@ -885,7 +885,7 @@ struct tc_gred_qopt_offload_params { - }; - - struct tc_gred_qopt_offload_stats { -- struct gnet_stats_basic_packed bstats[MAX_DPs]; -+ struct gnet_stats_basic_sync bstats[MAX_DPs]; - struct gnet_stats_queue qstats[MAX_DPs]; - struct red_stats *xstats[MAX_DPs]; - }; -diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h -index 8c2d611639fc..73c76ffdf803 100644 ---- a/include/net/sch_generic.h -+++ b/include/net/sch_generic.h -@@ -40,6 +40,13 @@ enum qdisc_state_t { - __QDISC_STATE_DRAINING, - }; - -+enum qdisc_state2_t { -+ /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. -+ * Use qdisc_run_begin/end() or qdisc_is_running() instead. -+ */ -+ __QDISC_STATE2_RUNNING, -+}; -+ - #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) - #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) - -@@ -97,7 +104,7 @@ struct Qdisc { - struct netdev_queue *dev_queue; - - struct net_rate_estimator __rcu *rate_est; -- struct gnet_stats_basic_cpu __percpu *cpu_bstats; -+ struct gnet_stats_basic_sync __percpu *cpu_bstats; - struct gnet_stats_queue __percpu *cpu_qstats; - int pad; - refcount_t refcnt; -@@ -107,10 +114,10 @@ struct Qdisc { - */ - struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; - struct qdisc_skb_head q; -- struct gnet_stats_basic_packed bstats; -- seqcount_t running; -+ struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - unsigned long state; -+ unsigned long state2; /* must be written under qdisc spinlock */ - struct Qdisc *next_sched; - struct sk_buff_head skb_bad_txq; - -@@ -143,11 +150,15 @@ static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) - return NULL; - } - -+/* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc -+ * root_lock section, or provide their own memory barriers -- ordering -+ * against qdisc_run_begin/end() atomic bit operations. -+ */ - static inline bool qdisc_is_running(struct Qdisc *qdisc) - { - if (qdisc->flags & TCQ_F_NOLOCK) - return spin_is_locked(&qdisc->seqlock); -- return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; -+ return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); - } - - static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) -@@ -167,6 +178,9 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc) - return !READ_ONCE(qdisc->q.qlen); - } - -+/* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with -+ * the qdisc root lock acquired. -+ */ - static inline bool qdisc_run_begin(struct Qdisc *qdisc) - { - if (qdisc->flags & TCQ_F_NOLOCK) { -@@ -206,15 +220,8 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) - * after it releases the lock at the end of qdisc_run_end(). - */ - return spin_trylock(&qdisc->seqlock); -- } else if (qdisc_is_running(qdisc)) { -- return false; - } -- /* Variant of write_seqcount_begin() telling lockdep a trylock -- * was attempted. -- */ -- raw_write_seqcount_begin(&qdisc->running); -- seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); -- return true; -+ return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); - } - - static inline void qdisc_run_end(struct Qdisc *qdisc) -@@ -226,7 +233,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) - &qdisc->state))) - __netif_schedule(qdisc); - } else { -- write_seqcount_end(&qdisc->running); -+ __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); - } - } - -@@ -592,14 +599,6 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) - return qdisc_lock(root); - } +diff --git a/include/trace/events/net.h b/include/trace/events/net.h +index 78c448c6ab4c..032b431b987b 100644 +--- a/include/trace/events/net.h ++++ b/include/trace/events/net.h +@@ -260,13 +260,6 @@ DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, + TP_ARGS(skb) + ); --static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) --{ -- struct Qdisc *root = qdisc_root_sleeping(qdisc); +-DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_ni_entry, - -- ASSERT_RTNL(); -- return &root->running; --} +- TP_PROTO(const struct sk_buff *skb), - - static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) - { - return qdisc->dev_queue->dev; -@@ -849,14 +848,16 @@ static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, - return sch->enqueue(skb, sch, to_free); - } - --static inline void _bstats_update(struct gnet_stats_basic_packed *bstats, -+static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, - __u64 bytes, __u32 packets) - { -- bstats->bytes += bytes; -- bstats->packets += packets; -+ u64_stats_update_begin(&bstats->syncp); -+ u64_stats_add(&bstats->bytes, bytes); -+ u64_stats_add(&bstats->packets, packets); -+ u64_stats_update_end(&bstats->syncp); - } +- TP_ARGS(skb) +-); +- + DECLARE_EVENT_CLASS(net_dev_rx_exit_template, --static inline void bstats_update(struct gnet_stats_basic_packed *bstats, -+static inline void bstats_update(struct gnet_stats_basic_sync *bstats, - const struct sk_buff *skb) - { - _bstats_update(bstats, -@@ -864,26 +865,10 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats, - skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); - } + TP_PROTO(int ret), +@@ -312,13 +305,6 @@ DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, + TP_ARGS(ret) + ); --static inline void _bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, -- __u64 bytes, __u32 packets) --{ -- u64_stats_update_begin(&bstats->syncp); -- _bstats_update(&bstats->bstats, bytes, packets); -- u64_stats_update_end(&bstats->syncp); --} +-DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_ni_exit, - --static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, -- const struct sk_buff *skb) --{ -- u64_stats_update_begin(&bstats->syncp); -- bstats_update(&bstats->bstats, skb); -- u64_stats_update_end(&bstats->syncp); --} +- TP_PROTO(int ret), - - static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, - const struct sk_buff *skb) - { -- bstats_cpu_update(this_cpu_ptr(sch->cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(sch->cpu_bstats), skb); - } - - static inline void qdisc_bstats_update(struct Qdisc *sch, -@@ -972,10 +957,9 @@ static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, - __u32 *backlog) - { - struct gnet_stats_queue qstats = { 0 }; -- __u32 len = qdisc_qlen_sum(sch); - -- __gnet_stats_copy_queue(&qstats, sch->cpu_qstats, &sch->qstats, len); -- *qlen = qstats.qlen; -+ gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats); -+ *qlen = qstats.qlen + qdisc_qlen(sch); - *backlog = qstats.backlog; - } - -@@ -1316,7 +1300,7 @@ void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); - struct mini_Qdisc { - struct tcf_proto *filter_list; - struct tcf_block *block; -- struct gnet_stats_basic_cpu __percpu *cpu_bstats; -+ struct gnet_stats_basic_sync __percpu *cpu_bstats; - struct gnet_stats_queue __percpu *cpu_qstats; - struct rcu_head rcu; - }; -@@ -1324,7 +1308,7 @@ struct mini_Qdisc { - static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, - const struct sk_buff *skb) - { -- bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb); - } +- TP_ARGS(ret) +-); +- + DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, - static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) + TP_PROTO(int ret), diff --git a/init/Kconfig b/init/Kconfig -index 11f8a845f259..0b8a65ae1d72 100644 +index 4b7bac10c72d..732dbb61ec7e 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -901,7 +901,7 @@ config NUMA_BALANCING - bool "Memory placement aware NUMA scheduler" - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY -- depends on SMP && NUMA && MIGRATION -+ depends on SMP && NUMA && MIGRATION && !PREEMPT_RT - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when -@@ -938,6 +938,7 @@ config PAGE_COUNTER - - config MEMCG - bool "Memory controller" -+ depends on !PREEMPT_RT - select PAGE_COUNTER - select EVENTFD - help -@@ -1896,6 +1897,7 @@ choice +@@ -1542,6 +1542,10 @@ config PRINTK + very difficult to diagnose system problems, saying N here is + strongly discouraged. - config SLAB - bool "SLAB" -+ depends on !PREEMPT_RT - select HAVE_HARDENED_USERCOPY_ALLOCATOR - help - The regular slab allocator that is established and known to work -@@ -1916,6 +1918,7 @@ config SLUB - config SLOB - depends on EXPERT - bool "SLOB (Simple Allocator)" -+ depends on !PREEMPT_RT - help - SLOB replaces the stock allocator with a drastically simpler - allocator. SLOB is generally more space efficient but ++config HAVE_ATOMIC_CONSOLE ++ bool ++ default n ++ + config BUG + bool "BUG() support" if EXPERT + default y diff --git a/init/main.c b/init/main.c -index bcd132d4e7bd..af4c7f963955 100644 +index bb984ed79de0..eb30d1f729e9 100644 --- a/init/main.c +++ b/init/main.c -@@ -1604,6 +1604,7 @@ static noinline void __init kernel_init_freeable(void) +@@ -1597,6 +1597,7 @@ static noinline void __init kernel_init_freeable(void) rcu_init_tasks_generic(); do_pre_smp_initcalls(); @@ -4998,7 +4360,7 @@ index bcd132d4e7bd..af4c7f963955 100644 smp_init(); diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index 5876e30c5740..5df0776264c2 100644 +index ce77f0265660..5d3e650cdf48 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,5 +1,11 @@ @@ -5010,9 +4372,9 @@ index 5876e30c5740..5df0776264c2 100644 +config PREEMPT_LAZY + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT + - choice - prompt "Preemption Model" - default PREEMPT_NONE + config PREEMPT_NONE_BUILD + bool + diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1486768f2318..bb3b805436c4 100644 --- a/kernel/cgroup/rstat.c @@ -5037,106 +4399,6 @@ index 1486768f2318..bb3b805436c4 100644 /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || -diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c -index da06a5553835..3e39636da842 100644 ---- a/kernel/debug/debug_core.c -+++ b/kernel/debug/debug_core.c -@@ -238,35 +238,42 @@ NOKPROBE_SYMBOL(kgdb_call_nmi_hook); - static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) = - CSD_INIT(kgdb_call_nmi_hook, NULL); - --void __weak kgdb_roundup_cpus(void) -+void __weak kgdb_roundup_cpu(unsigned int cpu) - { - call_single_data_t *csd; -+ int ret; -+ -+ csd = &per_cpu(kgdb_roundup_csd, cpu); -+ -+ /* -+ * If it didn't round up last time, don't try again -+ * since smp_call_function_single_async() will block. -+ * -+ * If rounding_up is false then we know that the -+ * previous call must have at least started and that -+ * means smp_call_function_single_async() won't block. -+ */ -+ if (kgdb_info[cpu].rounding_up) -+ return; -+ kgdb_info[cpu].rounding_up = true; -+ -+ ret = smp_call_function_single_async(cpu, csd); -+ if (ret) -+ kgdb_info[cpu].rounding_up = false; -+} -+NOKPROBE_SYMBOL(kgdb_roundup_cpu); -+ -+void __weak kgdb_roundup_cpus(void) -+{ - int this_cpu = raw_smp_processor_id(); - int cpu; -- int ret; - - for_each_online_cpu(cpu) { - /* No need to roundup ourselves */ - if (cpu == this_cpu) - continue; - -- csd = &per_cpu(kgdb_roundup_csd, cpu); -- -- /* -- * If it didn't round up last time, don't try again -- * since smp_call_function_single_async() will block. -- * -- * If rounding_up is false then we know that the -- * previous call must have at least started and that -- * means smp_call_function_single_async() won't block. -- */ -- if (kgdb_info[cpu].rounding_up) -- continue; -- kgdb_info[cpu].rounding_up = true; -- -- ret = smp_call_function_single_async(cpu, csd); -- if (ret) -- kgdb_info[cpu].rounding_up = false; -+ kgdb_roundup_cpu(cpu); - } - } - NOKPROBE_SYMBOL(kgdb_roundup_cpus); -diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c -index 6735ac36b718..539a2f0dc89d 100644 ---- a/kernel/debug/kdb/kdb_io.c -+++ b/kernel/debug/kdb/kdb_io.c -@@ -559,23 +559,17 @@ static void kdb_msg_write(const char *msg, int msg_len) - cp++; - } - -+ /* mirror output on atomic consoles */ - for_each_console(c) { - if (!(c->flags & CON_ENABLED)) - continue; - if (c == dbg_io_ops->cons) - continue; -- /* -- * Set oops_in_progress to encourage the console drivers to -- * disregard their internal spin locks: in the current calling -- * context the risk of deadlock is a bigger problem than risks -- * due to re-entering the console driver. We operate directly on -- * oops_in_progress rather than using bust_spinlocks() because -- * the calls bust_spinlocks() makes on exit are not appropriate -- * for this calling context. -- */ -- ++oops_in_progress; -- c->write(c, msg, msg_len); -- --oops_in_progress; -+ -+ if (!c->write_atomic) -+ continue; -+ c->write_atomic(c, msg, msg_len); -+ - touch_nmi_watchdog(); - } - } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index d5a61d565ad5..a9579f8bf4f0 100644 --- a/kernel/entry/common.c @@ -5170,531 +4432,570 @@ index d5a61d565ad5..a9579f8bf4f0 100644 } } diff --git a/kernel/exit.c b/kernel/exit.c -index 91a43e57a32e..1d099609568d 100644 +index f702a6a63686..383a56795e82 100644 --- a/kernel/exit.c +++ b/kernel/exit.c -@@ -64,6 +64,7 @@ - #include <linux/rcuwait.h> - #include <linux/compat.h> - #include <linux/io_uring.h> -+#include <linux/kprobes.h> - - #include <linux/uaccess.h> - #include <asm/unistd.h> -@@ -168,8 +169,14 @@ static void delayed_put_task_struct(struct rcu_head *rhp) - { - struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); - -+ kprobe_flush_task(tsk); +@@ -171,6 +171,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) + kprobe_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); -+ -+ /* RT enabled kernels delay freeing the VMAP'ed task stack */ -+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) -+ put_task_stack(tsk); -+ ++ task_stack_cleanup(tsk); put_task_struct(tsk); } +@@ -871,6 +872,7 @@ void __noreturn do_exit(long code) + put_page(tsk->task_frag.page); + + validate_creds_for_do_exit(tsk); ++ exit_task_stack_account(tsk); + + check_stack_usage(); + preempt_disable(); diff --git a/kernel/fork.c b/kernel/fork.c -index f3a9cd12011b..0de4abdd7059 100644 +index ae63cce182fd..3e587c69ed26 100644 --- a/kernel/fork.c +++ b/kernel/fork.c -@@ -289,7 +289,10 @@ static inline void free_thread_stack(struct task_struct *tsk) - return; - } +@@ -178,13 +178,23 @@ static inline void free_task_struct(struct task_struct *tsk) -- vfree_atomic(tsk->stack); -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) -+ vfree_atomic(tsk->stack); -+ else -+ vfree(tsk->stack); - return; - } - #endif -@@ -705,6 +708,19 @@ void __mmdrop(struct mm_struct *mm) - } - EXPORT_SYMBOL_GPL(__mmdrop); + #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR -+#ifdef CONFIG_PREEMPT_RT -+/* -+ * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is -+ * by far the least expensive way to do that. -+ */ -+void __mmdrop_delayed(struct rcu_head *rhp) ++#define THREAD_STACK_DELAYED_FREE 1UL ++ ++static void thread_stack_mark_delayed_free(struct task_struct *tsk) +{ -+ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); ++ unsigned long val = (unsigned long)tsk->stack; + -+ __mmdrop(mm); ++ val |= THREAD_STACK_DELAYED_FREE; ++ WRITE_ONCE(tsk->stack, (void *)val); +} -+#endif + - static void mmdrop_async_fn(struct work_struct *work) - { - struct mm_struct *mm; -diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c -index 221d80c31e94..1543934f26d2 100644 ---- a/kernel/irq/handle.c -+++ b/kernel/irq/handle.c -@@ -190,12 +190,18 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags - - irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) - { -- irqreturn_t retval; -+ struct pt_regs *regs = get_irq_regs(); -+ u64 ip = regs ? instruction_pointer(regs) : 0; - unsigned int flags = 0; -+ irqreturn_t retval; - - retval = __handle_irq_event_percpu(desc, &flags); + /* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ + # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) -- add_interrupt_randomness(desc->irq_data.irq, flags); -+#ifdef CONFIG_PREEMPT_RT -+ desc->random_ip = ip; -+#else -+ add_interrupt_randomness(desc->irq_data.irq, flags, ip); -+#endif +-#ifdef CONFIG_VMAP_STACK ++# ifdef CONFIG_VMAP_STACK + /* + * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB + * flush. Try to minimize the number of calls by caching stacks. +@@ -209,11 +219,35 @@ static int free_vm_stack_cache(unsigned int cpu) - if (!irq_settings_no_debug(desc)) - note_interrupt(desc, retval); -diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c -index 27667e82ecc9..894e4db1fffc 100644 ---- a/kernel/irq/manage.c -+++ b/kernel/irq/manage.c -@@ -1259,6 +1259,8 @@ static int irq_thread(void *data) - irqreturn_t (*handler_fn)(struct irq_desc *desc, - struct irqaction *action); + return 0; + } +-#endif -+ sched_set_fifo(current); +-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) ++static int memcg_charge_kernel_stack(struct vm_struct *vm) + { +-#ifdef CONFIG_VMAP_STACK ++ int i; ++ int ret; + - if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, - &action->thread_flags)) - handler_fn = irq_forced_thread_fn; -@@ -1279,6 +1281,12 @@ static int irq_thread(void *data) - if (action_ret == IRQ_WAKE_THREAD) - irq_wake_secondary(desc, action); ++ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); ++ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); ++ ++ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { ++ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); ++ if (ret) ++ goto err; ++ } ++ return 0; ++err: ++ /* ++ * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is ++ * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will ++ * ignore this page. ++ */ ++ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) ++ memcg_kmem_uncharge_page(vm->pages[i], 0); ++ return ret; ++} ++ ++static int alloc_thread_stack_node(struct task_struct *tsk, int node) ++{ ++ struct vm_struct *vm; + void *stack; + int i; -+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { -+ migrate_disable(); -+ add_interrupt_randomness(action->irq, 0, -+ desc->random_ip ^ (unsigned long) action); -+ migrate_enable(); +@@ -231,9 +265,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) + /* Clear stale pointers from reused stack. */ + memset(s->addr, 0, THREAD_SIZE); + ++ if (memcg_charge_kernel_stack(s)) { ++ vfree(s->addr); ++ return -ENOMEM; + } - wake_threads_waitq(desc); ++ + tsk->stack_vm_area = s; + tsk->stack = s->addr; +- return s->addr; ++ return 0; } -@@ -1424,8 +1432,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) - if (IS_ERR(t)) - return PTR_ERR(t); - -- sched_set_fifo(t); -- /* - * We keep the reference to the task struct even if - * the thread dies to avoid that the interrupt code -@@ -2827,7 +2833,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); - * This call sets the internal irqchip state of an interrupt, - * depending on the value of @which. - * -- * This function should be called with preemption disabled if the -+ * This function should be called with migration disabled if the - * interrupt controller has per-cpu registers. - */ - int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, -diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c -index c481d8458325..02b2daf07441 100644 ---- a/kernel/irq/spurious.c -+++ b/kernel/irq/spurious.c -@@ -447,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); - - static int __init irqfixup_setup(char *str) - { -+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { -+ pr_warn("irqfixup boot option not supported with PREEMPT_RT\n"); -+ return 1; -+ } - irqfixup = 1; - printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); - printk(KERN_WARNING "This may impact system performance.\n"); -@@ -459,6 +463,10 @@ module_param(irqfixup, int, 0644); - - static int __init irqpoll_setup(char *str) - { -+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { -+ pr_warn("irqpoll boot option not supported with PREEMPT_RT\n"); -+ return 1; +@@ -246,71 +285,93 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) + THREADINFO_GFP & ~__GFP_ACCOUNT, + PAGE_KERNEL, + 0, node, __builtin_return_address(0)); ++ if (!stack) ++ return -ENOMEM; + ++ vm = find_vm_area(stack); ++ if (memcg_charge_kernel_stack(vm)) { ++ vfree(stack); ++ return -ENOMEM; + } - irqfixup = 2; - printk(KERN_WARNING "Misrouted IRQ fixup and polling support " - "enabled\n"); -diff --git a/kernel/irq_work.c b/kernel/irq_work.c -index db8c248ebc8c..f7df715ec28e 100644 ---- a/kernel/irq_work.c -+++ b/kernel/irq_work.c -@@ -18,11 +18,36 @@ - #include <linux/cpu.h> - #include <linux/notifier.h> - #include <linux/smp.h> -+#include <linux/smpboot.h> - #include <asm/processor.h> - #include <linux/kasan.h> - - static DEFINE_PER_CPU(struct llist_head, raised_list); - static DEFINE_PER_CPU(struct llist_head, lazy_list); -+static DEFINE_PER_CPU(struct task_struct *, irq_workd); -+ -+static void wake_irq_workd(void) -+{ -+ struct task_struct *tsk = __this_cpu_read(irq_workd); -+ -+ if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) -+ wake_up_process(tsk); + /* + * We can't call find_vm_area() in interrupt context, and + * free_thread_stack() can be called in interrupt context, + * so cache the vm_struct. + */ +- if (stack) { +- tsk->stack_vm_area = find_vm_area(stack); +- tsk->stack = stack; ++ tsk->stack_vm_area = vm; ++ tsk->stack = stack; ++ return 0; +} + -+#ifdef CONFIG_SMP -+static void irq_work_wake(struct irq_work *entry) ++static void free_thread_stack(struct task_struct *tsk, bool cache_only) +{ -+ wake_irq_workd(); -+} -+ -+static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = -+ IRQ_WORK_INIT_HARD(irq_work_wake); -+#endif ++ int i; + -+static int irq_workd_should_run(unsigned int cpu) -+{ -+ return !llist_empty(this_cpu_ptr(&lazy_list)); -+} - - /* - * Claim the entry so that no one else will poke at it. -@@ -52,15 +77,29 @@ void __weak arch_irq_work_raise(void) - /* Enqueue on current CPU, work must already be claimed and preempt disabled */ - static void __irq_work_queue_local(struct irq_work *work) - { -+ struct llist_head *list; -+ bool rt_lazy_work = false; -+ bool lazy_work = false; -+ int work_flags; -+ -+ work_flags = atomic_read(&work->node.a_flags); -+ if (work_flags & IRQ_WORK_LAZY) -+ lazy_work = true; -+ else if (IS_ENABLED(CONFIG_PREEMPT_RT) && -+ !(work_flags & IRQ_WORK_HARD_IRQ)) -+ rt_lazy_work = true; -+ -+ if (lazy_work || rt_lazy_work) -+ list = this_cpu_ptr(&lazy_list); -+ else -+ list = this_cpu_ptr(&raised_list); ++ for (i = 0; i < NR_CACHED_STACKS; i++) { ++ if (this_cpu_cmpxchg(cached_stacks[i], NULL, ++ tsk->stack_vm_area) != NULL) ++ continue; + -+ if (!llist_add(&work->node.llist, list)) ++ tsk->stack = NULL; ++ tsk->stack_vm_area = NULL; + return; + } +- return stack; +-#else ++ if (cache_only) { ++ thread_stack_mark_delayed_free(tsk); ++ return; ++ } + - /* If the work is "lazy", handle it from next tick if any */ -- if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { -- if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && -- tick_nohz_tick_stopped()) -- arch_irq_work_raise(); -- } else { -- if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) -- arch_irq_work_raise(); -- } -+ if (!lazy_work || tick_nohz_tick_stopped()) -+ arch_irq_work_raise(); - } - - /* Enqueue the irq work @work on the current CPU */ -@@ -104,17 +143,34 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) - if (cpu != smp_processor_id()) { - /* Arch remote IPI send/receive backend aren't NMI safe */ - WARN_ON_ONCE(in_nmi()); -+ -+ /* -+ * On PREEMPT_RT the items which are not marked as -+ * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work -+ * item is used on the remote CPU to wake the thread. -+ */ -+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && -+ !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { -+ -+ if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) -+ goto out; ++ vfree(tsk->stack); ++ tsk->stack = NULL; ++ tsk->stack_vm_area = NULL; ++} + -+ work = &per_cpu(irq_work_wakeup, cpu); -+ if (!irq_work_claim(work)) -+ goto out; -+ } ++# else /* !CONFIG_VMAP_STACK */ + - __smp_call_single_queue(cpu, &work->node.llist); - } else { - __irq_work_queue_local(work); - } -+out: - preempt_enable(); ++static int alloc_thread_stack_node(struct task_struct *tsk, int node) ++{ + struct page *page = alloc_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); - return true; - #endif /* CONFIG_SMP */ + if (likely(page)) { + tsk->stack = kasan_reset_tag(page_address(page)); +- return tsk->stack; ++ return 0; + } +- return NULL; +-#endif ++ return -ENOMEM; } -- - bool irq_work_needs_cpu(void) +-static inline void free_thread_stack(struct task_struct *tsk) ++static void free_thread_stack(struct task_struct *tsk, bool cache_only) { - struct llist_head *raised, *lazy; -@@ -160,6 +216,10 @@ void irq_work_single(void *arg) - * else claimed it meanwhile. - */ - (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); -+ -+ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || -+ !arch_irq_work_has_interrupt()) -+ rcuwait_wake_up(&work->irqwait); - } - - static void irq_work_run_list(struct llist_head *list) -@@ -167,7 +227,12 @@ static void irq_work_run_list(struct llist_head *list) - struct irq_work *work, *tmp; - struct llist_node *llnode; - -- BUG_ON(!irqs_disabled()); -+ /* -+ * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed -+ * in a per-CPU thread in preemptible context. Only the items which are -+ * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. -+ */ -+ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); - - if (llist_empty(list)) +-#ifdef CONFIG_VMAP_STACK +- struct vm_struct *vm = task_stack_vm_area(tsk); +- +- if (vm) { +- int i; +- +- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) +- memcg_kmem_uncharge_page(vm->pages[i], 0); +- +- for (i = 0; i < NR_CACHED_STACKS; i++) { +- if (this_cpu_cmpxchg(cached_stacks[i], +- NULL, tsk->stack_vm_area) != NULL) +- continue; +- +- return; +- } +- +- vfree_atomic(tsk->stack); ++ if (cache_only) { ++ thread_stack_mark_delayed_free(tsk); return; -@@ -184,7 +249,10 @@ static void irq_work_run_list(struct llist_head *list) - void irq_work_run(void) - { - irq_work_run_list(this_cpu_ptr(&raised_list)); -- irq_work_run_list(this_cpu_ptr(&lazy_list)); -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) -+ irq_work_run_list(this_cpu_ptr(&lazy_list)); -+ else -+ wake_irq_workd(); + } +-#endif +- + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); ++ tsk->stack = NULL; } - EXPORT_SYMBOL_GPL(irq_work_run); - -@@ -194,7 +262,11 @@ void irq_work_tick(void) - - if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) - irq_work_run_list(raised); -- irq_work_run_list(this_cpu_ptr(&lazy_list)); +-# else + -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) -+ irq_work_run_list(this_cpu_ptr(&lazy_list)); -+ else -+ wake_irq_workd(); ++# endif /* CONFIG_VMAP_STACK */ ++# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ ++ + static struct kmem_cache *thread_stack_cache; + +-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, +- int node) ++static int alloc_thread_stack_node(struct task_struct *tsk, int node) + { + unsigned long *stack; + stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); + stack = kasan_reset_tag(stack); + tsk->stack = stack; +- return stack; ++ return stack ? 0 : -ENOMEM; } - /* -@@ -204,8 +276,42 @@ void irq_work_tick(void) - void irq_work_sync(struct irq_work *work) +-static void free_thread_stack(struct task_struct *tsk) ++static void free_thread_stack(struct task_struct *tsk, bool cache_only) { - lockdep_assert_irqs_enabled(); -+ might_sleep(); -+ -+ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || -+ !arch_irq_work_has_interrupt()) { -+ rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), -+ TASK_UNINTERRUPTIBLE); ++ if (cache_only) { ++ thread_stack_mark_delayed_free(tsk); + return; + } + kmem_cache_free(thread_stack_cache, tsk->stack); ++ tsk->stack = NULL; + } - while (irq_work_is_busy(work)) - cpu_relax(); + void thread_stack_cache_init(void) +@@ -320,8 +381,36 @@ void thread_stack_cache_init(void) + THREAD_SIZE, NULL); + BUG_ON(thread_stack_cache == NULL); } - EXPORT_SYMBOL_GPL(irq_work_sync); +-# endif +-#endif ++ ++# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ + -+static void run_irq_workd(unsigned int cpu) ++void task_stack_cleanup(struct task_struct *tsk) +{ -+ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ unsigned long val = (unsigned long)tsk->stack; ++ ++ if (!(val & THREAD_STACK_DELAYED_FREE)) ++ return; ++ ++ WRITE_ONCE(tsk->stack, (void *)(val & ~THREAD_STACK_DELAYED_FREE)); ++ free_thread_stack(tsk, false); +} + -+static void irq_workd_setup(unsigned int cpu) ++#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ ++static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ -+ sched_set_fifo_low(current); -+} ++ unsigned long *stack; + -+static struct smp_hotplug_thread irqwork_threads = { -+ .store = &irq_workd, -+ .setup = irq_workd_setup, -+ .thread_should_run = irq_workd_should_run, -+ .thread_fn = run_irq_workd, -+ .thread_comm = "irq_work/%u", -+}; ++ stack = arch_alloc_thread_stack_node(tsk, node); ++ tsk->stack = stack; ++ return stack ? 0 : -ENOMEM; ++} + -+static __init int irq_work_init_threads(void) ++static void free_thread_stack(struct task_struct *tsk, bool cache_only) +{ -+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) -+ BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); -+ return 0; ++ arch_free_thread_stack(tsk); +} -+early_initcall(irq_work_init_threads); -diff --git a/kernel/kcov.c b/kernel/kcov.c -index 80bfe71bbe13..36ca640c4f8e 100644 ---- a/kernel/kcov.c -+++ b/kernel/kcov.c -@@ -88,6 +88,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); - - struct kcov_percpu_data { - void *irq_area; -+ local_lock_t lock; - - unsigned int saved_mode; - unsigned int saved_size; -@@ -96,7 +97,9 @@ struct kcov_percpu_data { - int saved_sequence; - }; - --static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); -+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = { -+ .lock = INIT_LOCAL_LOCK(lock), -+}; - - /* Must be called with kcov_remote_lock locked. */ - static struct kcov_remote *kcov_remote_find(u64 handle) -@@ -824,7 +827,7 @@ void kcov_remote_start(u64 handle) - if (!in_task() && !in_serving_softirq()) - return; ++ ++#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ -- local_irq_save(flags); -+ local_lock_irqsave(&kcov_percpu_data.lock, flags); + /* SLAB cache for signal_struct structures (tsk->signal) */ + static struct kmem_cache *signal_cachep; +@@ -376,70 +465,55 @@ void vm_area_free(struct vm_area_struct *vma) - /* - * Check that kcov_remote_start() is not called twice in background -@@ -832,7 +835,7 @@ void kcov_remote_start(u64 handle) - */ - mode = READ_ONCE(t->kcov_mode); - if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { -- local_irq_restore(flags); -+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); - return; - } - /* -@@ -841,14 +844,15 @@ void kcov_remote_start(u64 handle) - * happened while collecting coverage from a background thread. - */ - if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { -- local_irq_restore(flags); -+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); - return; - } + static void account_kernel_stack(struct task_struct *tsk, int account) + { +- void *stack = task_stack_page(tsk); +- struct vm_struct *vm = task_stack_vm_area(tsk); +- +- if (vm) { ++ if (IS_ENABLED(CONFIG_VMAP_STACK)) { ++ struct vm_struct *vm = task_stack_vm_area(tsk); + int i; - spin_lock(&kcov_remote_lock); - remote = kcov_remote_find(handle); - if (!remote) { -- spin_unlock_irqrestore(&kcov_remote_lock, flags); -+ spin_unlock(&kcov_remote_lock); -+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); - return; - } - kcov_debug("handle = %llx, context: %s\n", handle, -@@ -869,19 +873,19 @@ void kcov_remote_start(u64 handle) - size = CONFIG_KCOV_IRQ_AREA_SIZE; - area = this_cpu_ptr(&kcov_percpu_data)->irq_area; - } -- spin_unlock_irqrestore(&kcov_remote_lock, flags); -+ spin_unlock(&kcov_remote_lock); - - /* Can only happen when in_task(). */ - if (!area) { -+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); - area = vmalloc(size * sizeof(unsigned long)); - if (!area) { - kcov_put(kcov); - return; - } -+ local_lock_irqsave(&kcov_percpu_data.lock, flags); + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + account * (PAGE_SIZE / 1024)); + } else { ++ void *stack = task_stack_page(tsk); ++ + /* All stack pages are in the same node. */ + mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } + } -- local_irq_save(flags); +-static int memcg_charge_kernel_stack(struct task_struct *tsk) ++void exit_task_stack_account(struct task_struct *tsk) + { +-#ifdef CONFIG_VMAP_STACK +- struct vm_struct *vm = task_stack_vm_area(tsk); +- int ret; - - /* Reset coverage size. */ - *(u64 *)area = 0; +- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); ++ account_kernel_stack(tsk, -1); + +- if (vm) { ++ if (IS_ENABLED(CONFIG_VMAP_STACK)) { ++ struct vm_struct *vm; + int i; -@@ -891,7 +895,7 @@ void kcov_remote_start(u64 handle) +- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); +- +- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { +- /* +- * If memcg_kmem_charge_page() fails, page's +- * memory cgroup pointer is NULL, and +- * memcg_kmem_uncharge_page() in free_thread_stack() +- * will ignore this page. +- */ +- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, +- 0); +- if (ret) +- return ret; +- } ++ vm = task_stack_vm_area(tsk); ++ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) ++ memcg_kmem_uncharge_page(vm->pages[i], 0); } - kcov_start(t, kcov, size, area, mode, sequence); +-#endif +- return 0; + } -- local_irq_restore(flags); -+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); +-static void release_task_stack(struct task_struct *tsk) ++static void release_task_stack(struct task_struct *tsk, bool cache_only) + { + if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) + return; /* Better to leak the stack than to free prematurely */ +- account_kernel_stack(tsk, -1); +- free_thread_stack(tsk); +- tsk->stack = NULL; +-#ifdef CONFIG_VMAP_STACK +- tsk->stack_vm_area = NULL; +-#endif ++ free_thread_stack(tsk, cache_only); } - EXPORT_SYMBOL(kcov_remote_start); -@@ -965,12 +969,12 @@ void kcov_remote_stop(void) - if (!in_task() && !in_serving_softirq()) - return; -- local_irq_save(flags); -+ local_lock_irqsave(&kcov_percpu_data.lock, flags); + #ifdef CONFIG_THREAD_INFO_IN_TASK + void put_task_stack(struct task_struct *tsk) + { + if (refcount_dec_and_test(&tsk->stack_refcount)) +- release_task_stack(tsk); ++ release_task_stack(tsk, false); ++} ++ ++void put_task_stack_sched(struct task_struct *tsk) ++{ ++ if (refcount_dec_and_test(&tsk->stack_refcount)) ++ release_task_stack(tsk, true); + } + #endif - mode = READ_ONCE(t->kcov_mode); - barrier(); - if (!kcov_mode_enabled(mode)) { -- local_irq_restore(flags); -+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); - return; - } - /* -@@ -978,12 +982,12 @@ void kcov_remote_stop(void) - * actually found the remote handle and started collecting coverage. +@@ -453,7 +527,7 @@ void free_task(struct task_struct *tsk) + * The task is finally done with both the stack and thread_info, + * so free both. */ - if (in_serving_softirq() && !t->kcov_softirq) { -- local_irq_restore(flags); -+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); - return; - } - /* Make sure that kcov_softirq is only set when in softirq. */ - if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { -- local_irq_restore(flags); -+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); - return; - } +- release_task_stack(tsk); ++ release_task_stack(tsk, false); + #else + /* + * If the task had a separate stack allocation, it should be gone +@@ -873,8 +947,6 @@ void set_task_stack_end_magic(struct task_struct *tsk) + static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + { + struct task_struct *tsk; +- unsigned long *stack; +- struct vm_struct *stack_vm_area __maybe_unused; + int err; -@@ -1013,7 +1017,7 @@ void kcov_remote_stop(void) - spin_unlock(&kcov_remote_lock); - } + if (node == NUMA_NO_NODE) +@@ -883,32 +955,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + if (!tsk) + return NULL; -- local_irq_restore(flags); -+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); +- stack = alloc_thread_stack_node(tsk, node); +- if (!stack) ++ err = arch_dup_task_struct(tsk, orig); ++ if (err) + goto free_tsk; - /* Get in kcov_remote_start(). */ - kcov_put(kcov); -@@ -1034,8 +1038,8 @@ static int __init kcov_init(void) - int cpu; +- if (memcg_charge_kernel_stack(tsk)) +- goto free_stack; +- +- stack_vm_area = task_stack_vm_area(tsk); +- +- err = arch_dup_task_struct(tsk, orig); ++ err = alloc_thread_stack_node(tsk, node); ++ if (err) ++ goto free_tsk; - for_each_possible_cpu(cpu) { -- void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * -- sizeof(unsigned long)); -+ void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE * -+ sizeof(unsigned long), cpu_to_node(cpu)); - if (!area) - return -ENOMEM; - per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; -diff --git a/kernel/kprobes.c b/kernel/kprobes.c -index 2ef90d15699f..2ab883d856b5 100644 ---- a/kernel/kprobes.c -+++ b/kernel/kprobes.c -@@ -1250,10 +1250,10 @@ void kprobe_busy_end(void) - } +- /* +- * arch_dup_task_struct() clobbers the stack-related fields. Make +- * sure they're properly initialized before using any stack-related +- * functions again. +- */ +- tsk->stack = stack; +-#ifdef CONFIG_VMAP_STACK +- tsk->stack_vm_area = stack_vm_area; +-#endif + #ifdef CONFIG_THREAD_INFO_IN_TASK + refcount_set(&tsk->stack_refcount, 1); + #endif +- +- if (err) +- goto free_stack; ++ account_kernel_stack(tsk, 1); - /* -- * This function is called from finish_task_switch when task tk becomes dead, -- * so that we can recycle any function-return probe instances associated -- * with this task. These left over instances represent probed functions -- * that have been called but will never return. -+ * This function is called from delayed_put_task_struct() when a task is -+ * dead and cleaned up to recycle any function-return probe instances -+ * associated with this task. These left over instances represent probed -+ * functions that have been called but will never return. + err = scs_prepare(tsk, node); + if (err) +@@ -952,8 +1010,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + tsk->wake_q.next = NULL; + tsk->pf_io_worker = NULL; + +- account_kernel_stack(tsk, 1); +- + kcov_task_init(tsk); + kmap_local_fork(tsk); + +@@ -972,7 +1028,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + return tsk; + + free_stack: +- free_thread_stack(tsk); ++ exit_task_stack_account(tsk); ++ free_thread_stack(tsk, false); + free_tsk: + free_task_struct(tsk); + return NULL; +@@ -2468,6 +2525,7 @@ static __latent_entropy struct task_struct *copy_process( + exit_creds(p); + bad_fork_free: + WRITE_ONCE(p->__state, TASK_DEAD); ++ exit_task_stack_account(p); + put_task_stack(p); + delayed_free_task(p); + fork_out: +diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c +index f895265d7548..c09324663088 100644 +--- a/kernel/irq/chip.c ++++ b/kernel/irq/chip.c +@@ -575,8 +575,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq); */ - void kprobe_flush_task(struct task_struct *tk) + void handle_untracked_irq(struct irq_desc *desc) + { +- unsigned int flags = 0; +- + raw_spin_lock(&desc->lock); + + if (!irq_may_run(desc)) +@@ -593,7 +591,7 @@ void handle_untracked_irq(struct irq_desc *desc) + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); + raw_spin_unlock(&desc->lock); + +- __handle_irq_event_percpu(desc, &flags); ++ __handle_irq_event_percpu(desc); + + raw_spin_lock(&desc->lock); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); +diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c +index 27182003b879..9489f93b3db3 100644 +--- a/kernel/irq/handle.c ++++ b/kernel/irq/handle.c +@@ -136,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) + wake_up_process(action->thread); + } + +-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) ++irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) { + irqreturn_t retval = IRQ_NONE; + unsigned int irq = desc->irq_data.irq; +@@ -174,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags + } + + __irq_wake_thread(desc, action); +- +- fallthrough; /* to add to randomness */ +- case IRQ_HANDLED: +- *flags |= action->flags; + break; + + default: +@@ -193,11 +189,10 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags + irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) + { + irqreturn_t retval; +- unsigned int flags = 0; + +- retval = __handle_irq_event_percpu(desc, &flags); ++ retval = __handle_irq_event_percpu(desc); + +- add_interrupt_randomness(desc->irq_data.irq, flags); ++ add_interrupt_randomness(desc->irq_data.irq); + + if (!irq_settings_no_debug(desc)) + note_interrupt(desc, retval); +diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h +index 54363527feea..99cbdf55a8bd 100644 +--- a/kernel/irq/internals.h ++++ b/kernel/irq/internals.h +@@ -103,7 +103,7 @@ extern int __irq_get_irqchip_state(struct irq_data *data, + + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); + +-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); ++irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); + irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); + irqreturn_t handle_irq_event(struct irq_desc *desc); + +diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c +index 2267e6527db3..97223df2f460 100644 +--- a/kernel/irq/irqdesc.c ++++ b/kernel/irq/irqdesc.c +@@ -662,6 +662,27 @@ int generic_handle_irq(unsigned int irq) + } + EXPORT_SYMBOL_GPL(generic_handle_irq); + ++/** ++ * generic_handle_irq_safe - Invoke the handler for a particular irq ++ * @irq: The irq number to handle ++ * ++ * Returns: 0 on success, or -EINVAL if conversion has failed ++ * ++ * This function must be called either from an IRQ context with irq regs ++ * initialized or with care from any context. ++ */ ++int generic_handle_irq_safe(unsigned int irq) ++{ ++ unsigned long flags; ++ int ret; ++ ++ local_irq_save(flags); ++ ret = handle_irq_desc(irq_to_desc(irq)); ++ local_irq_restore(flags); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(generic_handle_irq_safe); ++ + #ifdef CONFIG_IRQ_DOMAIN + /** + * generic_handle_domain_irq - Invoke the handler for a HW irq belonging +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 7405e384e5ed..d641de1f879f 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -1281,6 +1281,9 @@ static int irq_thread(void *data) + if (action_ret == IRQ_WAKE_THREAD) + irq_wake_secondary(desc, action); + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ process_interrupt_randomness(); ++ + wake_threads_waitq(desc); + } + diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 35859da8bd4f..dfff31ed644a 100644 --- a/kernel/ksysfs.c @@ -5725,59 +5026,11 @@ index 35859da8bd4f..dfff31ed644a 100644 #endif NULL }; -diff --git a/kernel/kthread.c b/kernel/kthread.c -index 5b37a8567168..4a4d7092a2d8 100644 ---- a/kernel/kthread.c -+++ b/kernel/kthread.c -@@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); - - static int kthread(void *_create) - { -+ static const struct sched_param param = { .sched_priority = 0 }; - /* Copy data: it's on kthread's stack */ - struct kthread_create_info *create = _create; - int (*threadfn)(void *data) = create->threadfn; -@@ -300,6 +301,13 @@ static int kthread(void *_create) - init_completion(&self->parked); - current->vfork_done = &self->exited; - -+ /* -+ * The new thread inherited kthreadd's priority and CPU mask. Reset -+ * back to default in case they have been changed. -+ */ -+ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); -+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); -+ - /* OK, tell user we're spawned, wait for stop or wakeup */ - __set_current_state(TASK_UNINTERRUPTIBLE); - create->result = current; -@@ -397,7 +405,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), - } - task = create->result; - if (!IS_ERR(task)) { -- static const struct sched_param param = { .sched_priority = 0 }; - char name[TASK_COMM_LEN]; - - /* -@@ -406,13 +413,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), - */ - vsnprintf(name, sizeof(name), namefmt, args); - set_task_comm(task, name); -- /* -- * root may have changed our (kthreadd's) priority or CPU mask. -- * The kernel thread should not inherit these properties. -- */ -- sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); -- set_cpus_allowed_ptr(task, -- housekeeping_cpumask(HK_FLAG_KTHREAD)); - } - kfree(create); - return task; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index 0627584f7872..5bef051a55c5 100644 +index 49c4d11b0893..c3d5a9cbf54d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c -@@ -5475,6 +5475,7 @@ static noinstr void check_flags(unsigned long flags) +@@ -5487,6 +5487,7 @@ static noinstr void check_flags(unsigned long flags) } } @@ -5785,7 +5038,7 @@ index 0627584f7872..5bef051a55c5 100644 /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only -@@ -5489,6 +5490,7 @@ static noinstr void check_flags(unsigned long flags) +@@ -5501,6 +5502,7 @@ static noinstr void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } @@ -5794,37 +5047,22 @@ index 0627584f7872..5bef051a55c5 100644 if (!debug_locks) print_irqtrace_events(current); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index 6bb116c559b4..3665583361c0 100644 +index 1f25a4d7de27..e85d5df3f42c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c -@@ -1097,8 +1097,26 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, +@@ -1103,8 +1103,11 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, + * the other will detect the deadlock and return -EDEADLOCK, * which is wrong, as the other waiter is not in a deadlock * situation. ++ * ++ * Except for ww_mutex, in that case the chain walk must already deal ++ * with spurious cycles, see the comments at [3] and [6]. */ - if (owner == task) -+ if (owner == task) { -+#if defined(DEBUG_WW_MUTEXES) && defined(CONFIG_DEBUG_LOCKING_API_SELFTESTS) -+ /* -+ * The lockdep selftest for ww-mutex assumes in a few cases -+ * the ww_ctx->contending_lock assignment via -+ * __ww_mutex_check_kill() which does not happen if the rtmutex -+ * detects the deadlock early. -+ */ -+ if (build_ww_mutex() && ww_ctx) { -+ struct rt_mutex *rtm; -+ -+ /* Check whether the waiter should backout immediately */ -+ rtm = container_of(lock, struct rt_mutex, rtmutex); -+ -+ __ww_mutex_add_waiter(waiter, rtm, ww_ctx); -+ __ww_mutex_check_kill(rtm, waiter, ww_ctx); -+ } -+#endif ++ if (owner == task && !(build_ww_mutex() && ww_ctx)) return -EDEADLK; -+ } raw_spin_lock(&task->pi_lock); - waiter->task = task; diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 5c9299aaabae..900220941caa 100644 --- a/kernel/locking/rtmutex_api.c @@ -5898,56 +5136,51 @@ index 5c9299aaabae..900220941caa 100644 /** * rt_mutex_trylock - try to lock a rt_mutex * -diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c -index d2912e44d61f..9e396a09fe0f 100644 ---- a/kernel/locking/spinlock_rt.c -+++ b/kernel/locking/spinlock_rt.c -@@ -24,6 +24,17 @@ - #define RT_MUTEX_BUILD_SPINLOCKS - #include "rtmutex.c" - -+/* -+ * __might_resched() skips the state check as rtlocks are state -+ * preserving. Take RCU nesting into account as spin/read/write_lock() can -+ * legitimately nest into an RCU read side critical section. -+ */ -+#define RTLOCK_RESCHED_OFFSETS \ -+ (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT) +diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c +index b562f9289372..7f49baaa4979 100644 +--- a/kernel/locking/spinlock.c ++++ b/kernel/locking/spinlock.c +@@ -300,6 +300,16 @@ void __lockfunc _raw_write_lock(rwlock_t *lock) + __raw_write_lock(lock); + } + EXPORT_SYMBOL(_raw_write_lock); + -+#define rtlock_might_resched() \ -+ __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS) ++#ifndef CONFIG_DEBUG_LOCK_ALLOC ++#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock))) ++#endif + - static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) - { - if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) -@@ -32,7 +43,7 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) ++void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) ++{ ++ __raw_write_lock_nested(lock, subclass); ++} ++EXPORT_SYMBOL(_raw_write_lock_nested); + #endif - static __always_inline void __rt_spin_lock(spinlock_t *lock) - { -- ___might_sleep(__FILE__, __LINE__, 0); -+ rtlock_might_resched(); - rtlock_lock(&lock->lock); - rcu_read_lock(); - migrate_disable(); -@@ -210,7 +221,7 @@ EXPORT_SYMBOL(rt_write_trylock); + #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE +diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c +index b2e553f9255b..48a19ed8486d 100644 +--- a/kernel/locking/spinlock_rt.c ++++ b/kernel/locking/spinlock_rt.c +@@ -239,6 +239,18 @@ void __sched rt_write_lock(rwlock_t *rwlock) + } + EXPORT_SYMBOL(rt_write_lock); - void __sched rt_read_lock(rwlock_t *rwlock) - { -- ___might_sleep(__FILE__, __LINE__, 0); ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) ++{ + rtlock_might_resched(); - rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); - rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); - rcu_read_lock(); -@@ -220,7 +231,7 @@ EXPORT_SYMBOL(rt_read_lock); - - void __sched rt_write_lock(rwlock_t *rwlock) ++ rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); ++ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); ++ rcu_read_lock(); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_write_lock_nested); ++#endif ++ + void __sched rt_read_unlock(rwlock_t *rwlock) { -- ___might_sleep(__FILE__, __LINE__, 0); -+ rtlock_might_resched(); - rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); - rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); - rcu_read_lock(); -@@ -246,12 +257,6 @@ void __sched rt_write_unlock(rwlock_t *rwlock) + rwlock_release(&rwlock->dep_map, _RET_IP_); +@@ -257,12 +269,6 @@ void __sched rt_write_unlock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_unlock); @@ -5960,260 +5193,179 @@ index d2912e44d61f..9e396a09fe0f 100644 #ifdef CONFIG_DEBUG_LOCK_ALLOC void __rt_rwlock_init(rwlock_t *rwlock, const char *name, struct lock_class_key *key) +diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c +index 0e00205cf467..d1473c624105 100644 +--- a/kernel/locking/ww_rt_mutex.c ++++ b/kernel/locking/ww_rt_mutex.c +@@ -26,7 +26,7 @@ int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) + + if (__rt_mutex_trylock(&rtm->rtmutex)) { + ww_mutex_set_context_fastpath(lock, ww_ctx); +- mutex_acquire_nest(&rtm->dep_map, 0, 1, ww_ctx->dep_map, _RET_IP_); ++ mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); + return 1; + } + diff --git a/kernel/panic.c b/kernel/panic.c -index cefd7d82366f..d509c0694af9 100644 +index cefd7d82366f..556665ef1152 100644 --- a/kernel/panic.c +++ b/kernel/panic.c -@@ -178,12 +178,28 @@ static void panic_print_sys_info(void) - void panic(const char *fmt, ...) - { - static char buf[1024]; -+ va_list args2; - va_list args; - long i, i_next = 0, len; - int state = 0; - int old_cpu, this_cpu; - bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; - -+ console_verbose(); -+ pr_emerg("Kernel panic - not syncing:\n"); -+ va_start(args2, fmt); -+ va_copy(args, args2); -+ vprintk(fmt, args2); -+ va_end(args2); -+#ifdef CONFIG_DEBUG_BUGVERBOSE -+ /* -+ * Avoid nested stack-dumping if a panic occurs during oops processing -+ */ -+ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) -+ dump_stack(); -+#endif -+ pr_flush(1000, true); -+ - /* - * Disable local interrupts. This will prevent panic_smp_self_stop - * from deadlocking the first cpu that invokes the panic, since -@@ -214,24 +230,13 @@ void panic(const char *fmt, ...) - if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) +@@ -215,7 +215,6 @@ void panic(const char *fmt, ...) panic_smp_self_stop(); -- console_verbose(); - bust_spinlocks(1); -- va_start(args, fmt); + console_verbose(); +- bust_spinlocks(1); + va_start(args, fmt); len = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); +@@ -239,6 +238,11 @@ void panic(const char *fmt, ...) + */ + kgdb_panic(buf); - if (len && buf[len - 1] == '\n') - buf[len - 1] = '\0'; - -- pr_emerg("Kernel panic - not syncing: %s\n", buf); --#ifdef CONFIG_DEBUG_BUGVERBOSE -- /* -- * Avoid nested stack-dumping if a panic occurs during oops processing -- */ -- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) -- dump_stack(); --#endif -- ++ /* Use atomic consoles to dump the kernel log. */ ++ console_flush_on_panic(CONSOLE_ATOMIC_FLUSH_PENDING); ++ ++ bust_spinlocks(1); ++ /* - * If kgdb is enabled, give it a chance to run before we stop all - * the other CPUs or else we won't be able to debug processes left -@@ -540,9 +545,11 @@ static u64 oops_id; - - static int init_oops_id(void) - { -+#ifndef CONFIG_PREEMPT_RT - if (!oops_id) - get_random_bytes(&oops_id, sizeof(oops_id)); - else -+#endif - oops_id++; - - return 0; -@@ -553,6 +560,7 @@ static void print_oops_end_marker(void) - { - init_oops_id(); - pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); -+ pr_flush(1000, true); - } - - /* -diff --git a/kernel/power/main.c b/kernel/power/main.c -index 44169f3081fd..eaa725ca079c 100644 ---- a/kernel/power/main.c -+++ b/kernel/power/main.c -@@ -543,14 +543,13 @@ static int __init pm_debug_messages_setup(char *str) - __setup("pm_debug_messages", pm_debug_messages_setup); - - /** -- * __pm_pr_dbg - Print a suspend debug message to the kernel log. -- * @defer: Whether or not to use printk_deferred() to print the message. -+ * pm_pr_dbg - Print a suspend debug message to the kernel log. - * @fmt: Message format. - * - * The message will be emitted if enabled through the pm_debug_messages - * sysfs attribute. - */ --void __pm_pr_dbg(bool defer, const char *fmt, ...) -+void pm_pr_dbg(const char *fmt, ...) - { - struct va_format vaf; - va_list args; -@@ -563,10 +562,7 @@ void __pm_pr_dbg(bool defer, const char *fmt, ...) - vaf.fmt = fmt; - vaf.va = &args; - -- if (defer) -- printk_deferred(KERN_DEBUG "PM: %pV", &vaf); -- else -- printk(KERN_DEBUG "PM: %pV", &vaf); -+ printk(KERN_DEBUG "PM: %pV", &vaf); - - va_end(args); + * If we have crashed and we have a crash kernel loaded let it handle + * everything else. +@@ -533,26 +537,9 @@ void oops_enter(void) + trigger_all_cpu_backtrace(); } -diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile -index d118739874c0..bc6b856a0ff4 100644 ---- a/kernel/printk/Makefile -+++ b/kernel/printk/Makefile -@@ -1,6 +1,5 @@ - # SPDX-License-Identifier: GPL-2.0-only - obj-y = printk.o --obj-$(CONFIG_PRINTK) += printk_safe.o - obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o - obj-$(CONFIG_PRINTK) += printk_ringbuffer.o - obj-$(CONFIG_PRINTK_INDEX) += index.o -diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h -index 9f3ed2fdb721..de8ab059dd96 100644 ---- a/kernel/printk/internal.h -+++ b/kernel/printk/internal.h -@@ -2,7 +2,6 @@ - /* - * internal.h - printk internal definitions - */ --#include <linux/percpu.h> - - #ifdef CONFIG_PRINTK - -@@ -12,41 +11,6 @@ enum printk_info_flags { - LOG_CONT = 8, /* text is a fragment of a continuation line */ - }; --__printf(4, 0) --int vprintk_store(int facility, int level, -- const struct dev_printk_info *dev_info, -- const char *fmt, va_list args); -- --__printf(1, 0) int vprintk_default(const char *fmt, va_list args); --__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); -- --bool printk_percpu_data_ready(void); -- --#define printk_safe_enter_irqsave(flags) \ -- do { \ -- local_irq_save(flags); \ -- __printk_safe_enter(); \ -- } while (0) -- --#define printk_safe_exit_irqrestore(flags) \ -- do { \ -- __printk_safe_exit(); \ -- local_irq_restore(flags); \ -- } while (0) -- --void defer_console_output(void); -- - u16 printk_parse_prefix(const char *text, int *level, - enum printk_info_flags *flags); --#else -- -/* -- * In !PRINTK builds we still export console_sem -- * semaphore and some of console functions (console_unlock()/etc.), so -- * printk-safe must preserve the existing local IRQ guarantees. +- * 64-bit random ID for oopses: - */ --#define printk_safe_enter_irqsave(flags) local_irq_save(flags) --#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) +-static u64 oops_id; - --static inline bool printk_percpu_data_ready(void) { return false; } - #endif /* CONFIG_PRINTK */ +-static int init_oops_id(void) +-{ +- if (!oops_id) +- get_random_bytes(&oops_id, sizeof(oops_id)); +- else +- oops_id++; +- +- return 0; +-} +-late_initcall(init_oops_id); +- + static void print_oops_end_marker(void) + { +- init_oops_id(); +- pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); ++ pr_warn("---[ end trace %016llx ]---\n", 0ULL); + } + + /* diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c -index 99221b016c68..0cc8e8acf545 100644 +index 57b132b658e1..20453a3bc429 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c -@@ -44,6 +44,10 @@ +@@ -44,6 +44,7 @@ #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> -+#include <linux/kdb.h> -+#include <linux/kgdb.h> -+#include <linux/kthread.h> +#include <linux/clocksource.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> -@@ -225,19 +229,7 @@ static int nr_ext_console_drivers; - - static int __down_trylock_console_sem(unsigned long ip) - { -- int lock_failed; -- unsigned long flags; -- -- /* -- * Here and in __up_console_sem() we need to be in safe mode, -- * because spindump/WARN/etc from under console ->lock will -- * deadlock in printk()->down_trylock_console_sem() otherwise. -- */ -- printk_safe_enter_irqsave(flags); -- lock_failed = down_trylock(&console_sem); -- printk_safe_exit_irqrestore(flags); -- -- if (lock_failed) -+ if (down_trylock(&console_sem)) - return 1; - mutex_acquire(&console_lock_dep_map, 0, 1, ip); - return 0; -@@ -246,13 +238,9 @@ static int __down_trylock_console_sem(unsigned long ip) - - static void __up_console_sem(unsigned long ip) - { -- unsigned long flags; -- - mutex_release(&console_lock_dep_map, ip); +@@ -214,6 +215,26 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, + /* Number of registered extended console drivers. */ + static int nr_ext_console_drivers; -- printk_safe_enter_irqsave(flags); - up(&console_sem); -- printk_safe_exit_irqrestore(flags); - } ++/* ++ * Used to synchronize printing kthreads against direct printing via ++ * console_trylock/console_unlock. ++ * ++ * Values: ++ * -1 = console locked (via trylock), kthreads will not print ++ * 0 = no kthread printing, console not locked (via trylock) ++ * >0 = kthread(s) actively printing ++ * ++ * Note: For synchronizing against direct printing via ++ * console_lock/console_unlock, see the @lock variable in ++ * struct console. ++ */ ++static atomic_t console_lock_count = ATOMIC_INIT(0); ++ ++#define console_excl_trylock() (atomic_cmpxchg(&console_lock_count, 0, -1) == 0) ++#define console_excl_unlock() atomic_cmpxchg(&console_lock_count, -1, 0) ++#define console_printer_tryenter() atomic_inc_unless_negative(&console_lock_count) ++#define console_printer_exit() atomic_dec(&console_lock_count) ++ + /* + * Helper macros to handle lockdep when locking/unlocking console_sem. We use + * macros instead of functions so that _RET_IP_ contains useful information. +@@ -257,19 +278,37 @@ static void __up_console_sem(unsigned long ip) #define up_console_sem() __up_console_sem(_RET_IP_) -@@ -266,11 +254,6 @@ static void __up_console_sem(unsigned long ip) + /* +- * This is used for debugging the mess that is the VT code by +- * keeping track if we have the console semaphore held. It's +- * definitely not the perfect debug tool (we don't know if _WE_ +- * hold it and are racing, but it helps tracking those weird code +- * paths in the console code where we end up in places I want +- * locked without the console semaphore held). ++ * Tracks whether kthread printers are all paused. A value of true implies ++ * that the console is locked via console_lock() or the console is suspended. ++ * Reading and writing to this variable requires holding @console_sem. */ - static int console_locked, console_suspended; +-static int console_locked, console_suspended; ++static bool consoles_paused; --/* + /* - * If exclusive_console is non-NULL then only this console is to be printed to. -- */ ++ * Pause or unpause all kthread printers. ++ * ++ * Requires the console_lock. + */ -static struct console *exclusive_console; -- ++static void __pause_all_consoles(bool do_pause) ++{ ++ struct console *con; ++ ++ for_each_console(con) { ++ mutex_lock(&con->lock); ++ if (do_pause) ++ con->flags |= CON_PAUSED; ++ else ++ con->flags &= ~CON_PAUSED; ++ mutex_unlock(&con->lock); ++ } ++ ++ consoles_paused = do_pause; ++} ++ ++#define pause_all_consoles() __pause_all_consoles(true) ++#define unpause_all_consoles() __pause_all_consoles(false) ++ ++static int console_suspended; + /* * Array of consoles built from command line options (console=) - */ -@@ -350,10 +333,13 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; - * non-prinatable characters are escaped in the "\xff" notation. - */ - -+#ifdef CONFIG_PRINTK +@@ -353,6 +392,20 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); --#ifdef CONFIG_PRINTK -+/* Set to enable sync mode. Once set, it is never cleared. */ -+static bool sync_mode; ++/* ++ * A flag to signify if printk_late_init() has already started the kthread ++ * printers. If true, any later registered consoles must start their own ++ * kthread directly. The flag is write protected by the console_lock. ++ */ ++static bool kthreads_started; ++ ++static inline bool kthread_printers_active(void) ++{ ++ return (kthreads_started && ++ system_state == SYSTEM_RUNNING && ++ !oops_in_progress); ++} + + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ - /* the next printk record to read by syslog(READ) or /proc/kmsg */ -@@ -361,17 +347,6 @@ static u64 syslog_seq; +@@ -361,12 +414,6 @@ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; @@ -6223,303 +5375,72 @@ index 99221b016c68..0cc8e8acf545 100644 -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; - --struct latched_seq { -- seqcount_latch_t latch; -- u64 val[2]; --}; -- - /* - * The next printk record to read after the last 'clear' command. There are - * two copies (updated with seqcount_latch) so that reads can locklessly -@@ -389,9 +364,6 @@ static struct latched_seq clear_seq = { - #define PREFIX_MAX 32 - #endif + struct latched_seq { + seqcount_latch_t latch; + u64 val[2]; +@@ -392,6 +439,9 @@ static struct latched_seq clear_seq = { + /* the maximum size of a formatted record (i.e. with prefix added per line) */ + #define CONSOLE_LOG_MAX 1024 --/* the maximum size of a formatted record (i.e. with prefix added per line) */ --#define CONSOLE_LOG_MAX 1024 -- ++/* the maximum size for a dropped text message */ ++#define DROPPED_TEXT_MAX 64 ++ /* the maximum size allowed to be reserved for a record */ #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) -@@ -430,12 +402,12 @@ static struct printk_ringbuffer *prb = &printk_rb_static; - */ - static bool __printk_percpu_data_ready __read_mostly; - --bool printk_percpu_data_ready(void) -+static bool printk_percpu_data_ready(void) - { - return __printk_percpu_data_ready; - } - --/* Must be called under syslog_lock. */ -+/* Must be called under associated write-protection lock. */ - static void latched_seq_write(struct latched_seq *ls, u64 val) - { - raw_write_seqcount_latch(&ls->latch); -@@ -1747,188 +1719,152 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) - return do_syslog(type, buf, len, SYSLOG_FROM_READER); +@@ -1823,6 +1873,7 @@ static int console_lock_spinning_disable_and_check(void) + return 1; } --/* -- * Special console_lock variants that help to reduce the risk of soft-lockups. -- * They allow to pass console_lock to another printk() call using a busy wait. -- */ -+int printk_delay_msec __read_mostly; - --#ifdef CONFIG_LOCKDEP --static struct lockdep_map console_owner_dep_map = { -- .name = "console_owner" --}; --#endif -+static inline void printk_delay(int level) -+{ -+ boot_delay_msec(level); ++#if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + /** + * console_trylock_spinning - try to get console_lock by busy waiting + * +@@ -1886,49 +1937,38 @@ static int console_trylock_spinning(void) --static DEFINE_RAW_SPINLOCK(console_owner_lock); --static struct task_struct *console_owner; --static bool console_waiter; -+ if (unlikely(printk_delay_msec)) { -+ int m = printk_delay_msec; + return 1; + } ++#endif /* CONFIG_PREEMPT_RT */ --/** -- * console_lock_spinning_enable - mark beginning of code where another -- * thread might safely busy wait -- * -- * This basically converts console_lock into a spinlock. This marks -- * the section where the console_lock owner can not sleep, because -- * there may be a waiter spinning (like a spinlock). Also it must be -- * ready to hand over the lock at the end of the section. -- */ --static void console_lock_spinning_enable(void) -+ while (m--) { -+ mdelay(1); -+ touch_nmi_watchdog(); -+ } -+ } -+} -+ -+static bool kernel_sync_mode(void) - { -- raw_spin_lock(&console_owner_lock); -- console_owner = current; -- raw_spin_unlock(&console_owner_lock); -+ return (oops_in_progress || sync_mode); -+} - -- /* The waiter may spin on us after setting console_owner */ -- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); -+static bool console_may_sync(struct console *con) -+{ -+ if (!(con->flags & CON_ENABLED)) -+ return false; -+ if (con->write_atomic && kernel_sync_mode()) -+ return true; -+ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) -+ return true; -+ if (con->write && (con->flags & CON_BOOT) && !con->thread) -+ return true; -+ return false; - } - --/** -- * console_lock_spinning_disable_and_check - mark end of code where another -- * thread was able to busy wait and check if there is a waiter -- * -- * This is called at the end of the section where spinning is allowed. -- * It has two functions. First, it is a signal that it is no longer -- * safe to start busy waiting for the lock. Second, it checks if -- * there is a busy waiter and passes the lock rights to her. -- * -- * Important: Callers lose the lock if there was a busy waiter. -- * They must not touch items synchronized by console_lock -- * in this case. -- * -- * Return: 1 if the lock rights were passed, 0 otherwise. -- */ --static int console_lock_spinning_disable_and_check(void) -+static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len) - { -- int waiter; -+ if (!(con->flags & CON_ENABLED)) -+ return false; - -- raw_spin_lock(&console_owner_lock); -- waiter = READ_ONCE(console_waiter); -- console_owner = NULL; -- raw_spin_unlock(&console_owner_lock); -+ if (con->write_atomic && kernel_sync_mode()) { -+ con->write_atomic(con, text, text_len); -+ return true; -+ } - -- if (!waiter) { -- spin_release(&console_owner_dep_map, _THIS_IP_); -- return 0; -+ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) { -+ if (console_trylock()) { -+ con->write_atomic(con, text, text_len); -+ console_unlock(); -+ return true; -+ } -+ -+ } else if (con->write && (con->flags & CON_BOOT) && !con->thread) { -+ if (console_trylock()) { -+ con->write(con, text, text_len); -+ console_unlock(); -+ return true; -+ } - } - -- /* The waiter is now free to continue */ -- WRITE_ONCE(console_waiter, false); -+ return false; -+} - -- spin_release(&console_owner_dep_map, _THIS_IP_); -+static bool have_atomic_console(void) -+{ -+ struct console *con; - -- /* -- * Hand off console_lock to waiter. The waiter will perform -- * the up(). After this, the waiter is the console_lock owner. -- */ -- mutex_release(&console_lock_dep_map, _THIS_IP_); -- return 1; -+ for_each_console(con) { -+ if (!(con->flags & CON_ENABLED)) -+ continue; -+ if (con->write_atomic) -+ return true; -+ } -+ return false; - } - --/** -- * console_trylock_spinning - try to get console_lock by busy waiting -- * -- * This allows to busy wait for the console_lock when the current -- * owner is running in specially marked sections. It means that -- * the current owner is running and cannot reschedule until it -- * is ready to lose the lock. -- * -- * Return: 1 if we got the lock, 0 othrewise -- */ --static int console_trylock_spinning(void) -+static bool print_sync(struct console *con, u64 *seq) - { -- struct task_struct *owner = NULL; -- bool waiter; -- bool spin = false; -- unsigned long flags; -+ struct printk_info info; -+ struct printk_record r; -+ size_t text_len; - -- if (console_trylock()) -- return 1; -+ prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf)); - -- printk_safe_enter_irqsave(flags); -+ if (!prb_read_valid(prb, *seq, &r)) -+ return false; - -- raw_spin_lock(&console_owner_lock); -- owner = READ_ONCE(console_owner); -- waiter = READ_ONCE(console_waiter); -- if (!waiter && owner && owner != current) { -- WRITE_ONCE(console_waiter, true); -- spin = true; -- } -- raw_spin_unlock(&console_owner_lock); -+ text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); - -- /* -- * If there is an active printk() writing to the -- * consoles, instead of having it write our data too, -- * see if we can offload that load from the active -- * printer, and do some printing ourselves. -- * Go into a spin only if there isn't already a waiter -- * spinning, and there is an active printer, and -- * that active printer isn't us (recursive printk?). -- */ -- if (!spin) { -- printk_safe_exit_irqrestore(flags); -- return 0; -- } -+ if (!call_sync_console_driver(con, &con->sync_buf[0], text_len)) -+ return false; - -- /* We spin waiting for the owner to release us */ -- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); -- /* Owner will clear console_waiter on hand off */ -- while (READ_ONCE(console_waiter)) -- cpu_relax(); -- spin_release(&console_owner_dep_map, _THIS_IP_); -+ *seq = r.info->seq; - -- printk_safe_exit_irqrestore(flags); -- /* -- * The owner passed the console lock to us. -- * Since we did not spin on console lock, annotate -- * this as a trylock. Otherwise lockdep will -- * complain. -- */ -- mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); -+ touch_softlockup_watchdog_sync(); -+ clocksource_touch_watchdog(); -+ rcu_cpu_stall_reset(); -+ touch_nmi_watchdog(); - -- return 1; -+ if (text_len) -+ printk_delay(r.info->level); -+ -+ return true; - } - --/* + /* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. -- */ ++ * Call the specified console driver, asking it to write out the specified ++ * text and length. If @dropped_text is non-NULL and any records have been ++ * dropped, a dropped message will be written out first. + */ -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) -+static u64 read_console_seq(struct console *con) ++static void call_console_driver(struct console *con, const char *text, size_t len, ++ char *dropped_text, bool atomic_printing) { - static char dropped_text[64]; - size_t dropped_len = 0; - struct console *con; -+ u64 seq2; -+ u64 seq; ++ unsigned long dropped = 0; ++ size_t dropped_len; -- trace_console_rcuidle(text, len); -+ seq = latched_seq_read_nolock(&con->printk_seq); -+ seq2 = latched_seq_read_nolock(&con->printk_sync_seq); -+ if (seq2 > seq) -+ seq = seq2; -+#ifdef CONFIG_HAVE_NMI -+ seq2 = latched_seq_read_nolock(&con->printk_sync_nmi_seq); -+ if (seq2 > seq) -+ seq = seq2; -+#endif -+ return seq; -+} + trace_console_rcuidle(text, len); - if (!console_drivers) - return; -+static void print_sync_until(struct console *con, u64 seq, bool is_locked) -+{ -+ u64 printk_seq; ++ if (dropped_text) ++ dropped = atomic_long_xchg_relaxed(&con->dropped, 0); - if (console_dropped) { - dropped_len = snprintf(dropped_text, sizeof(dropped_text), -- "** %lu printk messages dropped **\n", ++ if (dropped) { ++ dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX, + "** %lu printk messages dropped **\n", - console_dropped); - console_dropped = 0; -- } -+ while (!__printk_cpu_trylock()) -+ cpu_relax(); ++ dropped); ++ if (atomic_printing) ++ con->write_atomic(con, dropped_text, dropped_len); ++ else ++ con->write(con, dropped_text, dropped_len); + } - for_each_console(con) { - if (exclusive_console && con != exclusive_console) @@ -6538,419 +5459,641 @@ index 99221b016c68..0cc8e8acf545 100644 - con->write(con, dropped_text, dropped_len); - con->write(con, text, len); - } -+ for (;;) { -+ printk_seq = read_console_seq(con); -+ if (printk_seq >= seq) -+ break; -+ if (!print_sync(con, &printk_seq)) -+ break; -+ -+ if (is_locked) -+ latched_seq_write(&con->printk_seq, printk_seq + 1); -+#ifdef CONFIG_PRINTK_NMI -+ else if (in_nmi()) -+ latched_seq_write(&con->printk_sync_nmi_seq, printk_seq + 1); -+#endif -+ else -+ latched_seq_write(&con->printk_sync_seq, printk_seq + 1); - } -+ -+ __printk_cpu_unlock(); +- } ++ if (atomic_printing) ++ con->write_atomic(con, text, len); ++ else ++ con->write(con, text, len); } /* -@@ -2001,20 +1937,6 @@ static u8 *__printk_recursion_counter(void) - local_irq_restore(flags); \ - } while (0) - --int printk_delay_msec __read_mostly; -- --static inline void printk_delay(void) --{ -- if (unlikely(printk_delay_msec)) { -- int m = printk_delay_msec; -- -- while (m--) { -- mdelay(1); -- touch_nmi_watchdog(); -- } -- } --} -- +@@ -2018,7 +2058,7 @@ static inline void printk_delay(void) static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : -@@ -2095,13 +2017,14 @@ static u16 printk_sprint(char *text, u16 size, int facility, +- 0x80000000 + raw_smp_processor_id(); ++ 0x80000000 + smp_processor_id(); } - __printf(4, 0) --int vprintk_store(int facility, int level, -- const struct dev_printk_info *dev_info, -- const char *fmt, va_list args) -+static int vprintk_store(int facility, int level, -+ const struct dev_printk_info *dev_info, -+ const char *fmt, va_list args) + /** +@@ -2100,7 +2140,6 @@ int vprintk_store(int facility, int level, + const struct dev_printk_info *dev_info, + const char *fmt, va_list args) { - const u32 caller_id = printk_caller_id(); +- const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; enum printk_info_flags flags = 0; -+ bool final_commit = false; struct printk_record r; - unsigned long irqflags; - u16 trunc_msg_len = 0; -@@ -2112,6 +2035,7 @@ int vprintk_store(int facility, int level, +@@ -2110,10 +2149,14 @@ int vprintk_store(int facility, int level, + u8 *recursion_ptr; + u16 reserve_size; + va_list args2; ++ u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; -+ u64 seq; ++ if (!printk_enter_irqsave(recursion_ptr, irqflags)) ++ return 0; ++ /* * Since the duration of printk() can vary depending on the message -@@ -2150,6 +2074,7 @@ int vprintk_store(int facility, int level, - if (flags & LOG_CONT) { - prb_rec_init_wr(&r, reserve_size); - if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { -+ seq = r.info->seq; - text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, - facility, &flags, fmt, args); - r.info->text_len += text_len; -@@ -2157,6 +2082,7 @@ int vprintk_store(int facility, int level, - if (flags & LOG_NEWLINE) { - r.info->flags |= LOG_NEWLINE; - prb_final_commit(&e); -+ final_commit = true; - } else { - prb_commit(&e); - } -@@ -2180,6 +2106,7 @@ int vprintk_store(int facility, int level, - if (!prb_reserve(&e, prb, &r)) - goto out; - } -+ seq = r.info->seq; - - /* fill message */ - text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); -@@ -2195,13 +2122,25 @@ int vprintk_store(int facility, int level, - memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); - - /* A message without a trailing newline can be continued. */ -- if (!(flags & LOG_NEWLINE)) -+ if (!(flags & LOG_NEWLINE)) { - prb_commit(&e); -- else -+ } else { - prb_final_commit(&e); -+ final_commit = true; -+ } + * and state of the ringbuffer, grab the timestamp now so that it is +@@ -2122,8 +2165,7 @@ int vprintk_store(int facility, int level, + */ + ts_nsec = local_clock(); - ret = text_len + trunc_msg_len; - out: -+ /* only the kernel may perform synchronous printing */ -+ if (facility == 0 && final_commit) { -+ struct console *con; -+ -+ for_each_console(con) { -+ if (console_may_sync(con)) -+ print_sync_until(con, seq + 1, false); -+ } -+ } -+ - printk_exit_irqrestore(recursion_ptr, irqflags); - return ret; - } -@@ -2211,50 +2150,43 @@ asmlinkage int vprintk_emit(int facility, int level, - const char *fmt, va_list args) - { - int printed_len; -- bool in_sched = false; +- if (!printk_enter_irqsave(recursion_ptr, irqflags)) +- return 0; ++ caller_id = printk_caller_id(); - /* Suppress unimportant messages after panic happens */ - if (unlikely(suppress_printk)) - return 0; + /* + * The sprintf needs to come first since the syslog prefix might be +@@ -2223,27 +2265,36 @@ asmlinkage int vprintk_emit(int facility, int level, + in_sched = true; + } -- if (level == LOGLEVEL_SCHED) { -+ if (level == LOGLEVEL_SCHED) - level = LOGLEVEL_DEFAULT; -- in_sched = true; -- } -- - boot_delay_msec(level); - printk_delay(); - +- printed_len = vprintk_store(facility, level, dev_info, fmt, args); -- /* If called from the scheduler, we can not call up(). */ + /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { -- /* -- * Disable preemption to avoid being preempted while holding -- * console_sem which would prevent anyone from printing to -- * console -- */ -- preempt_disable(); ++ if (!in_sched && !kthread_printers_active()) { ++ /* ++ * Try to acquire and then immediately release the console ++ * semaphore. The release will print out buffers. ++ */ ++#if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ /* ++ * Use the non-spinning trylock since PREEMPT_RT does not ++ * support console lock handovers. ++ * ++ * Direct printing will most likely involve taking spinlocks. ++ * For PREEMPT_RT, this is only allowed if in a preemptible ++ * context. ++ */ ++ if (preemptible() && console_trylock()) ++ console_unlock(); ++#else + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); - /* - * Try to acquire and then immediately release the console - * semaphore. The release will print out buffers and wake up - * /dev/kmsg and syslog() users. - */ -- if (console_trylock_spinning()) -- console_unlock(); -- preempt_enable(); -- } -- - wake_up_klogd(); - return printed_len; - } - EXPORT_SYMBOL(vprintk_emit); - --int vprintk_default(const char *fmt, va_list args) -+__printf(1, 0) -+static int vprintk_default(const char *fmt, va_list args) - { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - } --EXPORT_SYMBOL_GPL(vprintk_default); -+ -+__printf(1, 0) -+static int vprintk_func(const char *fmt, va_list args) -+{ -+#ifdef CONFIG_KGDB_KDB -+ /* Allow to pass printk() to kdb but avoid a recursion. */ -+ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) -+ return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); + if (console_trylock_spinning()) + console_unlock(); + preempt_enable(); +#endif -+ return vprintk_default(fmt, args); -+} -+ -+asmlinkage int vprintk(const char *fmt, va_list args) -+{ -+ return vprintk_func(fmt, args); -+} -+EXPORT_SYMBOL(vprintk); + } - asmlinkage __visible int _printk(const char *fmt, ...) - { -@@ -2269,37 +2201,162 @@ asmlinkage __visible int _printk(const char *fmt, ...) + wake_up_klogd(); +@@ -2270,18 +2321,91 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); --#else /* CONFIG_PRINTK */ -+static int printk_kthread_func(void *data) ++#ifdef CONFIG_HAVE_ATOMIC_CONSOLE ++static void __free_atomic_data(struct console_atomic_data *d) +{ -+ struct console *con = data; -+ unsigned long dropped = 0; -+ char *dropped_text = NULL; -+ struct printk_info info; -+ struct printk_record r; -+ char *ext_text = NULL; -+ size_t dropped_len; -+ int ret = -ENOMEM; -+ char *text = NULL; -+ char *write_text; -+ size_t len; -+ int error; -+ u64 seq; - --#define CONSOLE_LOG_MAX 0 --#define printk_time false -+ if (con->flags & CON_EXTENDED) { -+ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); -+ if (!ext_text) -+ goto out; -+ } -+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); -+ dropped_text = kmalloc(64, GFP_KERNEL); -+ if (!text || !dropped_text) -+ goto out; -+ if (con->flags & CON_EXTENDED) -+ write_text = ext_text; -+ else -+ write_text = text; - --#define prb_read_valid(rb, seq, r) false --#define prb_first_valid_seq(rb) 0 -+ seq = read_console_seq(con); - --static u64 syslog_seq; --static u64 console_seq; --static u64 exclusive_console_stop_seq; --static unsigned long console_dropped; -+ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); -+ -+ for (;;) { -+ error = wait_event_interruptible(log_wait, -+ prb_read_valid(prb, seq, &r) || kthread_should_stop()); -+ -+ if (kthread_should_stop()) -+ break; ++ kfree(d->text); ++ kfree(d->ext_text); ++ kfree(d->dropped_text); ++} + -+ if (error) -+ continue; ++static void free_atomic_data(struct console_atomic_data *d) ++{ ++ int count = 1; ++ int i; + -+ if (seq != r.info->seq) { -+ dropped += r.info->seq - seq; -+ seq = r.info->seq; -+ } ++ if (!d) ++ return; + -+ seq++; ++#ifdef CONFIG_HAVE_NMI ++ count = 2; ++#endif + -+ if (!(con->flags & CON_ENABLED)) -+ continue; ++ for (i = 0; i < count; i++) ++ __free_atomic_data(&d[i]); ++ kfree(d); ++} + -+ if (suppress_message_printing(r.info->level)) -+ continue; ++static int __alloc_atomic_data(struct console_atomic_data *d, short flags) ++{ ++ d->text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); ++ if (!d->text) ++ return -1; ++ ++ if (flags & CON_EXTENDED) { ++ d->ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); ++ if (!d->ext_text) ++ return -1; ++ } else { ++ d->dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL); ++ if (!d->dropped_text) ++ return -1; ++ } + -+ if (con->flags & CON_EXTENDED) { -+ len = info_print_ext_header(ext_text, -+ CONSOLE_EXT_LOG_MAX, -+ r.info); -+ len += msg_print_ext_body(ext_text + len, -+ CONSOLE_EXT_LOG_MAX - len, -+ &r.text_buf[0], r.info->text_len, -+ &r.info->dev_info); -+ } else { -+ len = record_print_text(&r, -+ console_msg_format & MSG_FORMAT_SYSLOG, -+ printk_time); -+ } ++ return 0; ++} + -+ console_lock(); ++static struct console_atomic_data *alloc_atomic_data(short flags) ++{ ++ struct console_atomic_data *d; ++ int count = 1; ++ int i; + -+ /* -+ * Even though the printk kthread is always preemptible, it is -+ * still not allowed to call cond_resched() from within -+ * console drivers. The task may become non-preemptible in the -+ * console driver call chain. For example, vt_console_print() -+ * takes a spinlock and then can call into fbcon_redraw(), -+ * which can conditionally invoke cond_resched(). -+ */ -+ console_may_schedule = 0; ++#ifdef CONFIG_HAVE_NMI ++ count = 2; ++#endif + -+ if (kernel_sync_mode() && con->write_atomic) { -+ console_unlock(); -+ break; -+ } ++ d = kzalloc(sizeof(*d) * count, GFP_KERNEL); ++ if (!d) ++ goto err_out; + -+ if (!(con->flags & CON_EXTENDED) && dropped) { -+ dropped_len = snprintf(dropped_text, 64, -+ "** %lu printk messages dropped **\n", -+ dropped); -+ dropped = 0; ++ for (i = 0; i < count; i++) { ++ if (__alloc_atomic_data(&d[i], flags) != 0) ++ goto err_out; ++ } + -+ con->write(con, dropped_text, dropped_len); -+ printk_delay(r.info->level); -+ } ++ return d; ++err_out: ++ free_atomic_data(d); ++ return NULL; ++} ++#endif /* CONFIG_HAVE_ATOMIC_CONSOLE */ + -+ con->write(con, write_text, len); -+ if (len) -+ printk_delay(r.info->level); ++static void start_printk_kthread(struct console *con); + -+ latched_seq_write(&con->printk_seq, seq); + #else /* CONFIG_PRINTK */ + + #define CONSOLE_LOG_MAX 0 ++#define DROPPED_TEXT_MAX 0 + #define printk_time false + + #define prb_read_valid(rb, seq, r) false + #define prb_first_valid_seq(rb) 0 ++#define prb_next_seq(rb) 0 + -+ console_unlock(); -+ } -+ ret = 0; -+out: -+ kfree(dropped_text); -+ kfree(text); -+ kfree(ext_text); -+ pr_info("%sconsole [%s%d]: printing thread stopped\n", -+ (con->flags & CON_BOOT) ? "boot" : "", -+ con->name, con->index); -+ return ret; -+} ++#define free_atomic_data(d) --static size_t record_print_text(const struct printk_record *r, -- bool syslog, bool time) -+/* Must be called within console_lock(). */ -+static void start_printk_kthread(struct console *con) - { -- return 0; -+ con->thread = kthread_run(printk_kthread_func, con, -+ "pr/%s%d", con->name, con->index); -+ if (IS_ERR(con->thread)) { -+ pr_err("%sconsole [%s%d]: unable to start printing thread\n", -+ (con->flags & CON_BOOT) ? "boot" : "", -+ con->name, con->index); -+ return; -+ } -+ pr_info("%sconsole [%s%d]: printing thread started\n", -+ (con->flags & CON_BOOT) ? "boot" : "", -+ con->name, con->index); - } --static ssize_t info_print_ext_header(char *buf, size_t size, -- struct printk_info *info) -+ -+/* protected by console_lock */ -+static bool kthreads_started; -+ -+/* Must be called within console_lock(). */ -+static void console_try_thread(struct console *con) - { -- return 0; -+ if (kthreads_started) { -+ start_printk_kthread(con); -+ return; -+ } -+ -+ /* -+ * The printing threads have not been started yet. If this console -+ * can print synchronously, print all unprinted messages. -+ */ -+ if (console_may_sync(con)) { -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ print_sync_until(con, prb_next_seq(prb), true); -+ local_irq_restore(flags); -+ } - } --static ssize_t msg_print_ext_body(char *buf, size_t size, -- char *text, size_t text_len, -- struct dev_printk_info *dev_info) { return 0; } --static void console_lock_spinning_enable(void) { } --static int console_lock_spinning_disable_and_check(void) { return 0; } + static u64 syslog_seq; +-static u64 console_seq; +-static u64 exclusive_console_stop_seq; +-static unsigned long console_dropped; + + static size_t record_print_text(const struct printk_record *r, + bool syslog, bool time) +@@ -2298,9 +2422,10 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, + struct dev_printk_info *dev_info) { return 0; } + static void console_lock_spinning_enable(void) { } + static int console_lock_spinning_disable_and_check(void) { return 0; } -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) {} --static bool suppress_message_printing(int level) { return false; } ++static void call_console_driver(struct console *con, const char *text, size_t len, ++ char *dropped_text, bool atomic_printing) {} + static bool suppress_message_printing(int level) { return false; } ++static void start_printk_kthread(struct console *con) {} #endif /* CONFIG_PRINTK */ -@@ -2556,34 +2613,6 @@ int is_console_locked(void) +@@ -2476,6 +2601,7 @@ void suspend_console(void) + if (!console_suspend_enabled) + return; + pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); ++ pr_flush(1000, true); + console_lock(); + console_suspended = 1; + up_console_sem(); +@@ -2488,6 +2614,7 @@ void resume_console(void) + down_console_sem(); + console_suspended = 0; + console_unlock(); ++ pr_flush(1000, true); + } + + /** +@@ -2524,7 +2651,7 @@ void console_lock(void) + down_console_sem(); + if (console_suspended) + return; +- console_locked = 1; ++ pause_all_consoles(); + console_may_schedule = 1; + } + EXPORT_SYMBOL(console_lock); +@@ -2545,46 +2672,387 @@ int console_trylock(void) + up_console_sem(); + return 0; + } +- console_locked = 1; ++ if (!console_excl_trylock()) { ++ up_console_sem(); ++ return 0; ++ } + console_may_schedule = 0; + return 1; + } + EXPORT_SYMBOL(console_trylock); + ++/* ++ * This is used to help to make sure that certain paths within the VT code are ++ * running with the console lock held. It is definitely not the perfect debug ++ * tool (it is not known if the VT code is the task holding the console lock), ++ * but it helps tracking those weird code paths in the console code such as ++ * when the console is suspended: where the console is not locked but no ++ * console printing may occur. ++ * ++ * Note: This returns true when the console is suspended but is not locked. ++ * This is intentional because the VT code must consider that situation ++ * the same as if the console was locked. ++ */ + int is_console_locked(void) + { +- return console_locked; ++ return (consoles_paused || atomic_read(&console_lock_count)); } EXPORT_SYMBOL(is_console_locked); --/* + /* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. -- */ ++ * Check if the given console is currently capable and allowed to print ++ * records. ++ * ++ * Requires the console_lock. + */ -static int have_callable_console(void) --{ ++static inline bool console_is_usable(struct console *con, bool atomic_printing) + { - struct console *con; -- ++ if (!(con->flags & CON_ENABLED)) ++ return false; + - for_each_console(con) - if ((con->flags & CON_ENABLED) && - (con->flags & CON_ANYTIME)) - return 1; -- ++ if (atomic_printing) { ++#ifdef CONFIG_HAVE_ATOMIC_CONSOLE ++ if (!con->write_atomic) ++ return false; ++ if (!con->atomic_data) ++ return false; ++#else ++ return false; ++#endif ++ } else if (!con->write) { ++ return false; ++ } + - return 0; --} -- --/* ++ /* ++ * Console drivers may assume that per-cpu resources have been ++ * allocated. So unless they're explicitly marked as being able to ++ * cope (CON_ANYTIME) don't call them until per-cpu resources have ++ * been allocated. ++ */ ++ if (!printk_percpu_data_ready() && ++ !(con->flags & CON_ANYTIME)) ++ return false; ++ ++ return true; ++} ++ ++static void __console_unlock(void) ++{ ++ /* ++ * Depending on whether console_lock() or console_trylock() was used, ++ * appropriately allow the kthread printers to continue. ++ */ ++ if (consoles_paused) ++ unpause_all_consoles(); ++ else ++ console_excl_unlock(); ++ ++ /* Wake the kthread printers. */ ++ wake_up_klogd(); ++ ++ up_console_sem(); ++} ++ ++static u64 read_console_seq(struct console *con) ++{ ++#ifdef CONFIG_HAVE_ATOMIC_CONSOLE ++ unsigned long flags; ++ u64 seq2; ++ u64 seq; ++ ++ if (!con->atomic_data) ++ return con->seq; ++ ++ printk_cpu_sync_get_irqsave(flags); ++ ++ seq = con->seq; ++ seq2 = con->atomic_data[0].seq; ++ if (seq2 > seq) ++ seq = seq2; ++#ifdef CONFIG_HAVE_NMI ++ seq2 = con->atomic_data[1].seq; ++ if (seq2 > seq) ++ seq = seq2; ++#endif ++ ++ printk_cpu_sync_put_irqrestore(flags); ++ ++ return seq; ++#else /* CONFIG_HAVE_ATOMIC_CONSOLE */ ++ return con->seq; ++#endif ++} ++ ++static void write_console_seq(struct console *con, u64 val, bool atomic_printing) ++{ ++#ifdef CONFIG_HAVE_ATOMIC_CONSOLE ++ unsigned long flags; ++ u64 *seq; ++ ++ if (!con->atomic_data) { ++ con->seq = val; ++ return; ++ } ++ ++ printk_cpu_sync_get_irqsave(flags); ++ ++ if (atomic_printing) { ++ seq = &con->atomic_data[0].seq; ++#ifdef CONFIG_HAVE_NMI ++ if (in_nmi()) ++ seq = &con->atomic_data[1].seq; ++#endif ++ } else { ++ seq = &con->seq; ++ } ++ *seq = val; ++ ++ printk_cpu_sync_put_irqrestore(flags); ++#else /* CONFIG_HAVE_ATOMIC_CONSOLE */ ++ con->seq = val; ++#endif + } + + /* - * Can we actually use the console at this time on this cpu? -- * ++ * Print one record for the given console. The record printed is whatever ++ * record is the next available record for the given console. ++ * ++ * @text is a buffer of size CONSOLE_LOG_MAX. ++ * ++ * If extended messages should be printed, @ext_text is a buffer of size ++ * CONSOLE_EXT_LOG_MAX. Otherwise @ext_text must be NULL. ++ * ++ * If dropped messages should be printed, @dropped_text is a buffer of size ++ * DROPPED_TEXT_MAX. Otherise @dropped_text must be NULL. ++ * ++ * @atomic_printing specifies if atomic printing should be used. ++ * ++ * Requires the console_lock. + * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. -- */ ++ * Returns false if the given console has no next record to print, otherwise ++ * true. ++ * ++ * @handover will be set to true if a printk waiter has taken over the ++ * console_lock, in which case the caller is no longer holding the ++ * console_lock. A NULL pointer may be provided to disable allowing ++ * the console_lock to be taken over by a printk waiter. + */ -static inline int can_use_console(void) --{ ++static bool console_emit_next_record(struct console *con, char *text, char *ext_text, ++ char *dropped_text, bool atomic_printing, ++ bool *handover) + { - return cpu_online(raw_smp_processor_id()) || have_callable_console(); --} -- ++ struct printk_info info; ++ struct printk_record r; ++ unsigned long flags; ++ bool allow_handover; ++ char *write_text; ++ size_t len; ++ u64 seq; ++ ++ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); ++ ++ if (handover) ++ *handover = false; ++ ++ seq = read_console_seq(con); ++ ++ if (!prb_read_valid(prb, seq, &r)) ++ return false; ++ ++ if (seq != r.info->seq) { ++ atomic_long_add((unsigned long)(r.info->seq - seq), &con->dropped); ++ write_console_seq(con, r.info->seq, atomic_printing); ++ seq = r.info->seq; ++ } ++ ++ /* Skip record that has level above the console loglevel. */ ++ if (suppress_message_printing(r.info->level)) { ++ write_console_seq(con, seq + 1, atomic_printing); ++ goto skip; ++ } ++ ++ if (ext_text) { ++ write_text = ext_text; ++ len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info); ++ len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len, ++ &r.text_buf[0], r.info->text_len, &r.info->dev_info); ++ } else { ++ write_text = text; ++ len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); ++ } ++ ++#if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ /* PREEMPT_RT does not support console lock handovers. */ ++ allow_handover = false; ++#else ++ /* Handovers may only happen between trylock contexts. */ ++ allow_handover = (handover && atomic_read(&console_lock_count) == -1); ++#endif ++ ++ if (allow_handover) { ++ /* ++ * While actively printing out messages, if another printk() ++ * were to occur on another CPU, it may wait for this one to ++ * finish. This task can not be preempted if there is a ++ * waiter waiting to take over. ++ * ++ * Interrupts are disabled because the hand over to a waiter ++ * must not be interrupted until the hand over is completed ++ * (@console_waiter is cleared). ++ */ ++ printk_safe_enter_irqsave(flags); ++ console_lock_spinning_enable(); ++ } ++ ++ stop_critical_timings(); /* don't trace print latency */ ++ call_console_driver(con, write_text, len, dropped_text, atomic_printing); ++ start_critical_timings(); ++ ++ write_console_seq(con, seq + 1, atomic_printing); ++ ++ if (allow_handover) { ++ *handover = console_lock_spinning_disable_and_check(); ++ printk_safe_exit_irqrestore(flags); ++ } ++ ++ boot_delay_msec(r.info->level); ++ printk_delay(); ++skip: ++ return true; + } + ++/* ++ * Print out all remaining records to all consoles. ++ * ++ * Requires the console_lock. ++ * ++ * Returns true if a console was available for flushing, otherwise false. ++ * ++ * @next_seq is set to the highest sequence number of all of the consoles that ++ * were flushed. ++ * ++ * @handover will be set to true if a printk waiter has taken over the ++ * console_lock, in which case the caller is no longer holding the ++ * console_lock. ++ */ ++static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) ++{ ++ static char dropped_text[DROPPED_TEXT_MAX]; ++ static char ext_text[CONSOLE_EXT_LOG_MAX]; ++ static char text[CONSOLE_LOG_MAX]; ++ bool any_usable = false; ++ struct console *con; ++ bool any_progress; ++ ++ *next_seq = 0; ++ *handover = false; ++ ++ do { ++ /* Let the kthread printers do the work if they can. */ ++ if (kthread_printers_active()) ++ return false; ++ ++ any_progress = false; ++ ++ for_each_console(con) { ++ bool progress; ++ ++ if (!console_is_usable(con, false)) ++ continue; ++ if ((con->flags & CON_MIGHT_SLEEP) && !do_cond_resched) ++ continue; ++ any_usable = true; ++ ++ if (con->flags & CON_EXTENDED) { ++ /* Extended consoles do not print "dropped messages". */ ++ progress = console_emit_next_record(con, &text[0], ++ &ext_text[0], NULL, ++ false, handover); ++ } else { ++ progress = console_emit_next_record(con, &text[0], ++ NULL, &dropped_text[0], ++ false, handover); ++ } ++ if (*handover) ++ return true; ++ ++ /* Track the highest seq flushed. */ ++ if (con->seq > *next_seq) ++ *next_seq = con->seq; ++ ++ if (!progress) ++ continue; ++ any_progress = true; ++ ++ if (do_cond_resched) ++ cond_resched(); ++ } ++ } while (any_progress); ++ ++ return any_usable; ++} ++ ++#ifdef CONFIG_HAVE_ATOMIC_CONSOLE ++static void atomic_console_flush_all(void) ++{ ++ bool any_usable = false; ++ unsigned long flags; ++ struct console *con; ++ bool any_progress; ++ int index = 0; ++ ++ if (console_suspended) ++ return; ++ ++#ifdef CONFIG_HAVE_NMI ++ if (in_nmi()) ++ index = 1; ++#endif ++ ++ printk_cpu_sync_get_irqsave(flags); ++ ++ do { ++ any_progress = false; ++ ++ for_each_console(con) { ++ bool progress; ++ ++ if (!console_is_usable(con, true)) ++ continue; ++ any_usable = true; ++ ++ if (con->flags & CON_EXTENDED) { ++ /* Extended consoles do not print "dropped messages". */ ++ progress = console_emit_next_record(con, ++ &con->atomic_data->text[index], ++ &con->atomic_data->ext_text[index], ++ NULL, ++ true, NULL); ++ } else { ++ progress = console_emit_next_record(con, ++ &con->atomic_data->text[index], ++ NULL, ++ &con->atomic_data->dropped_text[index], ++ true, NULL); ++ } ++ ++ if (!progress) ++ continue; ++ any_progress = true; ++ ++ touch_softlockup_watchdog_sync(); ++ clocksource_touch_watchdog(); ++ rcu_cpu_stall_reset(); ++ touch_nmi_watchdog(); ++ } ++ } while (any_progress); ++ ++ printk_cpu_sync_put_irqrestore(flags); ++} ++#else /* CONFIG_HAVE_ATOMIC_CONSOLE */ ++#define atomic_console_flush_all() ++#endif ++ /** * console_unlock - unlock the console system * -@@ -2600,140 +2629,13 @@ static inline int can_use_console(void) +@@ -2601,21 +3069,16 @@ static inline int can_use_console(void) */ void console_unlock(void) { @@ -6961,7 +6104,11 @@ index 99221b016c68..0cc8e8acf545 100644 - struct printk_info info; - struct printk_record r; - u64 __maybe_unused next_seq; -- ++ bool do_cond_resched; ++ bool handover; ++ bool flushed; ++ u64 next_seq; + if (console_suspended) { up_console_sem(); return; @@ -6969,24 +6116,16 @@ index 99221b016c68..0cc8e8acf545 100644 - prb_rec_init_rd(&r, &info, text, sizeof(text)); - -- /* -- * Console drivers are called with interrupts disabled, so -- * @console_may_schedule should be cleared before; however, we may -- * end up dumping a lot of lines, for example, if called from -- * console registration path, and should invoke cond_resched() -- * between lines if allowable. Not doing so can cause a very long -- * scheduling stall on a slow console leading to RCU stall and -- * softlockup warnings which exacerbate the issue with more -- * messages practically incapacitating the system. -- * -- * console_trylock() is not able to detect the preemptive -- * context reliably. Therefore the value must be stored before -- * and cleared after the "again" goto label. -- */ -- do_cond_resched = console_may_schedule; + /* + * Console drivers are called with interrupts disabled, so + * @console_may_schedule should be cleared before; however, we may +@@ -2631,110 +3094,27 @@ void console_unlock(void) + * and cleared after the "again" goto label. + */ + do_cond_resched = console_may_schedule; -again: - console_may_schedule = 0; -- + - /* - * We released the console_sem lock, so we need to recheck if - * cpu is online and (if not) is there at least one CON_ANYTIME @@ -7002,11 +6141,15 @@ index 99221b016c68..0cc8e8acf545 100644 - size_t ext_len = 0; - int handover; - size_t len; -- ++ do { ++ console_may_schedule = 0; + -skip: - if (!prb_read_valid(prb, console_seq, &r)) -- break; -- ++ flushed = console_flush_all(do_cond_resched, &next_seq, &handover); ++ if (handover) + break; + - if (console_seq != r.info->seq) { - console_dropped += r.info->seq - console_seq; - console_seq = r.info->seq; @@ -7027,7 +6170,8 @@ index 99221b016c68..0cc8e8acf545 100644 - console_seq >= exclusive_console_stop_seq)) { - exclusive_console = NULL; - } -- ++ __console_unlock(); + - /* - * Handle extended console text first because later - * record_print_text() will modify the record buffer in-place. @@ -7046,9 +6190,12 @@ index 99221b016c68..0cc8e8acf545 100644 - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time); - console_seq++; -- -- /* -- * While actively printing out messages, if another printk() ++ /* Were there any consoles available for flushing? */ ++ if (!flushed) ++ break; + + /* +- * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. @@ -7056,7 +6203,11 @@ index 99221b016c68..0cc8e8acf545 100644 - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). -- */ ++ * Some context may have added new records after ++ * console_flush_all() but before unlocking the console. ++ * Re-check if there is a new record to flush. If the trylock ++ * fails, another context is already handling the printing. + */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); - @@ -7076,8 +6227,8 @@ index 99221b016c68..0cc8e8acf545 100644 - /* Get consistent value of the next-to-be-used sequence number. */ - next_seq = console_seq; - - console_locked = 0; - up_console_sem(); +- console_locked = 0; +- up_console_sem(); - - /* - * Someone could have filled up the buffer again, so re-check if there's @@ -7088,71 +6239,74 @@ index 99221b016c68..0cc8e8acf545 100644 - retry = prb_read_valid(prb, next_seq, NULL); - if (retry && console_trylock()) - goto again; ++ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } EXPORT_SYMBOL(console_unlock); -@@ -2783,18 +2685,20 @@ void console_unblank(void) +@@ -2765,10 +3145,15 @@ void console_unblank(void) + if (oops_in_progress) { + if (down_trylock_console_sem() != 0) + return; +- } else ++ if (!console_excl_trylock()) { ++ up_console_sem(); ++ return; ++ } ++ } else { ++ pr_flush(1000, true); + console_lock(); ++ } + +- console_locked = 1; + console_may_schedule = 0; + for_each_console(c) + if ((c->flags & CON_ENABLED) && c->unblank) +@@ -2784,6 +3169,11 @@ void console_unblank(void) */ void console_flush_on_panic(enum con_flush_mode mode) { -- /* -- * If someone else is holding the console lock, trylock will fail -- * and may_schedule may be set. Ignore and proceed to unlock so -- * that messages are flushed out. As this can be called from any -- * context and we don't want to get preempted while flushing, -- * ensure may_schedule is cleared. -- */ -- console_trylock(); -- console_may_schedule = 0; -+ if (!console_trylock()) ++ if (mode == CONSOLE_ATOMIC_FLUSH_PENDING) { ++ atomic_console_flush_all(); + return; ++ } + -+#ifdef CONFIG_PRINTK + /* + * If someone else is holding the console lock, trylock will fail + * and may_schedule may be set. Ignore and proceed to unlock so +@@ -2794,8 +3184,14 @@ void console_flush_on_panic(enum con_flush_mode mode) + console_trylock(); + console_may_schedule = 0; + +- if (mode == CONSOLE_REPLAY_ALL) +- console_seq = prb_first_valid_seq(prb); + if (mode == CONSOLE_REPLAY_ALL) { + struct console *c; + u64 seq; + + seq = prb_first_valid_seq(prb); + for_each_console(c) -+ latched_seq_write(&c->printk_seq, seq); ++ write_console_seq(c, seq, false); + } -+#endif - -- if (mode == CONSOLE_REPLAY_ALL) -- console_seq = prb_first_valid_seq(prb); console_unlock(); } -@@ -2930,6 +2834,7 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) - void register_console(struct console *newcon) +@@ -2826,6 +3222,7 @@ struct tty_driver *console_device(int *index) + */ + void console_stop(struct console *console) { - struct console *bcon = NULL; -+ u64 __maybe_unused seq = 0; - int err; - - for_each_console(bcon) { -@@ -2952,6 +2857,8 @@ void register_console(struct console *newcon) - } - } - -+ newcon->thread = NULL; -+ - if (console_drivers && console_drivers->flags & CON_BOOT) - bcon = console_drivers; - -@@ -2993,8 +2900,10 @@ void register_console(struct console *newcon) - * the real console are the same physical device, it's annoying to - * see the beginning boot messages twice - */ -- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) -+ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { - newcon->flags &= ~CON_PRINTBUFFER; -+ newcon->flags |= CON_HANDOVER; -+ } ++ pr_flush(1000, true); + console_lock(); + console->flags &= ~CON_ENABLED; + console_unlock(); +@@ -2837,6 +3234,7 @@ void console_start(struct console *console) + console_lock(); + console->flags |= CON_ENABLED; + console_unlock(); ++ pr_flush(1000, true); + } + EXPORT_SYMBOL(console_start); - /* - * Put this console in the list - keep the -@@ -3016,27 +2925,21 @@ void register_console(struct console *newcon) +@@ -3017,27 +3415,25 @@ void register_console(struct console *newcon) if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; @@ -7171,213 +6325,68 @@ index 99221b016c68..0cc8e8acf545 100644 - */ - exclusive_console = newcon; - exclusive_console_stop_seq = console_seq; -+#ifdef CONFIG_PRINTK -+ if (!(newcon->flags & CON_PRINTBUFFER)) -+ seq = prb_next_seq(prb); ++ if (consoles_paused) ++ newcon->flags |= CON_PAUSED; -- /* Get a consistent copy of @syslog_seq. */ -- mutex_lock(&syslog_lock); -- console_seq = syslog_seq; -- mutex_unlock(&syslog_lock); -- } -+ seqcount_latch_init(&newcon->printk_seq.latch); -+ latched_seq_write(&newcon->printk_seq, seq); -+ seqcount_latch_init(&newcon->printk_sync_seq.latch); -+ latched_seq_write(&newcon->printk_sync_seq, seq); -+#ifdef CONFIG_HAVE_NMI -+ seqcount_latch_init(&newcon->printk_sync_nmi_seq.latch); -+ latched_seq_write(&newcon->printk_sync_nmi_seq, seq); ++ atomic_long_set(&newcon->dropped, 0); ++#ifdef CONFIG_HAVE_ATOMIC_CONSOLE ++ newcon->atomic_data = NULL; +#endif -+ -+ console_try_thread(newcon); -+#endif /* CONFIG_PRINTK */ ++ mutex_init(&newcon->lock); ++ if (newcon->flags & CON_PRINTBUFFER) { + /* Get a consistent copy of @syslog_seq. */ + mutex_lock(&syslog_lock); +- console_seq = syslog_seq; ++ write_console_seq(newcon, syslog_seq, false); + mutex_unlock(&syslog_lock); ++ } else { ++ /* Begin with next message. */ ++ write_console_seq(newcon, prb_next_seq(prb), false); + } ++ if (kthreads_started) ++ start_printk_kthread(newcon); console_unlock(); console_sysfs_notify(); -@@ -3110,6 +3013,9 @@ int unregister_console(struct console *console) +@@ -3094,6 +3490,11 @@ int unregister_console(struct console *console) + } + } + ++ if (console->thread) { ++ kthread_stop(console->thread); ++ console->thread = NULL; ++ } ++ + if (res) + goto out_disable_unlock; + +@@ -3111,6 +3512,10 @@ int unregister_console(struct console *console) console_unlock(); console_sysfs_notify(); -+ if (console->thread && !IS_ERR(console->thread)) -+ kthread_stop(console->thread); ++#ifdef CONFIG_HAVE_ATOMIC_CONSOLE ++ free_atomic_data(console->atomic_data); ++#endif + if (console->exit) res = console->exit(console); -@@ -3192,6 +3098,15 @@ static int __init printk_late_init(void) - unregister_console(con); - } - } +@@ -3199,11 +3604,205 @@ static int __init printk_late_init(void) + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", + console_cpu_notify, NULL); + WARN_ON(ret < 0); + -+#ifdef CONFIG_PRINTK + console_lock(); + for_each_console(con) + start_printk_kthread(con); + kthreads_started = true; + console_unlock(); -+#endif + - ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, - console_cpu_notify); - WARN_ON(ret < 0); -@@ -3207,7 +3122,6 @@ late_initcall(printk_late_init); - * Delayed printk version, for scheduler-internal messages: - */ - #define PRINTK_PENDING_WAKEUP 0x01 --#define PRINTK_PENDING_OUTPUT 0x02 - - static DEFINE_PER_CPU(int, printk_pending); - -@@ -3215,14 +3129,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) - { - int pending = __this_cpu_xchg(printk_pending, 0); - -- if (pending & PRINTK_PENDING_OUTPUT) { -- /* If trylock fails, someone else is doing the printing */ -- if (console_trylock()) -- console_unlock(); -- } -- - if (pending & PRINTK_PENDING_WAKEUP) -- wake_up_interruptible(&log_wait); -+ wake_up_interruptible_all(&log_wait); - } - - static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = -@@ -3241,42 +3149,9 @@ void wake_up_klogd(void) - preempt_enable(); - } - --void defer_console_output(void) --{ -- if (!printk_percpu_data_ready()) -- return; -- -- preempt_disable(); -- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); -- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); -- preempt_enable(); --} -- - void printk_trigger_flush(void) - { -- defer_console_output(); --} -- --int vprintk_deferred(const char *fmt, va_list args) --{ -- int r; -- -- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); -- defer_console_output(); -- -- return r; --} -- --int _printk_deferred(const char *fmt, ...) --{ -- va_list args; -- int r; -- -- va_start(args, fmt); -- r = vprintk_deferred(fmt, args); -- va_end(args); -- -- return r; -+ wake_up_klogd(); + return 0; } + late_initcall(printk_late_init); - /* -@@ -3405,6 +3280,24 @@ void kmsg_dump(enum kmsg_dump_reason reason) - { - struct kmsg_dumper *dumper; - -+ if (!oops_in_progress) { -+ /* -+ * If atomic consoles are available, activate kernel sync mode -+ * to make sure any final messages are visible. The trailing -+ * printk message is important to flush any pending messages. -+ */ -+ if (have_atomic_console()) { -+ sync_mode = true; -+ pr_info("enabled sync mode\n"); -+ } -+ -+ /* -+ * Give the printing threads time to flush, allowing up to -+ * 1s of no printing forward progress before giving up. -+ */ -+ pr_flush(1000, true); -+ } -+ - rcu_read_lock(); - list_for_each_entry_rcu(dumper, &dump_list, list) { - enum kmsg_dump_reason max_reason = dumper->max_reason; -@@ -3587,6 +3480,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); - #ifdef CONFIG_SMP - static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); - static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); -+static unsigned int kgdb_cpu = -1; - - /** - * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant -@@ -3666,6 +3560,9 @@ EXPORT_SYMBOL(__printk_cpu_trylock); - */ - void __printk_cpu_unlock(void) - { -+ bool trigger_kgdb = false; -+ unsigned int cpu; -+ - if (atomic_read(&printk_cpulock_nested)) { - atomic_dec(&printk_cpulock_nested); - return; -@@ -3676,6 +3573,12 @@ void __printk_cpu_unlock(void) - * LMM(__printk_cpu_unlock:A) - */ - -+ cpu = smp_processor_id(); -+ if (kgdb_cpu == cpu) { -+ trigger_kgdb = true; -+ kgdb_cpu = -1; -+ } -+ - /* - * Guarantee loads and stores from this CPU when it was the - * lock owner are visible to the next lock owner. This pairs -@@ -3696,6 +3599,98 @@ void __printk_cpu_unlock(void) - */ - atomic_set_release(&printk_cpulock_owner, - -1); /* LMM(__printk_cpu_unlock:B) */ -+ -+ if (trigger_kgdb) { -+ pr_warn("re-triggering kgdb roundup for CPU#%d\n", cpu); -+ kgdb_roundup_cpu(cpu); -+ } - } - EXPORT_SYMBOL(__printk_cpu_unlock); -+ -+bool kgdb_roundup_delay(unsigned int cpu) -+{ -+ if (cpu != atomic_read(&printk_cpulock_owner)) -+ return false; -+ -+ kgdb_cpu = cpu; -+ return true; -+} -+EXPORT_SYMBOL(kgdb_roundup_delay); - #endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_PRINTK -+static void pr_msleep(bool may_sleep, int ms) -+{ -+ if (may_sleep) { -+ msleep(ms); -+ } else { -+ while (ms--) -+ udelay(1000); -+ } -+} -+ + #if defined CONFIG_PRINTK +/** + * pr_flush() - Wait for printing threads to catch up. + * @@ -7390,7 +6399,7 @@ index 99221b016c68..0cc8e8acf545 100644 + * If @reset_on_progress is true, the timeout will be reset whenever any + * printer has been seen to make some forward progress. + * -+ * Context: Any context. ++ * Context: Process context. May sleep while acquiring console lock. + * Return: true if all enabled printers are caught up. + */ +bool pr_flush(int timeout_ms, bool reset_on_progress) @@ -7398,27 +6407,26 @@ index 99221b016c68..0cc8e8acf545 100644 + int remaining = timeout_ms; + struct console *con; + u64 last_diff = 0; -+ bool may_sleep; + u64 printk_seq; + u64 diff; + u64 seq; + -+ may_sleep = (preemptible() && -+ !in_softirq() && -+ system_state >= SYSTEM_RUNNING); ++ might_sleep(); + + seq = prb_next_seq(prb); + + for (;;) { + diff = 0; + ++ console_lock(); + for_each_console(con) { -+ if (!(con->flags & CON_ENABLED)) ++ if (!console_is_usable(con, false)) + continue; -+ printk_seq = read_console_seq(con); ++ printk_seq = con->seq; + if (printk_seq < seq) + diff += seq - printk_seq; + } ++ console_unlock(); + + if (diff != last_diff && reset_on_progress) + remaining = timeout_ms; @@ -7427,12 +6435,13 @@ index 99221b016c68..0cc8e8acf545 100644 + break; + + if (remaining < 0) { -+ pr_msleep(may_sleep, 100); ++ /* no timeout limit */ ++ msleep(100); + } else if (remaining < 100) { -+ pr_msleep(may_sleep, remaining); ++ msleep(remaining); + remaining = 0; + } else { -+ pr_msleep(may_sleep, 100); ++ msleep(100); + remaining -= 100; + } + @@ -7442,143 +6451,370 @@ index 99221b016c68..0cc8e8acf545 100644 + return (diff == 0); +} +EXPORT_SYMBOL(pr_flush); -+#endif /* CONFIG_PRINTK */ -diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c -deleted file mode 100644 -index ef0f9a2044da..000000000000 ---- a/kernel/printk/printk_safe.c -+++ /dev/null -@@ -1,52 +0,0 @@ --// SPDX-License-Identifier: GPL-2.0-or-later --/* -- * printk_safe.c - Safe printk for printk-deadlock-prone contexts -- */ -- --#include <linux/preempt.h> --#include <linux/kdb.h> --#include <linux/smp.h> --#include <linux/cpumask.h> --#include <linux/printk.h> --#include <linux/kprobes.h> -- --#include "internal.h" -- --static DEFINE_PER_CPU(int, printk_context); -- --/* Can be preempted by NMI. */ --void __printk_safe_enter(void) --{ -- this_cpu_inc(printk_context); --} -- --/* Can be preempted by NMI. */ --void __printk_safe_exit(void) --{ -- this_cpu_dec(printk_context); --} -- --asmlinkage int vprintk(const char *fmt, va_list args) --{ --#ifdef CONFIG_KGDB_KDB -- /* Allow to pass printk() to kdb but avoid a recursion. */ -- if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) -- return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); --#endif -- -- /* -- * Use the main logbuf even in NMI. But avoid calling console -- * drivers that might have their own locks. -- */ -- if (this_cpu_read(printk_context) || in_nmi()) { -- int len; -- -- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); -- defer_console_output(); -- return len; -- } -- -- /* No obstacles. */ -- return vprintk_default(fmt, args); --} --EXPORT_SYMBOL(vprintk); -diff --git a/kernel/ptrace.c b/kernel/ptrace.c -index f8589bf8d7dc..df08e8e64a83 100644 ---- a/kernel/ptrace.c -+++ b/kernel/ptrace.c -@@ -197,7 +197,18 @@ static bool ptrace_freeze_traced(struct task_struct *task) - spin_lock_irq(&task->sighand->siglock); - if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && - !__fatal_signal_pending(task)) { -+#ifdef CONFIG_PREEMPT_RT -+ unsigned long flags; + -+ raw_spin_lock_irqsave(&task->pi_lock, flags); -+ if (READ_ONCE(task->__state) & __TASK_TRACED) -+ WRITE_ONCE(task->__state, __TASK_TRACED); -+ else -+ task->saved_state = __TASK_TRACED; -+ raw_spin_unlock_irqrestore(&task->pi_lock, flags); -+#else - WRITE_ONCE(task->__state, __TASK_TRACED); -+#endif - ret = true; - } - spin_unlock_irq(&task->sighand->siglock); -@@ -207,7 +218,11 @@ static bool ptrace_freeze_traced(struct task_struct *task) - - static void ptrace_unfreeze_traced(struct task_struct *task) - { -- if (READ_ONCE(task->__state) != __TASK_TRACED) -+ unsigned long flags; -+ bool frozen = true; ++static bool printer_should_wake(struct console *con, u64 seq) ++{ ++ short flags; + -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && -+ READ_ONCE(task->__state) != __TASK_TRACED) - return; - - WARN_ON(!task->ptrace || task->parent != current); -@@ -217,12 +232,21 @@ static void ptrace_unfreeze_traced(struct task_struct *task) - * Recheck state under the lock to close this race. - */ - spin_lock_irq(&task->sighand->siglock); -- if (READ_ONCE(task->__state) == __TASK_TRACED) { -- if (__fatal_signal_pending(task)) -- wake_up_state(task, __TASK_TRACED); -- else -- WRITE_ONCE(task->__state, TASK_TRACED); -- } -+ raw_spin_lock_irqsave(&task->pi_lock, flags); -+ if (READ_ONCE(task->__state) == __TASK_TRACED) -+ WRITE_ONCE(task->__state, TASK_TRACED); ++ if (kthread_should_stop()) ++ return true; + -+#ifdef CONFIG_PREEMPT_RT -+ else if (task->saved_state == __TASK_TRACED) -+ task->saved_state = TASK_TRACED; ++ /* ++ * This is an unsafe read to con->flags, but false positives ++ * are not an issue as long as they are rare. ++ */ ++ flags = data_race(READ_ONCE(con->flags)); ++ ++ if (!(flags & CON_ENABLED) || ++ (flags & CON_PAUSED) || ++ atomic_read(&console_lock_count) == -1) { ++ return false; ++ } ++ ++ return prb_read_valid(prb, seq, NULL); ++} ++ ++static int printk_kthread_func(void *data) ++{ ++ struct console *con = data; ++ char *dropped_text = NULL; ++ char *ext_text = NULL; ++ bool progress; ++ u64 seq = 0; ++ char *text; ++ int error; ++ ++ pr_info("%sconsole [%s%d]: printing thread started\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ ++#ifdef CONFIG_HAVE_ATOMIC_CONSOLE ++ if (con->write_atomic) ++ con->atomic_data = alloc_atomic_data(con->flags); +#endif -+ else -+ frozen = false; -+ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + -+ if (frozen && __fatal_signal_pending(task)) -+ wake_up_state(task, __TASK_TRACED); ++ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); ++ if (!text) ++ goto out; + - spin_unlock_irq(&task->sighand->siglock); - } - -diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h -index 6591914af486..a404897d826f 100644 ---- a/kernel/rcu/tasks.h -+++ b/kernel/rcu/tasks.h -@@ -1347,7 +1347,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) - rttd->notrun = true; - } - --static void rcu_tasks_initiate_self_tests(void) -+void rcu_tasks_initiate_self_tests(void) ++ if (con->flags & CON_EXTENDED) { ++ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); ++ if (!ext_text) ++ goto out; ++ } else { ++ dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL); ++ if (!dropped_text) ++ goto out; ++ } ++ ++ for (;;) { ++ error = wait_event_interruptible(log_wait, printer_should_wake(con, seq)); ++ ++ if (kthread_should_stop()) ++ break; ++ ++ if (error) ++ continue; ++ ++ do { ++ error = mutex_lock_interruptible(&con->lock); ++ if (error) ++ break; ++ ++ if (!console_is_usable(con, false)) { ++ mutex_unlock(&con->lock); ++ break; ++ } ++ ++ if ((con->flags & CON_PAUSED) || !console_printer_tryenter()) { ++ mutex_unlock(&con->lock); ++ break; ++ } ++ ++ /* ++ * Even though the printk kthread is always preemptible, it is ++ * still not allowed to call cond_resched() from within ++ * console drivers. The task may become non-preemptible in the ++ * console driver call chain. For example, vt_console_print() ++ * takes a spinlock and then can call into fbcon_redraw(), ++ * which can conditionally invoke cond_resched(). ++ */ ++ console_may_schedule = 0; ++ progress = console_emit_next_record(con, text, ext_text, ++ dropped_text, false, NULL); ++ ++ seq = con->seq; ++ ++ console_printer_exit(); ++ ++ mutex_unlock(&con->lock); ++ } while (progress); ++ } ++out: ++ kfree(dropped_text); ++ kfree(ext_text); ++ kfree(text); ++ pr_info("%sconsole [%s%d]: printing thread stopped\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return 0; ++} ++ ++/* Must be called within console_lock(). */ ++static void start_printk_kthread(struct console *con) ++{ ++ con->thread = kthread_run(printk_kthread_func, con, ++ "pr/%s%d", con->name, con->index); ++ if (IS_ERR(con->thread)) { ++ con->thread = NULL; ++ pr_err("%sconsole [%s%d]: unable to start printing thread\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return; ++ } ++} ++ + /* + * Delayed printk version, for scheduler-internal messages: + */ +@@ -3223,7 +3822,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) + } + + if (pending & PRINTK_PENDING_WAKEUP) +- wake_up_interruptible(&log_wait); ++ wake_up_interruptible_all(&log_wait); + } + + static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = +@@ -3586,26 +4185,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + #endif + + #ifdef CONFIG_SMP +-static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); +-static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); ++static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); ++static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); + + /** +- * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant +- * spinning lock is not owned by any CPU. ++ * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant ++ * spinning lock is not owned by any CPU. + * + * Context: Any context. + */ +-void __printk_wait_on_cpu_lock(void) ++void __printk_cpu_sync_wait(void) + { + do { + cpu_relax(); +- } while (atomic_read(&printk_cpulock_owner) != -1); ++ } while (atomic_read(&printk_cpu_sync_owner) != -1); + } +-EXPORT_SYMBOL(__printk_wait_on_cpu_lock); ++EXPORT_SYMBOL(__printk_cpu_sync_wait); + + /** +- * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant +- * spinning lock. ++ * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant ++ * spinning lock. + * + * If no processor has the lock, the calling processor takes the lock and + * becomes the owner. If the calling processor is already the owner of the +@@ -3614,7 +4213,7 @@ EXPORT_SYMBOL(__printk_wait_on_cpu_lock); + * Context: Any context. Expects interrupts to be disabled. + * Return: 1 on success, otherwise 0. + */ +-int __printk_cpu_trylock(void) ++int __printk_cpu_sync_try_get(void) + { + int cpu; + int old; +@@ -3624,79 +4223,80 @@ int __printk_cpu_trylock(void) + /* + * Guarantee loads and stores from this CPU when it is the lock owner + * are _not_ visible to the previous lock owner. This pairs with +- * __printk_cpu_unlock:B. ++ * __printk_cpu_sync_put:B. + * + * Memory barrier involvement: + * +- * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then +- * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B. ++ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, ++ * then __printk_cpu_sync_put:A can never read from ++ * __printk_cpu_sync_try_get:B. + * + * Relies on: + * +- * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B ++ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B + * of the previous CPU + * matching +- * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B +- * of this CPU ++ * ACQUIRE from __printk_cpu_sync_try_get:A to ++ * __printk_cpu_sync_try_get:B of this CPU + */ +- old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1, +- cpu); /* LMM(__printk_cpu_trylock:A) */ ++ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, ++ cpu); /* LMM(__printk_cpu_sync_try_get:A) */ + if (old == -1) { + /* + * This CPU is now the owner and begins loading/storing +- * data: LMM(__printk_cpu_trylock:B) ++ * data: LMM(__printk_cpu_sync_try_get:B) + */ + return 1; + + } else if (old == cpu) { + /* This CPU is already the owner. */ +- atomic_inc(&printk_cpulock_nested); ++ atomic_inc(&printk_cpu_sync_nested); + return 1; + } + + return 0; + } +-EXPORT_SYMBOL(__printk_cpu_trylock); ++EXPORT_SYMBOL(__printk_cpu_sync_try_get); + + /** +- * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock. ++ * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. + * + * The calling processor must be the owner of the lock. + * + * Context: Any context. Expects interrupts to be disabled. + */ +-void __printk_cpu_unlock(void) ++void __printk_cpu_sync_put(void) + { +- if (atomic_read(&printk_cpulock_nested)) { +- atomic_dec(&printk_cpulock_nested); ++ if (atomic_read(&printk_cpu_sync_nested)) { ++ atomic_dec(&printk_cpu_sync_nested); + return; + } + + /* + * This CPU is finished loading/storing data: +- * LMM(__printk_cpu_unlock:A) ++ * LMM(__printk_cpu_sync_put:A) + */ + + /* + * Guarantee loads and stores from this CPU when it was the + * lock owner are visible to the next lock owner. This pairs +- * with __printk_cpu_trylock:A. ++ * with __printk_cpu_sync_try_get:A. + * + * Memory barrier involvement: + * +- * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, +- * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A. ++ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, ++ * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. + * + * Relies on: + * +- * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B ++ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B + * of this CPU + * matching +- * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B +- * of the next CPU ++ * ACQUIRE from __printk_cpu_sync_try_get:A to ++ * __printk_cpu_sync_try_get:B of the next CPU + */ +- atomic_set_release(&printk_cpulock_owner, +- -1); /* LMM(__printk_cpu_unlock:B) */ ++ atomic_set_release(&printk_cpu_sync_owner, ++ -1); /* LMM(__printk_cpu_sync_put:B) */ + } +-EXPORT_SYMBOL(__printk_cpu_unlock); ++EXPORT_SYMBOL(__printk_cpu_sync_put); + #endif /* CONFIG_SMP */ +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index f8589bf8d7dc..df08e8e64a83 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -197,7 +197,18 @@ static bool ptrace_freeze_traced(struct task_struct *task) + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && + !__fatal_signal_pending(task)) { ++#ifdef CONFIG_PREEMPT_RT ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (READ_ONCE(task->__state) & __TASK_TRACED) ++ WRITE_ONCE(task->__state, __TASK_TRACED); ++ else ++ task->saved_state = __TASK_TRACED; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++#else + WRITE_ONCE(task->__state, __TASK_TRACED); ++#endif + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); +@@ -207,7 +218,11 @@ static bool ptrace_freeze_traced(struct task_struct *task) + + static void ptrace_unfreeze_traced(struct task_struct *task) + { +- if (READ_ONCE(task->__state) != __TASK_TRACED) ++ unsigned long flags; ++ bool frozen = true; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && ++ READ_ONCE(task->__state) != __TASK_TRACED) + return; + + WARN_ON(!task->ptrace || task->parent != current); +@@ -217,12 +232,21 @@ static void ptrace_unfreeze_traced(struct task_struct *task) + * Recheck state under the lock to close this race. + */ + spin_lock_irq(&task->sighand->siglock); +- if (READ_ONCE(task->__state) == __TASK_TRACED) { +- if (__fatal_signal_pending(task)) +- wake_up_state(task, __TASK_TRACED); +- else +- WRITE_ONCE(task->__state, TASK_TRACED); +- } ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (READ_ONCE(task->__state) == __TASK_TRACED) ++ WRITE_ONCE(task->__state, TASK_TRACED); ++ ++#ifdef CONFIG_PREEMPT_RT ++ else if (task->saved_state == __TASK_TRACED) ++ task->saved_state = TASK_TRACED; ++#endif ++ else ++ frozen = false; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++ ++ if (frozen && __fatal_signal_pending(task)) ++ wake_up_state(task, __TASK_TRACED); ++ + spin_unlock_irq(&task->sighand->siglock); + } + +diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h +index 7da3c81c3f59..7f9d3df35854 100644 +--- a/kernel/rcu/tasks.h ++++ b/kernel/rcu/tasks.h +@@ -1345,7 +1345,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) + rttd->notrun = true; + } + +-static void rcu_tasks_initiate_self_tests(void) ++void rcu_tasks_initiate_self_tests(void) { pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU -@@ -1384,9 +1384,7 @@ static int rcu_tasks_verify_self_tests(void) +@@ -1382,9 +1382,7 @@ static int rcu_tasks_verify_self_tests(void) return ret; } late_initcall(rcu_tasks_verify_self_tests); @@ -7589,7 +6825,7 @@ index 6591914af486..a404897d826f 100644 void __init rcu_init_tasks_generic(void) { -@@ -1401,9 +1399,6 @@ void __init rcu_init_tasks_generic(void) +@@ -1399,9 +1397,6 @@ void __init rcu_init_tasks_generic(void) #ifdef CONFIG_TASKS_TRACE_RCU rcu_spawn_tasks_trace_kthread(); #endif @@ -7600,10 +6836,10 @@ index 6591914af486..a404897d826f 100644 #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c -index bdd1dc6de71a..9a04550cc54b 100644 +index ef8d36f580fc..44fb12fc7b82 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c -@@ -2278,13 +2278,13 @@ rcu_report_qs_rdp(struct rcu_data *rdp) +@@ -2276,13 +2276,13 @@ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -7619,7 +6855,7 @@ index bdd1dc6de71a..9a04550cc54b 100644 if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || rdp->gpwrap) { -@@ -2446,7 +2446,7 @@ static void rcu_do_batch(struct rcu_data *rdp) +@@ -2444,7 +2444,7 @@ static void rcu_do_batch(struct rcu_data *rdp) int div; bool __maybe_unused empty; unsigned long flags; @@ -7628,7 +6864,7 @@ index bdd1dc6de71a..9a04550cc54b 100644 struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count = 0; -@@ -2472,6 +2472,7 @@ static void rcu_do_batch(struct rcu_data *rdp) +@@ -2470,6 +2470,7 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); @@ -7637,22 +6873,10 @@ index bdd1dc6de71a..9a04550cc54b 100644 div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 0d12ec7be301..39adf3a8067b 100644 +index 77563109c0ea..04165fa6ff25 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -74,7 +74,11 @@ __read_mostly int sysctl_resched_latency_warn_once = 1; - * Number of tasks to iterate in a single balance run. - * Limited because this is done with IRQs disabled. - */ -+#ifdef CONFIG_PREEMPT_RT -+const_debug unsigned int sysctl_sched_nr_migrate = 8; -+#else - const_debug unsigned int sysctl_sched_nr_migrate = 32; -+#endif - - /* - * period over which we measure -rt task CPU usage in us. -@@ -982,6 +986,46 @@ void resched_curr(struct rq *rq) +@@ -986,6 +986,46 @@ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } @@ -7699,7 +6923,7 @@ index 0d12ec7be301..39adf3a8067b 100644 void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); -@@ -2137,6 +2181,7 @@ void migrate_disable(void) +@@ -2160,6 +2200,7 @@ void migrate_disable(void) preempt_disable(); this_rq()->nr_pinned++; p->migration_disabled = 1; @@ -7707,7 +6931,7 @@ index 0d12ec7be301..39adf3a8067b 100644 preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); -@@ -2148,6 +2193,8 @@ void migrate_enable(void) +@@ -2171,6 +2212,8 @@ void migrate_enable(void) if (p->migration_disabled > 1) { p->migration_disabled--; return; @@ -7716,7 +6940,7 @@ index 0d12ec7be301..39adf3a8067b 100644 } /* -@@ -2165,6 +2212,7 @@ void migrate_enable(void) +@@ -2188,6 +2231,7 @@ void migrate_enable(void) barrier(); p->migration_disabled = 0; this_rq()->nr_pinned--; @@ -7724,19 +6948,7 @@ index 0d12ec7be301..39adf3a8067b 100644 preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); -@@ -2944,9 +2992,8 @@ void force_compatible_cpus_allowed_ptr(struct task_struct *p) - - out_set_mask: - if (printk_ratelimit()) { -- printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", -- task_pid_nr(p), p->comm, -- cpumask_pr_args(override_mask)); -+ printk("Overriding affinity for process %d (%s) to CPUs %*pbl\n", -+ task_pid_nr(p), p->comm, cpumask_pr_args(override_mask)); - } - - WARN_ON(set_cpus_allowed_ptr(p, override_mask)); -@@ -3202,7 +3249,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3225,7 +3269,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * is actually now running somewhere else! */ while (task_running(rq, p)) { @@ -7745,7 +6957,7 @@ index 0d12ec7be301..39adf3a8067b 100644 return 0; cpu_relax(); } -@@ -3217,7 +3264,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3240,7 +3284,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; @@ -7754,27 +6966,7 @@ index 0d12ec7be301..39adf3a8067b 100644 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); -@@ -3251,7 +3298,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state - ktime_t to = NSEC_PER_SEC / HZ; - - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); - continue; - } - -@@ -3376,8 +3423,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { -- printk_deferred("process %d (%s) no longer affine to cpu%d\n", -- task_pid_nr(p), p->comm, cpu); -+ printk("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); - } - } - -@@ -4384,6 +4431,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4418,6 +4462,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->on_cpu = 0; #endif init_task_preempt_count(p); @@ -7784,33 +6976,21 @@ index 0d12ec7be301..39adf3a8067b 100644 #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -@@ -4840,20 +4890,18 @@ static struct rq *finish_task_switch(struct task_struct *prev) - */ - if (mm) { - membarrier_mm_sync_core_before_usermode(mm); -- mmdrop(mm); -+ mmdrop_sched(mm); - } - if (unlikely(prev_state == TASK_DEAD)) { +@@ -4880,8 +4927,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); - /* -- * Remove function-return probe instances associated with this -- * task and put them back on the free list. -+ * Release VMAP'ed task stack immediate for reuse. On RT -+ * enabled kernels this is delayed for latency reasons. - */ -- kprobe_flush_task(prev); -- - /* Task is done with its stack. */ - put_task_stack(prev); -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) -+ put_task_stack(prev); ++ /* ++ * Cache only the VMAP stack. The final deallocation is in ++ * delayed_put_task_struct. ++ */ ++ put_task_stack_sched(prev); put_task_struct_rcu_user(prev); } -@@ -6254,6 +6302,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) +@@ -6216,6 +6266,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); @@ -7818,7 +6998,7 @@ index 0d12ec7be301..39adf3a8067b 100644 clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -@@ -6471,6 +6520,30 @@ static void __sched notrace preempt_schedule_common(void) +@@ -6427,6 +6478,30 @@ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } @@ -7849,7 +7029,7 @@ index 0d12ec7be301..39adf3a8067b 100644 #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption -@@ -6484,7 +6557,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) +@@ -6440,7 +6515,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; @@ -7859,7 +7039,7 @@ index 0d12ec7be301..39adf3a8067b 100644 preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); -@@ -6517,6 +6591,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) +@@ -6473,6 +6549,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) if (likely(!preemptible())) return; @@ -7869,7 +7049,7 @@ index 0d12ec7be301..39adf3a8067b 100644 do { /* * Because the function tracer can trace preempt_count_sub() -@@ -8675,7 +8752,9 @@ void __init init_idle(struct task_struct *idle, int cpu) +@@ -8653,7 +8732,9 @@ void __init init_idle(struct task_struct *idle, int cpu) /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -7880,144 +7060,11 @@ index 0d12ec7be301..39adf3a8067b 100644 /* * The idle tasks have their own, simple scheduling class: */ -@@ -9469,14 +9548,8 @@ void __init sched_init(void) - } - - #ifdef CONFIG_DEBUG_ATOMIC_SLEEP --static inline int preempt_count_equals(int preempt_offset) --{ -- int nested = preempt_count() + rcu_preempt_depth(); -- -- return (nested == preempt_offset); --} - --void __might_sleep(const char *file, int line, int preempt_offset) -+void __might_sleep(const char *file, int line) - { - unsigned int state = get_current_state(); - /* -@@ -9490,11 +9563,32 @@ void __might_sleep(const char *file, int line, int preempt_offset) - (void *)current->task_state_change, - (void *)current->task_state_change); - -- ___might_sleep(file, line, preempt_offset); -+ __might_resched(file, line, 0); - } - EXPORT_SYMBOL(__might_sleep); - --void ___might_sleep(const char *file, int line, int preempt_offset) -+static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) -+{ -+ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) -+ return; -+ -+ if (preempt_count() == preempt_offset) -+ return; -+ -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, ip); -+} -+ -+static inline bool resched_offsets_ok(unsigned int offsets) -+{ -+ unsigned int nested = preempt_count(); -+ -+ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; -+ -+ return nested == offsets; -+} -+ -+void __might_resched(const char *file, int line, unsigned int offsets) - { - /* Ratelimiting timestamp: */ - static unsigned long prev_jiffy; -@@ -9504,7 +9598,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) - /* WARN_ON_ONCE() by default, no rate limit required: */ - rcu_sleep_check(); - -- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ if ((resched_offsets_ok(offsets) && !irqs_disabled() && - !is_idle_task(current) && !current->non_block_count) || - system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || - oops_in_progress) -@@ -9517,29 +9611,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset) - /* Save this before calling printk(), since that will clobber it: */ - preempt_disable_ip = get_preempt_disable_ip(current); - -- printk(KERN_ERR -- "BUG: sleeping function called from invalid context at %s:%d\n", -- file, line); -- printk(KERN_ERR -- "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -- in_atomic(), irqs_disabled(), current->non_block_count, -- current->pid, current->comm); -+ pr_err("BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ pr_err("preempt_count: %x, expected: %x\n", preempt_count(), -+ offsets & MIGHT_RESCHED_PREEMPT_MASK); -+ -+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { -+ pr_err("RCU nest depth: %d, expected: %u\n", -+ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); -+ } - - if (task_stack_end_corrupted(current)) -- printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ pr_emerg("Thread overran stack, or stack corrupted\n"); - - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); -- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -- && !preempt_count_equals(preempt_offset)) { -- pr_err("Preemption disabled at:"); -- print_ip_sym(KERN_ERR, preempt_disable_ip); -- } -+ -+ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, -+ preempt_disable_ip); -+ - dump_stack(); - add_taint(TAINT_WARN, LOCKDEP_STILL_OK); - } --EXPORT_SYMBOL(___might_sleep); -+EXPORT_SYMBOL(__might_resched); - - void __cant_sleep(const char *file, int line, int preempt_offset) - { -diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c -index e94314633b39..fd7c4f972aaf 100644 ---- a/kernel/sched/deadline.c -+++ b/kernel/sched/deadline.c -@@ -800,7 +800,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) - * entity. - */ - if (dl_time_before(dl_se->deadline, rq_clock(rq))) { -- printk_deferred_once("sched: DL replenish lagged too much\n"); -+ printk_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; - } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 6f16dfb74246..f2d0fb3ac43f 100644 +index 6e476f6d9435..22a0d3a8c760 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c -@@ -4237,10 +4237,7 @@ static inline void check_schedstat_required(void) - trace_sched_stat_iowait_enabled() || - trace_sched_stat_blocked_enabled() || - trace_sched_stat_runtime_enabled()) { -- printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " -- "stat_blocked and stat_runtime require the " -- "kernel parameter schedstats=enable or " -- "kernel.sched_schedstats=1\n"); -+ printk_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n"); - } - #endif - } -@@ -4448,7 +4445,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +@@ -4393,7 +4393,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { @@ -8026,7 +7073,7 @@ index 6f16dfb74246..f2d0fb3ac43f 100644 /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. -@@ -4472,7 +4469,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +@@ -4417,7 +4417,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) @@ -8035,7 +7082,7 @@ index 6f16dfb74246..f2d0fb3ac43f 100644 } static void -@@ -4615,7 +4612,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +@@ -4563,7 +4563,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { @@ -8044,7 +7091,7 @@ index 6f16dfb74246..f2d0fb3ac43f 100644 return; } /* -@@ -4755,7 +4752,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +@@ -4712,7 +4712,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) @@ -8053,7 +7100,7 @@ index 6f16dfb74246..f2d0fb3ac43f 100644 } static __always_inline -@@ -5518,7 +5515,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +@@ -5475,7 +5475,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (task_current(rq, p)) @@ -8062,7 +7109,7 @@ index 6f16dfb74246..f2d0fb3ac43f 100644 return; } hrtick_start(rq, delta); -@@ -7208,7 +7205,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7172,7 +7172,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: @@ -8071,7 +7118,7 @@ index 6f16dfb74246..f2d0fb3ac43f 100644 /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved -@@ -11109,7 +11106,7 @@ static void task_fork_fair(struct task_struct *p) +@@ -11207,7 +11207,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); @@ -8080,7 +7127,7 @@ index 6f16dfb74246..f2d0fb3ac43f 100644 } se->vruntime -= cfs_rq->min_vruntime; -@@ -11136,7 +11133,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +@@ -11234,7 +11234,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (task_current(rq, p)) { if (p->prio > oldprio) @@ -8090,79 +7137,24 @@ index 6f16dfb74246..f2d0fb3ac43f 100644 check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 7f8dace0964c..d5cee51819bf 100644 +index 1cf435bbcd9c..d5cee51819bf 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h -@@ -46,11 +46,19 @@ SCHED_FEAT(DOUBLE_TICK, false) - */ - SCHED_FEAT(NONTASK_CAPACITY, true) +@@ -48,6 +48,9 @@ SCHED_FEAT(NONTASK_CAPACITY, true) -+#ifdef CONFIG_PREEMPT_RT -+SCHED_FEAT(TTWU_QUEUE, false) + #ifdef CONFIG_PREEMPT_RT + SCHED_FEAT(TTWU_QUEUE, false) +# ifdef CONFIG_PREEMPT_LAZY +SCHED_FEAT(PREEMPT_LAZY, true) +# endif -+#else -+ - /* - * Queue remote wakeups on the target CPU and process them - * using the scheduler IPI. Reduces rq->lock contention/bounces. - */ - SCHED_FEAT(TTWU_QUEUE, true) -+#endif + #else /* - * When doing wakeups, attempt to limit superfluous scans of the LLC domain. -diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c -index 1652f2bb54b7..ab54b2012469 100644 ---- a/kernel/sched/psi.c -+++ b/kernel/sched/psi.c -@@ -710,10 +710,10 @@ static void psi_group_change(struct psi_group *group, int cpu, - if (groupc->tasks[t]) { - groupc->tasks[t]--; - } else if (!psi_bug) { -- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", -- cpu, t, groupc->tasks[0], -- groupc->tasks[1], groupc->tasks[2], -- groupc->tasks[3], clear, set); -+ pr_err("psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", -+ cpu, t, groupc->tasks[0], -+ groupc->tasks[1], groupc->tasks[2], -+ groupc->tasks[3], clear, set); - psi_bug = 1; - } - } -@@ -779,9 +779,9 @@ static void psi_flags_change(struct task_struct *task, int clear, int set) - if (((task->psi_flags & set) || - (task->psi_flags & clear) != clear) && - !psi_bug) { -- printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", -- task->pid, task->comm, task_cpu(task), -- task->psi_flags, clear, set); -+ pr_err("psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", -+ task->pid, task->comm, task_cpu(task), -+ task->psi_flags, clear, set); - psi_bug = 1; - } - -diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c -index bfef3f39b555..ef8228d19382 100644 ---- a/kernel/sched/rt.c -+++ b/kernel/sched/rt.c -@@ -977,7 +977,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) - */ - if (likely(rt_b->rt_runtime)) { - rt_rq->rt_throttled = 1; -- printk_deferred_once("sched: RT throttling activated\n"); -+ printk_once("sched: RT throttling activated\n"); - } else { - /* - * In case we did anyway, make it go away, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 4f432826933d..8df6227922aa 100644 +index 0e66749486e7..2a8f54801263 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -2318,6 +2318,15 @@ extern void reweight_task(struct task_struct *p, int prio); +@@ -2300,6 +2300,15 @@ extern void reweight_task(struct task_struct *p, int prio); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -8190,21 +7182,8 @@ index e1c655f928c7..f230b1ac7f91 100644 raw_spin_lock_irq(&q->lock); list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 4e8698e62f07..3d0157bd4e14 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -526,7 +526,7 @@ static int init_rootdomain(struct root_domain *rd) - #ifdef HAVE_RT_PUSH_IPI - rd->rto_cpu = -1; - raw_spin_lock_init(&rd->rto_lock); -- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); -+ rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); - #endif - - rd->visit_gen = 0; diff --git a/kernel/signal.c b/kernel/signal.c -index 5892c91696f8..d3a69e89b9ee 100644 +index dfcee3888b00..1424f77d3b95 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1324,6 +1324,34 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, @@ -8242,7 +7221,7 @@ index 5892c91696f8..d3a69e89b9ee 100644 spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; -@@ -2296,16 +2324,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t +@@ -2271,16 +2299,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); @@ -8260,7 +7239,7 @@ index 5892c91696f8..d3a69e89b9ee 100644 cgroup_leave_frozen(true); } else { diff --git a/kernel/smp.c b/kernel/smp.c -index f43ede0ab183..f0f26e1a0031 100644 +index 01a7c1706a58..250311c2009f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -690,10 +690,20 @@ void flush_smp_call_function_from_idle(void) @@ -8286,159 +7265,152 @@ index f43ede0ab183..f0f26e1a0031 100644 local_irq_restore(flags); } -diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c -index 003ccf338d20..00fc43605c6b 100644 ---- a/kernel/time/clockevents.c -+++ b/kernel/time/clockevents.c -@@ -203,8 +203,7 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) - { - /* Nothing to do if we already reached the limit */ - if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { -- printk_deferred(KERN_WARNING -- "CE: Reprogramming failure. Giving up\n"); -+ pr_warn("CE: Reprogramming failure. Giving up\n"); - dev->next_event = KTIME_MAX; - return -ETIME; - } -@@ -217,10 +216,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) - if (dev->min_delta_ns > MIN_DELTA_LIMIT) - dev->min_delta_ns = MIN_DELTA_LIMIT; - -- printk_deferred(KERN_WARNING -- "CE: %s increased min_delta_ns to %llu nsec\n", -- dev->name ? dev->name : "?", -- (unsigned long long) dev->min_delta_ns); -+ pr_warn("CE: %s increased min_delta_ns to %llu nsec\n", -+ dev->name ? dev->name : "?", (unsigned long long) dev->min_delta_ns); - return 0; +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 41f470929e99..22948c2109f5 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -624,6 +624,22 @@ static inline void tick_irq_exit(void) + #endif } -diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c -index 406dccb79c2b..829d7797811f 100644 ---- a/kernel/time/ntp.c -+++ b/kernel/time/ntp.c -@@ -939,9 +939,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) - time_status |= STA_PPSERROR; - pps_errcnt++; - pps_dec_freq_interval(); -- printk_deferred(KERN_ERR -- "hardpps: PPSERROR: interval too long - %lld s\n", -- freq_norm.sec); -+ pr_err("hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); - return 0; - } ++static DEFINE_PER_CPU(struct task_struct *, timersd); ++static DEFINE_PER_CPU(unsigned long, pending_timer_softirq); ++ ++static unsigned int local_pending_timers(void) ++{ ++ return __this_cpu_read(pending_timer_softirq); ++} ++ ++static void wake_timersd(void) ++{ ++ struct task_struct *tsk = __this_cpu_read(timersd); ++ ++ if (tsk) ++ wake_up_process(tsk); ++} ++ + static inline void __irq_exit_rcu(void) + { + #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED +@@ -635,6 +651,8 @@ static inline void __irq_exit_rcu(void) + preempt_count_sub(HARDIRQ_OFFSET); + if (!in_interrupt() && local_softirq_pending()) + invoke_softirq(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !in_interrupt() && local_pending_timers()) ++ wake_timersd(); -@@ -954,8 +952,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) - delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); - pps_freq = ftemp; - if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { -- printk_deferred(KERN_WARNING -- "hardpps: PPSWANDER: change=%ld\n", delta); -+ pr_warn("hardpps: PPSWANDER: change=%ld\n", delta); - time_status |= STA_PPSWANDER; - pps_stbcnt++; - pps_dec_freq_interval(); -@@ -999,9 +996,8 @@ static void hardpps_update_phase(long error) - * the time offset is updated. - */ - if (jitter > (pps_jitter << PPS_POPCORN)) { -- printk_deferred(KERN_WARNING -- "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", -- jitter, (pps_jitter << PPS_POPCORN)); -+ pr_warn("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", -+ jitter, (pps_jitter << PPS_POPCORN)); - time_status |= STA_PPSJITTER; - pps_jitcnt++; - } else if (time_status & STA_PPSTIME) { -@@ -1058,7 +1054,7 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t - time_status |= STA_PPSJITTER; - /* restart the frequency calibration interval */ - pps_fbase = *raw_ts; -- printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); -+ pr_err("hardpps: PPSJITTER: bad pulse\n"); - return; - } + tick_irq_exit(); + } +@@ -963,11 +981,69 @@ static struct smp_hotplug_thread softirq_threads = { + .thread_comm = "ksoftirqd/%u", + }; -diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c -index b348749a9fc6..a81beb312038 100644 ---- a/kernel/time/timekeeping.c -+++ b/kernel/time/timekeeping.c -@@ -203,22 +203,23 @@ static void timekeeping_check_update(struct timekeeper *tk, u64 offset) - const char *name = tk->tkr_mono.clock->name; - - if (offset > max_cycles) { -- printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", -- offset, name, max_cycles); -- printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); -+ printk("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", -+ offset, name, max_cycles); -+ printk(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); - } else { - if (offset > (max_cycles >> 1)) { -- printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", -- offset, name, max_cycles >> 1); -- printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); -+ printk("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", -+ offset, name, max_cycles >> 1); -+ printk(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); - } ++static void timersd_setup(unsigned int cpu) ++{ ++ sched_set_fifo_low(current); ++} ++ ++static int timersd_should_run(unsigned int cpu) ++{ ++ return local_pending_timers(); ++} ++ ++static void run_timersd(unsigned int cpu) ++{ ++ unsigned int timer_si; ++ ++ ksoftirqd_run_begin(); ++ ++ timer_si = local_pending_timers(); ++ __this_cpu_write(pending_timer_softirq, 0); ++ or_softirq_pending(timer_si); ++ ++ __do_softirq(); ++ ++ ksoftirqd_run_end(); ++} ++ ++#ifdef CONFIG_PREEMPT_RT ++static void raise_ktimers_thread(unsigned int nr) ++{ ++ trace_softirq_raise(nr); ++ __this_cpu_or(pending_timer_softirq, 1 << nr); ++} ++ ++void raise_hrtimer_softirq(void) ++{ ++ raise_ktimers_thread(HRTIMER_SOFTIRQ); ++} ++ ++void raise_timer_softirq(void) ++{ ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ raise_ktimers_thread(TIMER_SOFTIRQ); ++ wake_timersd(); ++ local_irq_restore(flags); ++} ++#endif ++ ++static struct smp_hotplug_thread timer_threads = { ++ .store = &timersd, ++ .setup = timersd_setup, ++ .thread_should_run = timersd_should_run, ++ .thread_fn = run_timersd, ++ .thread_comm = "ktimers/%u", ++}; ++ + static __init int spawn_ksoftirqd(void) + { + cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, + takeover_tasklets); + BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ BUG_ON(smpboot_register_percpu_thread(&timer_threads)); + + return 0; + } +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 0ea8702eb516..dead5e738ecf 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1805,7 +1805,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; +- raise_softirq_irqoff(HRTIMER_SOFTIRQ); ++ raise_hrtimer_softirq(); } - if (tk->underflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { -- printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); -- printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); -- printk_deferred(" Your kernel is probably still fine.\n"); -+ printk("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", -+ name); -+ printk(" Please report this, consider using a different clocksource, if possible.\n"); -+ printk(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->underflow_seen = 0; -@@ -226,9 +227,10 @@ static void timekeeping_check_update(struct timekeeper *tk, u64 offset) - - if (tk->overflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { -- printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); -- printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); -- printk_deferred(" Your kernel is probably still fine.\n"); -+ printk("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", -+ name); -+ printk(" Please report this, consider using a different clocksource, if possible.\n"); -+ printk(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->overflow_seen = 0; -@@ -1669,9 +1671,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, - const struct timespec64 *delta) - { - if (!timespec64_valid_strict(delta)) { -- printk_deferred(KERN_WARNING -- "__timekeeping_inject_sleeptime: Invalid " -- "sleep delta value!\n"); -+ pr_warn("%s: Invalid sleep delta value!\n", __func__); - return; + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); +@@ -1918,7 +1918,7 @@ void hrtimer_run_queues(void) + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; +- raise_softirq_irqoff(HRTIMER_SOFTIRQ); ++ raise_hrtimer_softirq(); } - tk_xtime_add(tk, delta); -diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c -index b73e8850e58d..149cc4b08d8e 100644 ---- a/kernel/time/timekeeping_debug.c -+++ b/kernel/time/timekeeping_debug.c -@@ -49,7 +49,7 @@ void tk_debug_account_sleep_time(const struct timespec64 *t) - int bin = min(fls(t->tv_sec), NUM_BINS-1); - sleep_time_bin[bin]++; -- pm_deferred_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n", -+ pm_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n", - (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 85f1021ad459..beb4b1cc7c48 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1766,7 +1766,7 @@ static void run_local_timers(void) + if (time_before(jiffies, base->next_expiry)) + return; + } +- raise_softirq(TIMER_SOFTIRQ); ++ raise_timer_softirq(); } + /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c -index 18db461f77cd..547de22e8942 100644 +index 78ea542ce3bc..52fd4bbc36a4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c -@@ -2630,7 +2630,13 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) +@@ -2606,7 +2606,13 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; @@ -8453,7 +7425,7 @@ index 18db461f77cd..547de22e8942 100644 (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } -@@ -4206,15 +4212,17 @@ unsigned long trace_total_entries(struct trace_array *tr) +@@ -4182,15 +4188,17 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { @@ -8480,7 +7452,7 @@ index 18db461f77cd..547de22e8942 100644 } static void print_event_info(struct array_buffer *buf, struct seq_file *m) -@@ -4248,14 +4256,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file +@@ -4224,14 +4232,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file print_event_info(buf, m); @@ -8506,7 +7478,7 @@ index 18db461f77cd..547de22e8942 100644 void diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c -index 44d031ffe511..01165b0ed6aa 100644 +index 92be9cb1d7d4..b900902bf1b6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -184,6 +184,7 @@ static int trace_define_common_fields(void) @@ -8518,10 +7490,10 @@ index 44d031ffe511..01165b0ed6aa 100644 return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c -index c2ca40e8595b..be070d258c3b 100644 +index 3547e7176ff7..2745a023173a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c -@@ -451,6 +451,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { char hardsoft_irq; char need_resched; @@ -8529,7 +7501,7 @@ index c2ca40e8595b..be070d258c3b 100644 char irqs_off; int hardirq; int softirq; -@@ -481,6 +482,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -472,6 +473,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) break; } @@ -8539,7 +7511,7 @@ index c2ca40e8595b..be070d258c3b 100644 hardsoft_irq = (nmi && hardirq) ? 'Z' : nmi ? 'z' : -@@ -489,14 +493,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -480,14 +484,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) softirq ? 's' : '.' ; @@ -8562,50 +7534,8 @@ index c2ca40e8595b..be070d258c3b 100644 if (entry->preempt_count & 0xf0) trace_seq_printf(s, "%x", entry->preempt_count >> 4); else -diff --git a/kernel/workqueue.c b/kernel/workqueue.c -index 76988f39ed5a..86b6c5a9b274 100644 ---- a/kernel/workqueue.c -+++ b/kernel/workqueue.c -@@ -4836,9 +4836,7 @@ void show_workqueue_state(void) - * drivers that queue work while holding locks - * also taken in their write paths. - */ -- printk_deferred_enter(); - show_pwq(pwq); -- printk_deferred_exit(); - } - raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); - /* -@@ -4862,7 +4860,6 @@ void show_workqueue_state(void) - * queue work while holding locks also taken in their write - * paths. - */ -- printk_deferred_enter(); - pr_info("pool %d:", pool->id); - pr_cont_pool_info(pool); - pr_cont(" hung=%us workers=%d", -@@ -4877,7 +4874,6 @@ void show_workqueue_state(void) - first = false; - } - pr_cont("\n"); -- printk_deferred_exit(); - next_pool: - raw_spin_unlock_irqrestore(&pool->lock, flags); - /* -diff --git a/lib/bug.c b/lib/bug.c -index 45a0584f6541..03a87df69ed2 100644 ---- a/lib/bug.c -+++ b/lib/bug.c -@@ -206,6 +206,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) - else - pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", - (void *)bugaddr); -+ pr_flush(1000, true); - - return BUG_TRAP_TYPE_BUG; - } diff --git a/lib/dump_stack.c b/lib/dump_stack.c -index 6b7f1bf6715d..6e8ae42c7e27 100644 +index 6b7f1bf6715d..83471e81501a 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -102,9 +102,9 @@ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) @@ -8613,10 +7543,10 @@ index 6b7f1bf6715d..6e8ae42c7e27 100644 * against other CPUs */ - printk_cpu_lock_irqsave(flags); -+ raw_printk_cpu_lock_irqsave(flags); ++ printk_cpu_sync_get_irqsave(flags); __dump_stack(log_lvl); - printk_cpu_unlock_irqrestore(flags); -+ raw_printk_cpu_unlock_irqrestore(flags); ++ printk_cpu_sync_put_irqrestore(flags); } EXPORT_SYMBOL(dump_stack_lvl); @@ -8639,7 +7569,7 @@ index 2f17b488d58e..2b9f797642f6 100644 return 0; } diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c -index 161108e5d2fe..1266ea3726d7 100644 +index 71652e1c397c..8d24279fad05 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -26,6 +26,12 @@ @@ -8907,18 +7837,20 @@ index 161108e5d2fe..1266ea3726d7 100644 #define DO_TESTCASE_2x3(desc, name) \ DO_TESTCASE_3(desc, name, 12); \ -@@ -1651,6 +1700,20 @@ static void ww_test_fail_acquire(void) +@@ -1651,6 +1700,22 @@ static void ww_test_fail_acquire(void) #endif } +#ifdef CONFIG_PREEMPT_RT +#define ww_mutex_base_lock(b) rt_mutex_lock(b) ++#define ww_mutex_base_trylock(b) rt_mutex_trylock(b) +#define ww_mutex_base_lock_nest_lock(b, b2) rt_mutex_lock_nest_lock(b, b2) +#define ww_mutex_base_lock_interruptible(b) rt_mutex_lock_interruptible(b) +#define ww_mutex_base_lock_killable(b) rt_mutex_lock_killable(b) +#define ww_mutex_base_unlock(b) rt_mutex_unlock(b) +#else +#define ww_mutex_base_lock(b) mutex_lock(b) ++#define ww_mutex_base_trylock(b) mutex_trylock(b) +#define ww_mutex_base_lock_nest_lock(b, b2) mutex_lock_nest_lock(b, b2) +#define ww_mutex_base_lock_interruptible(b) mutex_lock_interruptible(b) +#define ww_mutex_base_lock_killable(b) mutex_lock_killable(b) @@ -8928,7 +7860,7 @@ index 161108e5d2fe..1266ea3726d7 100644 static void ww_test_normal(void) { int ret; -@@ -1665,50 +1728,50 @@ static void ww_test_normal(void) +@@ -1665,50 +1730,50 @@ static void ww_test_normal(void) /* mutex_lock (and indirectly, mutex_lock_nested) */ o.ctx = (void *)~0UL; @@ -8992,7 +7924,7 @@ index 161108e5d2fe..1266ea3726d7 100644 WARN_ON(o.ctx != (void *)~0UL); } -@@ -1721,7 +1784,7 @@ static void ww_test_two_contexts(void) +@@ -1721,7 +1786,7 @@ static void ww_test_two_contexts(void) static void ww_test_diff_class(void) { WWAI(&t); @@ -9001,7 +7933,7 @@ index 161108e5d2fe..1266ea3726d7 100644 t.ww_class = NULL; #endif WWL(&o, &t); -@@ -1785,7 +1848,7 @@ static void ww_test_edeadlk_normal(void) +@@ -1785,7 +1850,7 @@ static void ww_test_edeadlk_normal(void) { int ret; @@ -9010,7 +7942,7 @@ index 161108e5d2fe..1266ea3726d7 100644 o2.ctx = &t2; mutex_release(&o2.base.dep_map, _THIS_IP_); -@@ -1801,7 +1864,7 @@ static void ww_test_edeadlk_normal(void) +@@ -1801,7 +1866,7 @@ static void ww_test_edeadlk_normal(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); @@ -9019,7 +7951,7 @@ index 161108e5d2fe..1266ea3726d7 100644 WWU(&o); WWL(&o2, &t); -@@ -1811,7 +1874,7 @@ static void ww_test_edeadlk_normal_slow(void) +@@ -1811,7 +1876,7 @@ static void ww_test_edeadlk_normal_slow(void) { int ret; @@ -9028,7 +7960,7 @@ index 161108e5d2fe..1266ea3726d7 100644 mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; -@@ -1827,7 +1890,7 @@ static void ww_test_edeadlk_normal_slow(void) +@@ -1827,7 +1892,7 @@ static void ww_test_edeadlk_normal_slow(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); @@ -9037,7 +7969,7 @@ index 161108e5d2fe..1266ea3726d7 100644 WWU(&o); ww_mutex_lock_slow(&o2, &t); -@@ -1837,7 +1900,7 @@ static void ww_test_edeadlk_no_unlock(void) +@@ -1837,7 +1902,7 @@ static void ww_test_edeadlk_no_unlock(void) { int ret; @@ -9046,7 +7978,7 @@ index 161108e5d2fe..1266ea3726d7 100644 o2.ctx = &t2; mutex_release(&o2.base.dep_map, _THIS_IP_); -@@ -1853,7 +1916,7 @@ static void ww_test_edeadlk_no_unlock(void) +@@ -1853,7 +1918,7 @@ static void ww_test_edeadlk_no_unlock(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); @@ -9055,7 +7987,7 @@ index 161108e5d2fe..1266ea3726d7 100644 WWL(&o2, &t); } -@@ -1862,7 +1925,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) +@@ -1862,7 +1927,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) { int ret; @@ -9064,7 +7996,7 @@ index 161108e5d2fe..1266ea3726d7 100644 mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; -@@ -1878,7 +1941,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) +@@ -1878,7 +1943,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); @@ -9073,7 +8005,7 @@ index 161108e5d2fe..1266ea3726d7 100644 ww_mutex_lock_slow(&o2, &t); } -@@ -1887,7 +1950,7 @@ static void ww_test_edeadlk_acquire_more(void) +@@ -1887,7 +1952,7 @@ static void ww_test_edeadlk_acquire_more(void) { int ret; @@ -9082,7 +8014,7 @@ index 161108e5d2fe..1266ea3726d7 100644 mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; -@@ -1908,7 +1971,7 @@ static void ww_test_edeadlk_acquire_more_slow(void) +@@ -1908,7 +1973,7 @@ static void ww_test_edeadlk_acquire_more_slow(void) { int ret; @@ -9091,7 +8023,7 @@ index 161108e5d2fe..1266ea3726d7 100644 mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; -@@ -1929,11 +1992,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk(void) +@@ -1929,11 +1994,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk(void) { int ret; @@ -9105,7 +8037,7 @@ index 161108e5d2fe..1266ea3726d7 100644 mutex_release(&o3.base.dep_map, _THIS_IP_); o3.ctx = &t2; -@@ -1955,11 +2018,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk_slow(void) +@@ -1955,11 +2020,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk_slow(void) { int ret; @@ -9119,7 +8051,7 @@ index 161108e5d2fe..1266ea3726d7 100644 mutex_release(&o3.base.dep_map, _THIS_IP_); o3.ctx = &t2; -@@ -1980,7 +2043,7 @@ static void ww_test_edeadlk_acquire_wrong(void) +@@ -1980,7 +2045,7 @@ static void ww_test_edeadlk_acquire_wrong(void) { int ret; @@ -9128,7 +8060,7 @@ index 161108e5d2fe..1266ea3726d7 100644 mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; -@@ -2005,7 +2068,7 @@ static void ww_test_edeadlk_acquire_wrong_slow(void) +@@ -2005,7 +2070,7 @@ static void ww_test_edeadlk_acquire_wrong_slow(void) { int ret; @@ -9137,7 +8069,7 @@ index 161108e5d2fe..1266ea3726d7 100644 mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; -@@ -2646,8 +2709,8 @@ static void wait_context_tests(void) +@@ -2646,8 +2711,8 @@ static void wait_context_tests(void) static void local_lock_2(void) { @@ -9148,7 +8080,7 @@ index 161108e5d2fe..1266ea3726d7 100644 HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ -@@ -2656,18 +2719,18 @@ static void local_lock_2(void) +@@ -2656,18 +2721,18 @@ static void local_lock_2(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); @@ -9171,7 +8103,7 @@ index 161108e5d2fe..1266ea3726d7 100644 HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ -@@ -2676,18 +2739,18 @@ static void local_lock_3A(void) +@@ -2676,18 +2741,18 @@ static void local_lock_3A(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); @@ -9194,7 +8126,7 @@ index 161108e5d2fe..1266ea3726d7 100644 HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ -@@ -2696,8 +2759,8 @@ static void local_lock_3B(void) +@@ -2696,8 +2761,8 @@ static void local_lock_3B(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); @@ -9205,7 +8137,7 @@ index 161108e5d2fe..1266ea3726d7 100644 spin_unlock(&lock_A); HARDIRQ_ENABLE(); -@@ -2812,7 +2875,7 @@ void locking_selftest(void) +@@ -2812,7 +2877,7 @@ void locking_selftest(void) printk("------------------------\n"); printk("| Locking API testsuite:\n"); printk("----------------------------------------------------------------------------\n"); @@ -9214,7 +8146,7 @@ index 161108e5d2fe..1266ea3726d7 100644 printk(" --------------------------------------------------------------------------\n"); init_shared_classes(); -@@ -2885,12 +2948,11 @@ void locking_selftest(void) +@@ -2885,12 +2950,11 @@ void locking_selftest(void) DO_TESTCASE_6x1RR("rlock W1R2/R2R3/W3W1", W1R2_R2R3_W3W1); printk(" --------------------------------------------------------------------------\n"); @@ -9229,7 +8161,7 @@ index 161108e5d2fe..1266ea3726d7 100644 DO_TESTCASE_6x6("safe-A + unsafe-B #1", irqsafe3); DO_TESTCASE_6x6("safe-A + unsafe-B #2", irqsafe4); diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c -index 199ab201d501..06410209197a 100644 +index 199ab201d501..d01aec6ae15c 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -99,7 +99,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) @@ -9237,7 +8169,7 @@ index 199ab201d501..06410209197a 100644 * against other CPUs. */ - printk_cpu_lock_irqsave(flags); -+ raw_printk_cpu_lock_irqsave(flags); ++ printk_cpu_sync_get_irqsave(flags); if (!READ_ONCE(backtrace_idle) && regs && cpu_in_idle(instruction_pointer(regs))) { pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", cpu, (void *)instruction_pointer(regs)); @@ -9246,115 +8178,1650 @@ index 199ab201d501..06410209197a 100644 dump_stack(); } - printk_cpu_unlock_irqrestore(flags); -+ raw_printk_cpu_unlock_irqrestore(flags); ++ printk_cpu_sync_put_irqrestore(flags); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } -diff --git a/lib/ratelimit.c b/lib/ratelimit.c -index e01a93f46f83..524cf65dce53 100644 ---- a/lib/ratelimit.c -+++ b/lib/ratelimit.c -@@ -47,9 +47,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) - if (time_is_before_jiffies(rs->begin + rs->interval)) { - if (rs->missed) { - if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { -- printk_deferred(KERN_WARNING -- "%s: %d callbacks suppressed\n", -- func, rs->missed); -+ pr_warn("%s: %d callbacks suppressed\n", func, rs->missed); - rs->missed = 0; - } - } -diff --git a/lib/scatterlist.c b/lib/scatterlist.c -index abb3432ed744..d5e82e4a57ad 100644 ---- a/lib/scatterlist.c -+++ b/lib/scatterlist.c -@@ -828,8 +828,7 @@ static bool sg_miter_get_next_page(struct sg_mapping_iter *miter) - * stops @miter. - * - * Context: -- * Don't care if @miter is stopped, or not proceeded yet. -- * Otherwise, preemption disabled if the SG_MITER_ATOMIC is set. -+ * Don't care. - * - * Returns: - * true if @miter contains the valid mapping. false if end of sg -@@ -865,8 +864,7 @@ EXPORT_SYMBOL(sg_miter_skip); - * @miter->addr and @miter->length point to the current mapping. - * - * Context: -- * Preemption disabled if SG_MITER_ATOMIC. Preemption must stay disabled -- * till @miter is stopped. May sleep if !SG_MITER_ATOMIC. -+ * May sleep if !SG_MITER_ATOMIC. - * - * Returns: - * true if @miter contains the next mapping. false if end of sg -@@ -906,8 +904,7 @@ EXPORT_SYMBOL(sg_miter_next); - * need to be released during iteration. - * - * Context: -- * Preemption disabled if the SG_MITER_ATOMIC is set. Don't care -- * otherwise. -+ * Don't care otherwise. - */ - void sg_miter_stop(struct sg_mapping_iter *miter) +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 2ed5f2a0879d..eb6873f43ef5 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -169,7 +169,6 @@ struct mem_cgroup_event { + struct work_struct remove; + }; + +-static void mem_cgroup_threshold(struct mem_cgroup *memcg); + static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); + + /* Stuffs for move charges at task migration. */ +@@ -261,8 +260,10 @@ bool mem_cgroup_kmem_disabled(void) + return cgroup_memory_nokmem; + } + ++struct memcg_stock_pcp; + static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, +- unsigned int nr_pages); ++ unsigned int nr_pages, ++ bool stock_lock_acquried); + + static void obj_cgroup_release(struct percpu_ref *ref) { -@@ -922,7 +919,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) - flush_dcache_page(miter->page); +@@ -296,7 +297,7 @@ static void obj_cgroup_release(struct percpu_ref *ref) + nr_pages = nr_bytes >> PAGE_SHIFT; - if (miter->__flags & SG_MITER_ATOMIC) { -- WARN_ON_ONCE(preemptible()); -+ WARN_ON_ONCE(!pagefault_disabled()); - kunmap_atomic(miter->addr); - } else - kunmap(miter->page); -diff --git a/mm/Kconfig b/mm/Kconfig -index c048dea7e342..88778414465b 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -371,7 +371,7 @@ config NOMMU_INITIAL_TRIM_EXCESS - - config TRANSPARENT_HUGEPAGE - bool "Transparent Hugepage Support" -- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE -+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT - select COMPACTION - select XARRAY_MULTI - help -diff --git a/mm/memory.c b/mm/memory.c -index c52be6d6b605..e2c623027e32 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -5265,7 +5265,7 @@ void __might_fault(const char *file, int line) + if (nr_pages) +- obj_cgroup_uncharge_pages(objcg, nr_pages); ++ obj_cgroup_uncharge_pages(objcg, nr_pages, false); + + spin_lock_irqsave(&css_set_lock, flags); + list_del(&objcg->list); +@@ -521,43 +522,6 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg) + return excess; + } + +-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) +-{ +- unsigned long excess; +- struct mem_cgroup_per_node *mz; +- struct mem_cgroup_tree_per_node *mctz; +- +- mctz = soft_limit_tree.rb_tree_per_node[nid]; +- if (!mctz) +- return; +- /* +- * Necessary to update all ancestors when hierarchy is used. +- * because their event counter is not touched. +- */ +- for (; memcg; memcg = parent_mem_cgroup(memcg)) { +- mz = memcg->nodeinfo[nid]; +- excess = soft_limit_excess(memcg); +- /* +- * We have to update the tree if mz is on RB-tree or +- * mem is over its softlimit. +- */ +- if (excess || mz->on_tree) { +- unsigned long flags; +- +- spin_lock_irqsave(&mctz->lock, flags); +- /* if on-tree, remove it */ +- if (mz->on_tree) +- __mem_cgroup_remove_exceeded(mz, mctz); +- /* +- * Insert again. mz->usage_in_excess will be updated. +- * If excess is 0, no tree ops. +- */ +- __mem_cgroup_insert_exceeded(mz, mctz, excess); +- spin_unlock_irqrestore(&mctz->lock, flags); +- } +- } +-} +- + static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) + { + struct mem_cgroup_tree_per_node *mctz; +@@ -699,6 +663,8 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = pn->memcg; + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); + /* Update memcg */ + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + +@@ -706,6 +672,8 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + + memcg_rstat_updated(memcg); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); + } + + /** +@@ -788,8 +756,12 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + if (mem_cgroup_disabled()) return; - if (pagefault_disabled()) + ++ if (IS_ENABLED(PREEMPT_RT)) ++ preempt_disable(); + __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + memcg_rstat_updated(memcg); ++ if (IS_ENABLED(PREEMPT_RT)) ++ preempt_enable(); + } + + static unsigned long memcg_events(struct mem_cgroup *memcg, int event) +@@ -821,50 +793,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, + __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); + } + +-static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, +- enum mem_cgroup_events_target target) +-{ +- unsigned long val, next; +- +- val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); +- next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); +- /* from time_after() in jiffies.h */ +- if ((long)(next - val) < 0) { +- switch (target) { +- case MEM_CGROUP_TARGET_THRESH: +- next = val + THRESHOLDS_EVENTS_TARGET; +- break; +- case MEM_CGROUP_TARGET_SOFTLIMIT: +- next = val + SOFTLIMIT_EVENTS_TARGET; +- break; +- default: +- break; +- } +- __this_cpu_write(memcg->vmstats_percpu->targets[target], next); +- return true; +- } +- return false; +-} +- +-/* +- * Check events in order. +- * +- */ +-static void memcg_check_events(struct mem_cgroup *memcg, int nid) +-{ +- /* threshold event is triggered in finer grain than soft limit */ +- if (unlikely(mem_cgroup_event_ratelimit(memcg, +- MEM_CGROUP_TARGET_THRESH))) { +- bool do_softlimit; +- +- do_softlimit = mem_cgroup_event_ratelimit(memcg, +- MEM_CGROUP_TARGET_SOFTLIMIT); +- mem_cgroup_threshold(memcg); +- if (unlikely(do_softlimit)) +- mem_cgroup_update_tree(memcg, nid); +- } +-} +- + struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) + { + /* +@@ -2091,26 +2019,40 @@ struct obj_stock { + }; + + struct memcg_stock_pcp { ++ /* Protects memcg_stock_pcp */ ++ local_lock_t stock_lock; + struct mem_cgroup *cached; /* this never be root cgroup */ + unsigned int nr_pages; ++#ifndef CONFIG_PREEMPTION ++ /* Protects only task_obj */ ++ local_lock_t task_obj_lock; + struct obj_stock task_obj; ++#endif + struct obj_stock irq_obj; + + struct work_struct work; + unsigned long flags; + #define FLUSHING_CACHED_CHARGE 0 + }; +-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); ++static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { ++ .stock_lock = INIT_LOCAL_LOCK(stock_lock), ++#ifndef CONFIG_PREEMPTION ++ .task_obj_lock = INIT_LOCAL_LOCK(task_obj_lock), ++#endif ++}; + static DEFINE_MUTEX(percpu_charge_mutex); + + #ifdef CONFIG_MEMCG_KMEM +-static void drain_obj_stock(struct obj_stock *stock); ++static struct obj_cgroup *drain_obj_stock(struct obj_stock *stock, ++ bool stock_lock_acquried); + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg); + + #else +-static inline void drain_obj_stock(struct obj_stock *stock) ++static inline struct obj_cgroup *drain_obj_stock(struct obj_stock *stock, ++ bool stock_lock_acquried) + { ++ return NULL; + } + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg) +@@ -2139,7 +2081,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + if (nr_pages > MEMCG_CHARGE_BATCH) + return ret; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (memcg == stock->cached && stock->nr_pages >= nr_pages) { +@@ -2147,7 +2089,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + ret = true; + } + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + + return ret; + } +@@ -2175,38 +2117,43 @@ static void drain_stock(struct memcg_stock_pcp *stock) + + static void drain_local_stock(struct work_struct *dummy) + { +- struct memcg_stock_pcp *stock; +- unsigned long flags; ++ struct memcg_stock_pcp *stock_pcp; ++ struct obj_cgroup *old; + + /* + * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. + * drain_stock races is that we always operate on local CPU stock + * here with IRQ disabled + */ +- local_irq_save(flags); ++#ifndef CONFIG_PREEMPTION ++ local_lock(&memcg_stock.task_obj_lock); ++ old = drain_obj_stock(&this_cpu_ptr(&memcg_stock)->task_obj, NULL); ++ local_unlock(&memcg_stock.task_obj_lock); ++ if (old) ++ obj_cgroup_put(old); ++#endif + +- stock = this_cpu_ptr(&memcg_stock); +- drain_obj_stock(&stock->irq_obj); +- if (in_task()) +- drain_obj_stock(&stock->task_obj); +- drain_stock(stock); +- clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); ++ local_lock_irq(&memcg_stock.stock_lock); ++ stock_pcp = this_cpu_ptr(&memcg_stock); ++ old = drain_obj_stock(&stock_pcp->irq_obj, stock_pcp); + +- local_irq_restore(flags); ++ drain_stock(stock_pcp); ++ clear_bit(FLUSHING_CACHED_CHARGE, &stock_pcp->flags); ++ ++ local_unlock_irq(&memcg_stock.stock_lock); ++ if (old) ++ obj_cgroup_put(old); + } + + /* + * Cache charges(val) to local per_cpu area. + * This will be consumed by consume_stock() function, later. + */ +-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ++static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + { +- struct memcg_stock_pcp *stock; +- unsigned long flags; +- +- local_irq_save(flags); ++ struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); + +- stock = this_cpu_ptr(&memcg_stock); ++ lockdep_assert_held(&stock->stock_lock); + if (stock->cached != memcg) { /* reset if necessary */ + drain_stock(stock); + css_get(&memcg->css); +@@ -2216,8 +2163,20 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + + if (stock->nr_pages > MEMCG_CHARGE_BATCH) + drain_stock(stock); ++} + +- local_irq_restore(flags); ++static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages, ++ bool stock_lock_acquried) ++{ ++ unsigned long flags; ++ ++ if (stock_lock_acquried) { ++ __refill_stock(memcg, nr_pages); ++ return; ++ } ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ __refill_stock(memcg, nr_pages); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + } + + /* +@@ -2226,7 +2185,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + */ + static void drain_all_stock(struct mem_cgroup *root_memcg) + { +- int cpu, curcpu; ++ int cpu; + + /* If someone's already draining, avoid adding running more workers. */ + if (!mutex_trylock(&percpu_charge_mutex)) +@@ -2237,7 +2196,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) + * as well as workers from this path always operate on the local + * per-cpu data. CPU up doesn't touch memcg_stock at all. + */ +- curcpu = get_cpu(); ++ cpus_read_lock(); + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + struct mem_cgroup *memcg; +@@ -2253,14 +2212,10 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) + rcu_read_unlock(); + + if (flush && +- !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { +- if (cpu == curcpu) +- drain_local_stock(&stock->work); +- else +- schedule_work_on(cpu, &stock->work); +- } ++ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) ++ schedule_work_on(cpu, &stock->work); + } +- put_cpu(); ++ cpus_read_unlock(); + mutex_unlock(&percpu_charge_mutex); + } + +@@ -2661,7 +2616,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, + + done_restock: + if (batch > nr_pages) +- refill_stock(memcg, batch - nr_pages); ++ refill_stock(memcg, batch - nr_pages, false); + + /* + * If the hierarchy is above the normal consumption range, schedule +@@ -2774,28 +2729,36 @@ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) + * can only be accessed after disabling interrupt. User context code can + * access interrupt object stock, but not vice versa. + */ +-static inline struct obj_stock *get_obj_stock(unsigned long *pflags) ++static inline struct obj_stock *get_obj_stock(unsigned long *pflags, ++ bool *stock_lock_acquried) + { + struct memcg_stock_pcp *stock; + ++#ifndef CONFIG_PREEMPTION + if (likely(in_task())) { + *pflags = 0UL; +- preempt_disable(); ++ *stock_lock_acquried = false; ++ local_lock(&memcg_stock.task_obj_lock); + stock = this_cpu_ptr(&memcg_stock); + return &stock->task_obj; + } +- +- local_irq_save(*pflags); ++#endif ++ *stock_lock_acquried = true; ++ local_lock_irqsave(&memcg_stock.stock_lock, *pflags); + stock = this_cpu_ptr(&memcg_stock); + return &stock->irq_obj; + } + +-static inline void put_obj_stock(unsigned long flags) ++static inline void put_obj_stock(unsigned long flags, ++ bool stock_lock_acquried) + { +- if (likely(in_task())) +- preempt_enable(); +- else +- local_irq_restore(flags); ++#ifndef CONFIG_PREEMPTION ++ if (likely(!stock_lock_acquried)) { ++ local_unlock(&memcg_stock.task_obj_lock); ++ return; ++ } ++#endif ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + } + + /* +@@ -2973,7 +2936,8 @@ static void memcg_free_cache_id(int id) + * @nr_pages: number of pages to uncharge + */ + static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, +- unsigned int nr_pages) ++ unsigned int nr_pages, ++ bool stock_lock_acquried) + { + struct mem_cgroup *memcg; + +@@ -2981,7 +2945,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); +- refill_stock(memcg, nr_pages); ++ refill_stock(memcg, nr_pages, stock_lock_acquried); + + css_put(&memcg->css); + } +@@ -3055,7 +3019,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) return; -- __might_sleep(file, line, 0); -+ __might_sleep(file, line); - #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) - if (current->mm) - might_lock_read(¤t->mm->mmap_lock); -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 23d3339ac4e8..e71b9634a321 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -3149,9 +3149,9 @@ static void drain_local_pages_wq(struct work_struct *work) - * cpu which is alright but we also have to make sure to not move to - * a different one. + + objcg = __folio_objcg(folio); +- obj_cgroup_uncharge_pages(objcg, nr_pages); ++ obj_cgroup_uncharge_pages(objcg, nr_pages, false); + folio->memcg_data = 0; + obj_cgroup_put(objcg); + } +@@ -3063,17 +3027,21 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) + void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + enum node_stat_item idx, int nr) + { ++ bool stock_lock_acquried; + unsigned long flags; +- struct obj_stock *stock = get_obj_stock(&flags); ++ struct obj_cgroup *old = NULL; ++ struct obj_stock *stock; + int *bytes; + ++ stock = get_obj_stock(&flags, &stock_lock_acquried); + /* + * Save vmstat data in stock and skip vmstat array update unless + * accumulating over a page of vmstat data or when pgdat or idx + * changes. */ -- preempt_disable(); -+ migrate_disable(); - drain_local_pages(drain->zone); -- preempt_enable(); -+ migrate_enable(); + if (stock->cached_objcg != objcg) { +- drain_obj_stock(stock); ++ old = drain_obj_stock(stock, stock_lock_acquried); ++ + obj_cgroup_get(objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; +@@ -3117,38 +3085,43 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + if (nr) + mod_objcg_mlstate(objcg, pgdat, idx, nr); + +- put_obj_stock(flags); ++ put_obj_stock(flags, stock_lock_acquried); ++ if (old) ++ obj_cgroup_put(old); + } + + static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + { ++ bool stock_lock_acquried; + unsigned long flags; +- struct obj_stock *stock = get_obj_stock(&flags); ++ struct obj_stock *stock; + bool ret = false; + ++ stock = get_obj_stock(&flags, &stock_lock_acquried); + if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { + stock->nr_bytes -= nr_bytes; + ret = true; + } + +- put_obj_stock(flags); ++ put_obj_stock(flags, stock_lock_acquried); + + return ret; + } + +-static void drain_obj_stock(struct obj_stock *stock) ++static struct obj_cgroup *drain_obj_stock(struct obj_stock *stock, ++ bool stock_lock_acquried) + { + struct obj_cgroup *old = stock->cached_objcg; + + if (!old) +- return; ++ return NULL; + + if (stock->nr_bytes) { + unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; + unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); + + if (nr_pages) +- obj_cgroup_uncharge_pages(old, nr_pages); ++ obj_cgroup_uncharge_pages(old, nr_pages, stock_lock_acquried); + + /* + * The leftover is flushed to the centralized per-memcg value. +@@ -3183,8 +3156,8 @@ static void drain_obj_stock(struct obj_stock *stock) + stock->cached_pgdat = NULL; + } + +- obj_cgroup_put(old); + stock->cached_objcg = NULL; ++ return old; + } + + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +@@ -3192,11 +3165,13 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + { + struct mem_cgroup *memcg; + ++#ifndef CONFIG_PREEMPTION + if (in_task() && stock->task_obj.cached_objcg) { + memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) + return true; + } ++#endif + if (stock->irq_obj.cached_objcg) { + memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) +@@ -3209,12 +3184,15 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + bool allow_uncharge) + { ++ bool stock_lock_acquried; + unsigned long flags; +- struct obj_stock *stock = get_obj_stock(&flags); ++ struct obj_stock *stock; + unsigned int nr_pages = 0; ++ struct obj_cgroup *old = NULL; + ++ stock = get_obj_stock(&flags, &stock_lock_acquried); + if (stock->cached_objcg != objcg) { /* reset if necessary */ +- drain_obj_stock(stock); ++ old = drain_obj_stock(stock, stock_lock_acquried); + obj_cgroup_get(objcg); + stock->cached_objcg = objcg; + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) +@@ -3228,10 +3206,12 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + stock->nr_bytes &= (PAGE_SIZE - 1); + } + +- put_obj_stock(flags); ++ put_obj_stock(flags, stock_lock_acquried); ++ if (old) ++ obj_cgroup_put(old); + + if (nr_pages) +- obj_cgroup_uncharge_pages(objcg, nr_pages); ++ obj_cgroup_uncharge_pages(objcg, nr_pages, false); + } + + int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +@@ -3751,8 +3731,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, + } + break; + case RES_SOFT_LIMIT: ++#ifndef CONFIG_PREEMPT_RT + memcg->soft_limit = nr_pages; + ret = 0; ++#else ++ ret = -EOPNOTSUPP; ++#endif + break; + } + return ret ?: nbytes; +@@ -4057,119 +4041,454 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, + return 0; + } + +-static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) ++static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) + { +- struct mem_cgroup_threshold_ary *t; +- unsigned long usage; +- int i; +- +- rcu_read_lock(); +- if (!swap) +- t = rcu_dereference(memcg->thresholds.primary); +- else +- t = rcu_dereference(memcg->memsw_thresholds.primary); +- +- if (!t) +- goto unlock; +- +- usage = mem_cgroup_usage(memcg, swap); ++ struct mem_cgroup_eventfd_list *ev; + +- /* +- * current_threshold points to threshold just below or equal to usage. +- * If it's not true, a threshold was crossed after last +- * call of __mem_cgroup_threshold(). +- */ +- i = t->current_threshold; ++ spin_lock(&memcg_oom_lock); + +- /* +- * Iterate backward over array of thresholds starting from +- * current_threshold and check if a threshold is crossed. +- * If none of thresholds below usage is crossed, we read +- * only one element of the array here. +- */ +- for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) +- eventfd_signal(t->entries[i].eventfd, 1); ++ list_for_each_entry(ev, &memcg->oom_notify, list) ++ eventfd_signal(ev->eventfd, 1); + +- /* i = current_threshold + 1 */ +- i++; ++ spin_unlock(&memcg_oom_lock); ++ return 0; ++} + +- /* +- * Iterate forward over array of thresholds starting from +- * current_threshold+1 and check if a threshold is crossed. +- * If none of thresholds above usage is crossed, we read +- * only one element of the array here. +- */ +- for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) +- eventfd_signal(t->entries[i].eventfd, 1); ++static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) ++{ ++ struct mem_cgroup *iter; + +- /* Update current_threshold */ +- t->current_threshold = i - 1; +-unlock: +- rcu_read_unlock(); ++ for_each_mem_cgroup_tree(iter, memcg) ++ mem_cgroup_oom_notify_cb(iter); + } + +-static void mem_cgroup_threshold(struct mem_cgroup *memcg) ++static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) + { +- while (memcg) { +- __mem_cgroup_threshold(memcg, false); +- if (do_memsw_account()) +- __mem_cgroup_threshold(memcg, true); ++ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); + +- memcg = parent_mem_cgroup(memcg); +- } ++ seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); ++ seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); ++ seq_printf(sf, "oom_kill %lu\n", ++ atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); ++ return 0; + } + +-static int compare_thresholds(const void *a, const void *b) ++static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, ++ struct cftype *cft, u64 val) + { +- const struct mem_cgroup_threshold *_a = a; +- const struct mem_cgroup_threshold *_b = b; ++ struct mem_cgroup *memcg = mem_cgroup_from_css(css); + +- if (_a->threshold > _b->threshold) +- return 1; ++ /* cannot set to root cgroup and only 0 and 1 are allowed */ ++ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) ++ return -EINVAL; + +- if (_a->threshold < _b->threshold) +- return -1; ++ memcg->oom_kill_disable = val; ++ if (!val) ++ memcg_oom_recover(memcg); + + return 0; + } + +-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) +-{ +- struct mem_cgroup_eventfd_list *ev; +- +- spin_lock(&memcg_oom_lock); ++#ifdef CONFIG_CGROUP_WRITEBACK + +- list_for_each_entry(ev, &memcg->oom_notify, list) +- eventfd_signal(ev->eventfd, 1); ++#include <trace/events/writeback.h> + +- spin_unlock(&memcg_oom_lock); +- return 0; ++static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) ++{ ++ return wb_domain_init(&memcg->cgwb_domain, gfp); + } + +-static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) ++static void memcg_wb_domain_exit(struct mem_cgroup *memcg) + { +- struct mem_cgroup *iter; ++ wb_domain_exit(&memcg->cgwb_domain); ++} + +- for_each_mem_cgroup_tree(iter, memcg) +- mem_cgroup_oom_notify_cb(iter); ++static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) ++{ ++ wb_domain_size_changed(&memcg->cgwb_domain); + } + +-static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, +- struct eventfd_ctx *eventfd, const char *args, enum res_type type) ++struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) + { +- struct mem_cgroup_thresholds *thresholds; +- struct mem_cgroup_threshold_ary *new; +- unsigned long threshold; +- unsigned long usage; +- int i, size, ret; ++ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + +- ret = page_counter_memparse(args, "-1", &threshold); +- if (ret) +- return ret; ++ if (!memcg->css.parent) ++ return NULL; + +- mutex_lock(&memcg->thresholds_lock); ++ return &memcg->cgwb_domain; ++} + +- if (type == _MEM) { ++/** ++ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg ++ * @wb: bdi_writeback in question ++ * @pfilepages: out parameter for number of file pages ++ * @pheadroom: out parameter for number of allocatable pages according to memcg ++ * @pdirty: out parameter for number of dirty pages ++ * @pwriteback: out parameter for number of pages under writeback ++ * ++ * Determine the numbers of file, headroom, dirty, and writeback pages in ++ * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom ++ * is a bit more involved. ++ * ++ * A memcg's headroom is "min(max, high) - used". In the hierarchy, the ++ * headroom is calculated as the lowest headroom of itself and the ++ * ancestors. Note that this doesn't consider the actual amount of ++ * available memory in the system. The caller should further cap ++ * *@pheadroom accordingly. ++ */ ++void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, ++ unsigned long *pheadroom, unsigned long *pdirty, ++ unsigned long *pwriteback) ++{ ++ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); ++ struct mem_cgroup *parent; ++ ++ mem_cgroup_flush_stats(); ++ ++ *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); ++ *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); ++ *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + ++ memcg_page_state(memcg, NR_ACTIVE_FILE); ++ ++ *pheadroom = PAGE_COUNTER_MAX; ++ while ((parent = parent_mem_cgroup(memcg))) { ++ unsigned long ceiling = min(READ_ONCE(memcg->memory.max), ++ READ_ONCE(memcg->memory.high)); ++ unsigned long used = page_counter_read(&memcg->memory); ++ ++ *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); ++ memcg = parent; ++ } ++} ++ ++/* ++ * Foreign dirty flushing ++ * ++ * There's an inherent mismatch between memcg and writeback. The former ++ * tracks ownership per-page while the latter per-inode. This was a ++ * deliberate design decision because honoring per-page ownership in the ++ * writeback path is complicated, may lead to higher CPU and IO overheads ++ * and deemed unnecessary given that write-sharing an inode across ++ * different cgroups isn't a common use-case. ++ * ++ * Combined with inode majority-writer ownership switching, this works well ++ * enough in most cases but there are some pathological cases. For ++ * example, let's say there are two cgroups A and B which keep writing to ++ * different but confined parts of the same inode. B owns the inode and ++ * A's memory is limited far below B's. A's dirty ratio can rise enough to ++ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid ++ * triggering background writeback. A will be slowed down without a way to ++ * make writeback of the dirty pages happen. ++ * ++ * Conditions like the above can lead to a cgroup getting repeatedly and ++ * severely throttled after making some progress after each ++ * dirty_expire_interval while the underlying IO device is almost ++ * completely idle. ++ * ++ * Solving this problem completely requires matching the ownership tracking ++ * granularities between memcg and writeback in either direction. However, ++ * the more egregious behaviors can be avoided by simply remembering the ++ * most recent foreign dirtying events and initiating remote flushes on ++ * them when local writeback isn't enough to keep the memory clean enough. ++ * ++ * The following two functions implement such mechanism. When a foreign ++ * page - a page whose memcg and writeback ownerships don't match - is ++ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning ++ * bdi_writeback on the page owning memcg. When balance_dirty_pages() ++ * decides that the memcg needs to sleep due to high dirty ratio, it calls ++ * mem_cgroup_flush_foreign() which queues writeback on the recorded ++ * foreign bdi_writebacks which haven't expired. Both the numbers of ++ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are ++ * limited to MEMCG_CGWB_FRN_CNT. ++ * ++ * The mechanism only remembers IDs and doesn't hold any object references. ++ * As being wrong occasionally doesn't matter, updates and accesses to the ++ * records are lockless and racy. ++ */ ++void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, ++ struct bdi_writeback *wb) ++{ ++ struct mem_cgroup *memcg = folio_memcg(folio); ++ struct memcg_cgwb_frn *frn; ++ u64 now = get_jiffies_64(); ++ u64 oldest_at = now; ++ int oldest = -1; ++ int i; ++ ++ trace_track_foreign_dirty(folio, wb); ++ ++ /* ++ * Pick the slot to use. If there is already a slot for @wb, keep ++ * using it. If not replace the oldest one which isn't being ++ * written out. ++ */ ++ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { ++ frn = &memcg->cgwb_frn[i]; ++ if (frn->bdi_id == wb->bdi->id && ++ frn->memcg_id == wb->memcg_css->id) ++ break; ++ if (time_before64(frn->at, oldest_at) && ++ atomic_read(&frn->done.cnt) == 1) { ++ oldest = i; ++ oldest_at = frn->at; ++ } ++ } ++ ++ if (i < MEMCG_CGWB_FRN_CNT) { ++ /* ++ * Re-using an existing one. Update timestamp lazily to ++ * avoid making the cacheline hot. We want them to be ++ * reasonably up-to-date and significantly shorter than ++ * dirty_expire_interval as that's what expires the record. ++ * Use the shorter of 1s and dirty_expire_interval / 8. ++ */ ++ unsigned long update_intv = ++ min_t(unsigned long, HZ, ++ msecs_to_jiffies(dirty_expire_interval * 10) / 8); ++ ++ if (time_before64(frn->at, now - update_intv)) ++ frn->at = now; ++ } else if (oldest >= 0) { ++ /* replace the oldest free one */ ++ frn = &memcg->cgwb_frn[oldest]; ++ frn->bdi_id = wb->bdi->id; ++ frn->memcg_id = wb->memcg_css->id; ++ frn->at = now; ++ } ++} ++ ++/* issue foreign writeback flushes for recorded foreign dirtying events */ ++void mem_cgroup_flush_foreign(struct bdi_writeback *wb) ++{ ++ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); ++ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); ++ u64 now = jiffies_64; ++ int i; ++ ++ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { ++ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; ++ ++ /* ++ * If the record is older than dirty_expire_interval, ++ * writeback on it has already started. No need to kick it ++ * off again. Also, don't start a new one if there's ++ * already one in flight. ++ */ ++ if (time_after64(frn->at, now - intv) && ++ atomic_read(&frn->done.cnt) == 1) { ++ frn->at = 0; ++ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); ++ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, ++ WB_REASON_FOREIGN_FLUSH, ++ &frn->done); ++ } ++ } ++} ++ ++#else /* CONFIG_CGROUP_WRITEBACK */ ++ ++static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) ++{ ++ return 0; ++} ++ ++static void memcg_wb_domain_exit(struct mem_cgroup *memcg) ++{ ++} ++ ++static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) ++{ ++} ++ ++#endif /* CONFIG_CGROUP_WRITEBACK */ ++ ++#ifndef CONFIG_PREEMPT_RT ++/* ++ * DO NOT USE IN NEW FILES. ++ * ++ * "cgroup.event_control" implementation. ++ * ++ * This is way over-engineered. It tries to support fully configurable ++ * events for each user. Such level of flexibility is completely ++ * unnecessary especially in the light of the planned unified hierarchy. ++ * ++ * Please deprecate this and replace with something simpler if at all ++ * possible. ++ */ ++ ++static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, ++ enum mem_cgroup_events_target target) ++{ ++ unsigned long val, next; ++ ++ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); ++ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); ++ /* from time_after() in jiffies.h */ ++ if ((long)(next - val) < 0) { ++ switch (target) { ++ case MEM_CGROUP_TARGET_THRESH: ++ next = val + THRESHOLDS_EVENTS_TARGET; ++ break; ++ case MEM_CGROUP_TARGET_SOFTLIMIT: ++ next = val + SOFTLIMIT_EVENTS_TARGET; ++ break; ++ default: ++ break; ++ } ++ __this_cpu_write(memcg->vmstats_percpu->targets[target], next); ++ return true; ++ } ++ return false; ++} ++ ++static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) ++{ ++ unsigned long excess; ++ struct mem_cgroup_per_node *mz; ++ struct mem_cgroup_tree_per_node *mctz; ++ ++ mctz = soft_limit_tree.rb_tree_per_node[nid]; ++ if (!mctz) ++ return; ++ /* ++ * Necessary to update all ancestors when hierarchy is used. ++ * because their event counter is not touched. ++ */ ++ for (; memcg; memcg = parent_mem_cgroup(memcg)) { ++ mz = memcg->nodeinfo[nid]; ++ excess = soft_limit_excess(memcg); ++ /* ++ * We have to update the tree if mz is on RB-tree or ++ * mem is over its softlimit. ++ */ ++ if (excess || mz->on_tree) { ++ unsigned long flags; ++ ++ spin_lock_irqsave(&mctz->lock, flags); ++ /* if on-tree, remove it */ ++ if (mz->on_tree) ++ __mem_cgroup_remove_exceeded(mz, mctz); ++ /* ++ * Insert again. mz->usage_in_excess will be updated. ++ * If excess is 0, no tree ops. ++ */ ++ __mem_cgroup_insert_exceeded(mz, mctz, excess); ++ spin_unlock_irqrestore(&mctz->lock, flags); ++ } ++ } ++} ++ ++static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) ++{ ++ struct mem_cgroup_threshold_ary *t; ++ unsigned long usage; ++ int i; ++ ++ rcu_read_lock(); ++ if (!swap) ++ t = rcu_dereference(memcg->thresholds.primary); ++ else ++ t = rcu_dereference(memcg->memsw_thresholds.primary); ++ ++ if (!t) ++ goto unlock; ++ ++ usage = mem_cgroup_usage(memcg, swap); ++ ++ /* ++ * current_threshold points to threshold just below or equal to usage. ++ * If it's not true, a threshold was crossed after last ++ * call of __mem_cgroup_threshold(). ++ */ ++ i = t->current_threshold; ++ ++ /* ++ * Iterate backward over array of thresholds starting from ++ * current_threshold and check if a threshold is crossed. ++ * If none of thresholds below usage is crossed, we read ++ * only one element of the array here. ++ */ ++ for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) ++ eventfd_signal(t->entries[i].eventfd, 1); ++ ++ /* i = current_threshold + 1 */ ++ i++; ++ ++ /* ++ * Iterate forward over array of thresholds starting from ++ * current_threshold+1 and check if a threshold is crossed. ++ * If none of thresholds above usage is crossed, we read ++ * only one element of the array here. ++ */ ++ for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) ++ eventfd_signal(t->entries[i].eventfd, 1); ++ ++ /* Update current_threshold */ ++ t->current_threshold = i - 1; ++unlock: ++ rcu_read_unlock(); ++} ++ ++static void mem_cgroup_threshold(struct mem_cgroup *memcg) ++{ ++ while (memcg) { ++ __mem_cgroup_threshold(memcg, false); ++ if (do_memsw_account()) ++ __mem_cgroup_threshold(memcg, true); ++ ++ memcg = parent_mem_cgroup(memcg); ++ } ++} ++ ++/* ++ * Check events in order. ++ * ++ */ ++static void memcg_check_events(struct mem_cgroup *memcg, int nid) ++{ ++ /* threshold event is triggered in finer grain than soft limit */ ++ if (unlikely(mem_cgroup_event_ratelimit(memcg, ++ MEM_CGROUP_TARGET_THRESH))) { ++ bool do_softlimit; ++ ++ do_softlimit = mem_cgroup_event_ratelimit(memcg, ++ MEM_CGROUP_TARGET_SOFTLIMIT); ++ mem_cgroup_threshold(memcg); ++ if (unlikely(do_softlimit)) ++ mem_cgroup_update_tree(memcg, nid); ++ } ++} ++ ++static int compare_thresholds(const void *a, const void *b) ++{ ++ const struct mem_cgroup_threshold *_a = a; ++ const struct mem_cgroup_threshold *_b = b; ++ ++ if (_a->threshold > _b->threshold) ++ return 1; ++ ++ if (_a->threshold < _b->threshold) ++ return -1; ++ ++ return 0; ++} ++ ++static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, ++ struct eventfd_ctx *eventfd, const char *args, enum res_type type) ++{ ++ struct mem_cgroup_thresholds *thresholds; ++ struct mem_cgroup_threshold_ary *new; ++ unsigned long threshold; ++ unsigned long usage; ++ int i, size, ret; ++ ++ ret = page_counter_memparse(args, "-1", &threshold); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&memcg->thresholds_lock); ++ ++ if (type == _MEM) { + thresholds = &memcg->thresholds; + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { +@@ -4256,384 +4575,131 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + + mutex_lock(&memcg->thresholds_lock); + +- if (type == _MEM) { +- thresholds = &memcg->thresholds; +- usage = mem_cgroup_usage(memcg, false); +- } else if (type == _MEMSWAP) { +- thresholds = &memcg->memsw_thresholds; +- usage = mem_cgroup_usage(memcg, true); +- } else +- BUG(); +- +- if (!thresholds->primary) +- goto unlock; +- +- /* Check if a threshold crossed before removing */ +- __mem_cgroup_threshold(memcg, type == _MEMSWAP); +- +- /* Calculate new number of threshold */ +- size = entries = 0; +- for (i = 0; i < thresholds->primary->size; i++) { +- if (thresholds->primary->entries[i].eventfd != eventfd) +- size++; +- else +- entries++; +- } +- +- new = thresholds->spare; +- +- /* If no items related to eventfd have been cleared, nothing to do */ +- if (!entries) +- goto unlock; +- +- /* Set thresholds array to NULL if we don't have thresholds */ +- if (!size) { +- kfree(new); +- new = NULL; +- goto swap_buffers; +- } +- +- new->size = size; +- +- /* Copy thresholds and find current threshold */ +- new->current_threshold = -1; +- for (i = 0, j = 0; i < thresholds->primary->size; i++) { +- if (thresholds->primary->entries[i].eventfd == eventfd) +- continue; +- +- new->entries[j] = thresholds->primary->entries[i]; +- if (new->entries[j].threshold <= usage) { +- /* +- * new->current_threshold will not be used +- * until rcu_assign_pointer(), so it's safe to increment +- * it here. +- */ +- ++new->current_threshold; +- } +- j++; +- } +- +-swap_buffers: +- /* Swap primary and spare array */ +- thresholds->spare = thresholds->primary; +- +- rcu_assign_pointer(thresholds->primary, new); +- +- /* To be sure that nobody uses thresholds */ +- synchronize_rcu(); +- +- /* If all events are unregistered, free the spare array */ +- if (!new) { +- kfree(thresholds->spare); +- thresholds->spare = NULL; +- } +-unlock: +- mutex_unlock(&memcg->thresholds_lock); +-} +- +-static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, +- struct eventfd_ctx *eventfd) +-{ +- return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); +-} +- +-static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, +- struct eventfd_ctx *eventfd) +-{ +- return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); +-} +- +-static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, +- struct eventfd_ctx *eventfd, const char *args) +-{ +- struct mem_cgroup_eventfd_list *event; +- +- event = kmalloc(sizeof(*event), GFP_KERNEL); +- if (!event) +- return -ENOMEM; +- +- spin_lock(&memcg_oom_lock); +- +- event->eventfd = eventfd; +- list_add(&event->list, &memcg->oom_notify); +- +- /* already in OOM ? */ +- if (memcg->under_oom) +- eventfd_signal(eventfd, 1); +- spin_unlock(&memcg_oom_lock); +- +- return 0; +-} +- +-static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, +- struct eventfd_ctx *eventfd) +-{ +- struct mem_cgroup_eventfd_list *ev, *tmp; +- +- spin_lock(&memcg_oom_lock); +- +- list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { +- if (ev->eventfd == eventfd) { +- list_del(&ev->list); +- kfree(ev); +- } +- } +- +- spin_unlock(&memcg_oom_lock); +-} +- +-static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) +-{ +- struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); +- +- seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); +- seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); +- seq_printf(sf, "oom_kill %lu\n", +- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); +- return 0; +-} +- +-static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, +- struct cftype *cft, u64 val) +-{ +- struct mem_cgroup *memcg = mem_cgroup_from_css(css); +- +- /* cannot set to root cgroup and only 0 and 1 are allowed */ +- if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) +- return -EINVAL; +- +- memcg->oom_kill_disable = val; +- if (!val) +- memcg_oom_recover(memcg); +- +- return 0; +-} +- +-#ifdef CONFIG_CGROUP_WRITEBACK +- +-#include <trace/events/writeback.h> +- +-static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +-{ +- return wb_domain_init(&memcg->cgwb_domain, gfp); +-} +- +-static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +-{ +- wb_domain_exit(&memcg->cgwb_domain); +-} +- +-static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +-{ +- wb_domain_size_changed(&memcg->cgwb_domain); +-} +- +-struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +-{ +- struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); +- +- if (!memcg->css.parent) +- return NULL; +- +- return &memcg->cgwb_domain; +-} +- +-/** +- * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg +- * @wb: bdi_writeback in question +- * @pfilepages: out parameter for number of file pages +- * @pheadroom: out parameter for number of allocatable pages according to memcg +- * @pdirty: out parameter for number of dirty pages +- * @pwriteback: out parameter for number of pages under writeback +- * +- * Determine the numbers of file, headroom, dirty, and writeback pages in +- * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom +- * is a bit more involved. +- * +- * A memcg's headroom is "min(max, high) - used". In the hierarchy, the +- * headroom is calculated as the lowest headroom of itself and the +- * ancestors. Note that this doesn't consider the actual amount of +- * available memory in the system. The caller should further cap +- * *@pheadroom accordingly. +- */ +-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, +- unsigned long *pheadroom, unsigned long *pdirty, +- unsigned long *pwriteback) +-{ +- struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); +- struct mem_cgroup *parent; +- +- mem_cgroup_flush_stats(); ++ if (type == _MEM) { ++ thresholds = &memcg->thresholds; ++ usage = mem_cgroup_usage(memcg, false); ++ } else if (type == _MEMSWAP) { ++ thresholds = &memcg->memsw_thresholds; ++ usage = mem_cgroup_usage(memcg, true); ++ } else ++ BUG(); + +- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); +- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); +- *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + +- memcg_page_state(memcg, NR_ACTIVE_FILE); ++ if (!thresholds->primary) ++ goto unlock; + +- *pheadroom = PAGE_COUNTER_MAX; +- while ((parent = parent_mem_cgroup(memcg))) { +- unsigned long ceiling = min(READ_ONCE(memcg->memory.max), +- READ_ONCE(memcg->memory.high)); +- unsigned long used = page_counter_read(&memcg->memory); ++ /* Check if a threshold crossed before removing */ ++ __mem_cgroup_threshold(memcg, type == _MEMSWAP); + +- *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); +- memcg = parent; ++ /* Calculate new number of threshold */ ++ size = entries = 0; ++ for (i = 0; i < thresholds->primary->size; i++) { ++ if (thresholds->primary->entries[i].eventfd != eventfd) ++ size++; ++ else ++ entries++; + } +-} + +-/* +- * Foreign dirty flushing +- * +- * There's an inherent mismatch between memcg and writeback. The former +- * tracks ownership per-page while the latter per-inode. This was a +- * deliberate design decision because honoring per-page ownership in the +- * writeback path is complicated, may lead to higher CPU and IO overheads +- * and deemed unnecessary given that write-sharing an inode across +- * different cgroups isn't a common use-case. +- * +- * Combined with inode majority-writer ownership switching, this works well +- * enough in most cases but there are some pathological cases. For +- * example, let's say there are two cgroups A and B which keep writing to +- * different but confined parts of the same inode. B owns the inode and +- * A's memory is limited far below B's. A's dirty ratio can rise enough to +- * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid +- * triggering background writeback. A will be slowed down without a way to +- * make writeback of the dirty pages happen. +- * +- * Conditions like the above can lead to a cgroup getting repeatedly and +- * severely throttled after making some progress after each +- * dirty_expire_interval while the underlying IO device is almost +- * completely idle. +- * +- * Solving this problem completely requires matching the ownership tracking +- * granularities between memcg and writeback in either direction. However, +- * the more egregious behaviors can be avoided by simply remembering the +- * most recent foreign dirtying events and initiating remote flushes on +- * them when local writeback isn't enough to keep the memory clean enough. +- * +- * The following two functions implement such mechanism. When a foreign +- * page - a page whose memcg and writeback ownerships don't match - is +- * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning +- * bdi_writeback on the page owning memcg. When balance_dirty_pages() +- * decides that the memcg needs to sleep due to high dirty ratio, it calls +- * mem_cgroup_flush_foreign() which queues writeback on the recorded +- * foreign bdi_writebacks which haven't expired. Both the numbers of +- * recorded bdi_writebacks and concurrent in-flight foreign writebacks are +- * limited to MEMCG_CGWB_FRN_CNT. +- * +- * The mechanism only remembers IDs and doesn't hold any object references. +- * As being wrong occasionally doesn't matter, updates and accesses to the +- * records are lockless and racy. +- */ +-void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, +- struct bdi_writeback *wb) +-{ +- struct mem_cgroup *memcg = folio_memcg(folio); +- struct memcg_cgwb_frn *frn; +- u64 now = get_jiffies_64(); +- u64 oldest_at = now; +- int oldest = -1; +- int i; ++ new = thresholds->spare; + +- trace_track_foreign_dirty(folio, wb); ++ /* If no items related to eventfd have been cleared, nothing to do */ ++ if (!entries) ++ goto unlock; + +- /* +- * Pick the slot to use. If there is already a slot for @wb, keep +- * using it. If not replace the oldest one which isn't being +- * written out. +- */ +- for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { +- frn = &memcg->cgwb_frn[i]; +- if (frn->bdi_id == wb->bdi->id && +- frn->memcg_id == wb->memcg_css->id) +- break; +- if (time_before64(frn->at, oldest_at) && +- atomic_read(&frn->done.cnt) == 1) { +- oldest = i; +- oldest_at = frn->at; +- } ++ /* Set thresholds array to NULL if we don't have thresholds */ ++ if (!size) { ++ kfree(new); ++ new = NULL; ++ goto swap_buffers; + } + +- if (i < MEMCG_CGWB_FRN_CNT) { +- /* +- * Re-using an existing one. Update timestamp lazily to +- * avoid making the cacheline hot. We want them to be +- * reasonably up-to-date and significantly shorter than +- * dirty_expire_interval as that's what expires the record. +- * Use the shorter of 1s and dirty_expire_interval / 8. +- */ +- unsigned long update_intv = +- min_t(unsigned long, HZ, +- msecs_to_jiffies(dirty_expire_interval * 10) / 8); ++ new->size = size; + +- if (time_before64(frn->at, now - update_intv)) +- frn->at = now; +- } else if (oldest >= 0) { +- /* replace the oldest free one */ +- frn = &memcg->cgwb_frn[oldest]; +- frn->bdi_id = wb->bdi->id; +- frn->memcg_id = wb->memcg_css->id; +- frn->at = now; ++ /* Copy thresholds and find current threshold */ ++ new->current_threshold = -1; ++ for (i = 0, j = 0; i < thresholds->primary->size; i++) { ++ if (thresholds->primary->entries[i].eventfd == eventfd) ++ continue; ++ ++ new->entries[j] = thresholds->primary->entries[i]; ++ if (new->entries[j].threshold <= usage) { ++ /* ++ * new->current_threshold will not be used ++ * until rcu_assign_pointer(), so it's safe to increment ++ * it here. ++ */ ++ ++new->current_threshold; ++ } ++ j++; + } +-} + +-/* issue foreign writeback flushes for recorded foreign dirtying events */ +-void mem_cgroup_flush_foreign(struct bdi_writeback *wb) +-{ +- struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); +- unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); +- u64 now = jiffies_64; +- int i; ++swap_buffers: ++ /* Swap primary and spare array */ ++ thresholds->spare = thresholds->primary; + +- for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { +- struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; ++ rcu_assign_pointer(thresholds->primary, new); + +- /* +- * If the record is older than dirty_expire_interval, +- * writeback on it has already started. No need to kick it +- * off again. Also, don't start a new one if there's +- * already one in flight. +- */ +- if (time_after64(frn->at, now - intv) && +- atomic_read(&frn->done.cnt) == 1) { +- frn->at = 0; +- trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); +- cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, +- WB_REASON_FOREIGN_FLUSH, +- &frn->done); +- } ++ /* To be sure that nobody uses thresholds */ ++ synchronize_rcu(); ++ ++ /* If all events are unregistered, free the spare array */ ++ if (!new) { ++ kfree(thresholds->spare); ++ thresholds->spare = NULL; + } ++unlock: ++ mutex_unlock(&memcg->thresholds_lock); + } + +-#else /* CONFIG_CGROUP_WRITEBACK */ +- +-static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) ++static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, ++ struct eventfd_ctx *eventfd) + { +- return 0; ++ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); + } + +-static void memcg_wb_domain_exit(struct mem_cgroup *memcg) ++static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, ++ struct eventfd_ctx *eventfd) + { ++ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); + } + +-static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) ++static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, ++ struct eventfd_ctx *eventfd, const char *args) + { ++ struct mem_cgroup_eventfd_list *event; ++ ++ event = kmalloc(sizeof(*event), GFP_KERNEL); ++ if (!event) ++ return -ENOMEM; ++ ++ spin_lock(&memcg_oom_lock); ++ ++ event->eventfd = eventfd; ++ list_add(&event->list, &memcg->oom_notify); ++ ++ /* already in OOM ? */ ++ if (memcg->under_oom) ++ eventfd_signal(eventfd, 1); ++ spin_unlock(&memcg_oom_lock); ++ ++ return 0; + } + +-#endif /* CONFIG_CGROUP_WRITEBACK */ ++static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, ++ struct eventfd_ctx *eventfd) ++{ ++ struct mem_cgroup_eventfd_list *ev, *tmp; + +-/* +- * DO NOT USE IN NEW FILES. +- * +- * "cgroup.event_control" implementation. +- * +- * This is way over-engineered. It tries to support fully configurable +- * events for each user. Such level of flexibility is completely +- * unnecessary especially in the light of the planned unified hierarchy. +- * +- * Please deprecate this and replace with something simpler if at all +- * possible. +- */ ++ spin_lock(&memcg_oom_lock); ++ ++ list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { ++ if (ev->eventfd == eventfd) { ++ list_del(&ev->list); ++ kfree(ev); ++ } ++ } ++ ++ spin_unlock(&memcg_oom_lock); ++} + + /* + * Unregister event and free resources. +@@ -4845,6 +4911,18 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + return ret; + } + ++#else ++ ++static ssize_t memcg_write_event_control(struct kernfs_open_file *of, ++ char *buf, size_t nbytes, loff_t off) ++{ ++ return -EOPNOTSUPP; ++} ++ ++static void memcg_check_events(struct mem_cgroup *memcg, int nid) { } ++ ++#endif ++ + static struct cftype mem_cgroup_legacy_files[] = { + { + .name = "usage_in_bytes", +@@ -7017,7 +7095,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) + + mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); + +- refill_stock(memcg, nr_pages); ++ refill_stock(memcg, nr_pages, false); } - /* + static int __init cgroup_memory(char *s) +@@ -7157,9 +7235,18 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + * i_pages lock which is taken with interrupts-off. It is + * important here to have the interrupts disabled because it is the + * only synchronisation we have for updating the per-CPU variables. ++ * On PREEMPT_RT interrupts are never disabled and the updates to per-CPU ++ * variables are synchronised by keeping preemption disabled. + */ +- VM_BUG_ON(!irqs_disabled()); +- mem_cgroup_charge_statistics(memcg, -nr_entries); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ VM_BUG_ON(!irqs_disabled()); ++ mem_cgroup_charge_statistics(memcg, -nr_entries); ++ } else { ++ preempt_disable(); ++ mem_cgroup_charge_statistics(memcg, -nr_entries); ++ preempt_enable(); ++ } ++ + memcg_check_events(memcg, page_to_nid(page)); + + css_put(&memcg->css); diff --git a/mm/vmalloc.c b/mm/vmalloc.c -index e8a807c78110..3c5be440ab30 100644 +index d2a00ad4e1dd..ce8dcad7639f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c -@@ -1918,11 +1918,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) +@@ -1922,11 +1922,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) return ERR_PTR(err); } @@ -9369,7 +9836,7 @@ index e8a807c78110..3c5be440ab30 100644 return vaddr; } -@@ -2001,7 +2002,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) +@@ -2005,7 +2006,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) order = get_order(size); rcu_read_lock(); @@ -9379,7 +9846,7 @@ index e8a807c78110..3c5be440ab30 100644 list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; -@@ -2024,7 +2026,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) +@@ -2028,7 +2030,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) break; } @@ -9389,7 +9856,7 @@ index e8a807c78110..3c5be440ab30 100644 /* Allocate new block if nothing was found */ diff --git a/mm/workingset.c b/mm/workingset.c -index d5b81e4f4cbe..d7ceee62e3c6 100644 +index 8c03afe1d67c..4579883eb109 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -433,6 +433,8 @@ static struct list_lru shadow_nodes; @@ -9412,10 +9879,25 @@ index d5b81e4f4cbe..d7ceee62e3c6 100644 if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c -index b897ce3b399a..6a58c1df0cc7 100644 +index b897ce3b399a..7e03cc9363bb 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c -@@ -57,6 +57,7 @@ +@@ -30,6 +30,14 @@ + + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + ++/* ++ * lock ordering: ++ * page_lock ++ * pool->migrate_lock ++ * class->lock ++ * zspage->lock ++ */ ++ + #include <linux/module.h> + #include <linux/kernel.h> + #include <linux/sched.h> +@@ -57,6 +65,7 @@ #include <linux/wait.h> #include <linux/pagemap.h> #include <linux/fs.h> @@ -9423,28 +9905,82 @@ index b897ce3b399a..6a58c1df0cc7 100644 #define ZSPAGE_MAGIC 0x58 -@@ -77,6 +78,20 @@ +@@ -100,15 +109,6 @@ - #define ZS_HANDLE_SIZE (sizeof(unsigned long)) + #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) -+#ifdef CONFIG_PREEMPT_RT -+ -+struct zsmalloc_handle { -+ unsigned long addr; -+ spinlock_t lock; -+}; -+ -+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle)) -+ -+#else -+ -+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long)) -+#endif -+ +-/* +- * Memory for allocating for handle keeps object position by +- * encoding <page, obj_idx> and the encoded value has a room +- * in least bit(ie, look at obj_to_location). +- * We use the bit to synchronize between object access by +- * user and migration. +- */ +-#define HANDLE_PIN_BIT 0 +- + /* + * Head in allocated object should have OBJ_ALLOCATED_TAG + * to identify the object was allocated or not. +@@ -121,6 +121,7 @@ + #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) + #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) + ++#define HUGE_BITS 1 + #define FULLNESS_BITS 2 + #define CLASS_BITS 8 + #define ISOLATED_BITS 3 +@@ -158,7 +159,7 @@ enum fullness_group { + NR_ZS_FULLNESS, + }; + +-enum zs_stat_type { ++enum class_stat_type { + CLASS_EMPTY, + CLASS_ALMOST_EMPTY, + CLASS_ALMOST_FULL, +@@ -213,22 +214,6 @@ struct size_class { + struct zs_size_stat stats; + }; + +-/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +-static void SetPageHugeObject(struct page *page) +-{ +- SetPageOwnerPriv1(page); +-} +- +-static void ClearPageHugeObject(struct page *page) +-{ +- ClearPageOwnerPriv1(page); +-} +- +-static int PageHugeObject(struct page *page) +-{ +- return PageOwnerPriv1(page); +-} +- /* - * Object location (<PFN>, <obj_idx>) is encoded as - * a single (unsigned long) handle value. -@@ -293,6 +308,7 @@ struct zspage { + * Placed within free objects to form a singly linked list. + * For every zspage, zspage->freeobj gives head of this list. +@@ -269,15 +254,14 @@ struct zs_pool { + #ifdef CONFIG_COMPACTION + struct inode *inode; + struct work_struct free_work; +- /* A wait queue for when migration races with async_free_zspage() */ +- struct wait_queue_head migration_wait; +- atomic_long_t isolated_pages; +- bool destroying; + #endif ++ /* protect page/zspage migration */ ++ rwlock_t migrate_lock; + }; + + struct zspage { + struct { ++ unsigned int huge:HUGE_BITS; + unsigned int fullness:FULLNESS_BITS; + unsigned int class:CLASS_BITS + 1; + unsigned int isolated:ISOLATED_BITS; +@@ -293,17 +277,32 @@ struct zspage { }; struct mapping_area { @@ -9452,190 +9988,1094 @@ index b897ce3b399a..6a58c1df0cc7 100644 char *vm_buf; /* copy buffer for objects that span pages */ char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ -@@ -322,7 +338,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} + }; - static int create_cache(struct zs_pool *pool) ++/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ ++static void SetZsHugePage(struct zspage *zspage) ++{ ++ zspage->huge = 1; ++} ++ ++static bool ZsHugePage(struct zspage *zspage) ++{ ++ return zspage->huge; ++} ++ + #ifdef CONFIG_COMPACTION + static int zs_register_migration(struct zs_pool *pool); + static void zs_unregister_migration(struct zs_pool *pool); + static void migrate_lock_init(struct zspage *zspage); + static void migrate_read_lock(struct zspage *zspage); + static void migrate_read_unlock(struct zspage *zspage); ++static void migrate_write_lock(struct zspage *zspage); ++static void migrate_write_lock_nested(struct zspage *zspage); ++static void migrate_write_unlock(struct zspage *zspage); + static void kick_deferred_free(struct zs_pool *pool); + static void init_deferred_free(struct zs_pool *pool); + static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); +@@ -315,6 +314,9 @@ static void zs_unregister_migration(struct zs_pool *pool) {} + static void migrate_lock_init(struct zspage *zspage) {} + static void migrate_read_lock(struct zspage *zspage) {} + static void migrate_read_unlock(struct zspage *zspage) {} ++static void migrate_write_lock(struct zspage *zspage) {} ++static void migrate_write_lock_nested(struct zspage *zspage) {} ++static void migrate_write_unlock(struct zspage *zspage) {} + static void kick_deferred_free(struct zs_pool *pool) {} + static void init_deferred_free(struct zs_pool *pool) {} + static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} +@@ -366,14 +368,10 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) + kmem_cache_free(pool->zspage_cachep, zspage); + } + ++/* class->lock(which owns the handle) synchronizes races */ + static void record_obj(unsigned long handle, unsigned long obj) { -- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, -+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE, - 0, 0, NULL); - if (!pool->handle_cachep) - return 1; -@@ -346,10 +362,27 @@ static void destroy_cache(struct zs_pool *pool) +- /* +- * lsb of @obj represents handle lock while other bits +- * represent object value the handle is pointing so +- * updating shouldn't do store tearing. +- */ +- WRITE_ONCE(*(unsigned long *)handle, obj); ++ *(unsigned long *)handle = obj; + } + + /* zpool driver */ +@@ -455,12 +453,9 @@ MODULE_ALIAS("zpool-zsmalloc"); + #endif /* CONFIG_ZPOOL */ - static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ +-static DEFINE_PER_CPU(struct mapping_area, zs_map_area); +- +-static bool is_zspage_isolated(struct zspage *zspage) +-{ +- return zspage->isolated; +-} ++static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static __maybe_unused int is_first_page(struct page *page) { -- return (unsigned long)kmem_cache_alloc(pool->handle_cachep, -- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); -+ void *p; +@@ -517,6 +512,12 @@ static void get_zspage_mapping(struct zspage *zspage, + *class_idx = zspage->class; + } + ++static struct size_class *zspage_class(struct zs_pool *pool, ++ struct zspage *zspage) ++{ ++ return pool->size_class[zspage->class]; ++} + -+ p = kmem_cache_alloc(pool->handle_cachep, -+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); -+#ifdef CONFIG_PREEMPT_RT -+ if (p) { -+ struct zsmalloc_handle *zh = p; + static void set_zspage_mapping(struct zspage *zspage, + unsigned int class_idx, + enum fullness_group fullness) +@@ -543,21 +544,21 @@ static int get_size_class_index(int size) + return min_t(int, ZS_SIZE_CLASSES - 1, idx); + } + +-/* type can be of enum type zs_stat_type or fullness_group */ +-static inline void zs_stat_inc(struct size_class *class, ++/* type can be of enum type class_stat_type or fullness_group */ ++static inline void class_stat_inc(struct size_class *class, + int type, unsigned long cnt) + { + class->stats.objs[type] += cnt; + } + +-/* type can be of enum type zs_stat_type or fullness_group */ +-static inline void zs_stat_dec(struct size_class *class, ++/* type can be of enum type class_stat_type or fullness_group */ ++static inline void class_stat_dec(struct size_class *class, + int type, unsigned long cnt) + { + class->stats.objs[type] -= cnt; + } + +-/* type can be of enum type zs_stat_type or fullness_group */ ++/* type can be of enum type class_stat_type or fullness_group */ + static inline unsigned long zs_stat_get(struct size_class *class, + int type) + { +@@ -719,7 +720,7 @@ static void insert_zspage(struct size_class *class, + { + struct zspage *head; + +- zs_stat_inc(class, fullness, 1); ++ class_stat_inc(class, fullness, 1); + head = list_first_entry_or_null(&class->fullness_list[fullness], + struct zspage, list); + /* +@@ -741,10 +742,9 @@ static void remove_zspage(struct size_class *class, + enum fullness_group fullness) + { + VM_BUG_ON(list_empty(&class->fullness_list[fullness])); +- VM_BUG_ON(is_zspage_isolated(zspage)); + + list_del_init(&zspage->list); +- zs_stat_dec(class, fullness, 1); ++ class_stat_dec(class, fullness, 1); + } + + /* +@@ -767,13 +767,9 @@ static enum fullness_group fix_fullness_group(struct size_class *class, + if (newfg == currfg) + goto out; + +- if (!is_zspage_isolated(zspage)) { +- remove_zspage(class, zspage, currfg); +- insert_zspage(class, zspage, newfg); +- } +- ++ remove_zspage(class, zspage, currfg); ++ insert_zspage(class, zspage, newfg); + set_zspage_mapping(zspage, class_idx, newfg); +- + out: + return newfg; + } +@@ -824,7 +820,9 @@ static struct zspage *get_zspage(struct page *page) + + static struct page *get_next_page(struct page *page) + { +- if (unlikely(PageHugeObject(page))) ++ struct zspage *zspage = get_zspage(page); + -+ spin_lock_init(&zh->lock); -+ } -+#endif -+ return (unsigned long)p; ++ if (unlikely(ZsHugePage(zspage))) + return NULL; + + return page->freelist; +@@ -844,6 +842,12 @@ static void obj_to_location(unsigned long obj, struct page **page, + *obj_idx = (obj & OBJ_INDEX_MASK); } -+#ifdef CONFIG_PREEMPT_RT -+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle) ++static void obj_to_page(unsigned long obj, struct page **page) +{ -+ return (void *)(handle & ~((1 << OBJ_TAG_BITS) - 1)); ++ obj >>= OBJ_TAG_BITS; ++ *page = pfn_to_page(obj >> OBJ_INDEX_BITS); ++} ++ + /** + * location_to_obj - get obj value encoded from (<page>, <obj_idx>) + * @page: page object resides in zspage +@@ -865,33 +869,22 @@ static unsigned long handle_to_obj(unsigned long handle) + return *(unsigned long *)handle; + } + +-static unsigned long obj_to_head(struct page *page, void *obj) ++static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) + { +- if (unlikely(PageHugeObject(page))) { ++ unsigned long handle; ++ struct zspage *zspage = get_zspage(page); ++ ++ if (unlikely(ZsHugePage(zspage))) { + VM_BUG_ON_PAGE(!is_first_page(page), page); +- return page->index; ++ handle = page->index; + } else +- return *(unsigned long *)obj; +-} +- +-static inline int testpin_tag(unsigned long handle) +-{ +- return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); +-} +- +-static inline int trypin_tag(unsigned long handle) +-{ +- return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); +-} ++ handle = *(unsigned long *)obj; + +-static void pin_tag(unsigned long handle) __acquires(bitlock) +-{ +- bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); +-} ++ if (!(handle & OBJ_ALLOCATED_TAG)) ++ return false; + +-static void unpin_tag(unsigned long handle) __releases(bitlock) +-{ +- bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); ++ *phandle = handle & ~OBJ_ALLOCATED_TAG; ++ return true; + } + + static void reset_page(struct page *page) +@@ -900,7 +893,6 @@ static void reset_page(struct page *page) + ClearPagePrivate(page); + set_page_private(page, 0); + page_mapcount_reset(page); +- ClearPageHugeObject(page); + page->freelist = NULL; + } + +@@ -952,7 +944,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, + + cache_free_zspage(pool, zspage); + +- zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); ++ class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + } +@@ -963,6 +955,11 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, + VM_BUG_ON(get_zspage_inuse(zspage)); + VM_BUG_ON(list_empty(&zspage->list)); + ++ /* ++ * Since zs_free couldn't be sleepable, this function cannot call ++ * lock_page. The page locks trylock_zspage got will be released ++ * by __free_zspage. ++ */ + if (!trylock_zspage(zspage)) { + kick_deferred_free(pool); + return; +@@ -1042,7 +1039,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, + SetPagePrivate(page); + if (unlikely(class->objs_per_zspage == 1 && + class->pages_per_zspage == 1)) +- SetPageHugeObject(page); ++ SetZsHugePage(zspage); + } else { + prev_page->freelist = page; + } +@@ -1246,8 +1243,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, + unsigned long obj, off; + unsigned int obj_idx; + +- unsigned int class_idx; +- enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + struct page *pages[2]; +@@ -1260,21 +1255,26 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, + */ + BUG_ON(in_interrupt()); + +- /* From now on, migration cannot move the object */ +- pin_tag(handle); +- ++ /* It guarantees it can get zspage from handle safely */ ++ read_lock(&pool->migrate_lock); + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); + zspage = get_zspage(page); + +- /* migration cannot move any subpage in this zspage */ ++ /* ++ * migration cannot move any zpages in this zspage. Here, class->lock ++ * is too heavy since callers would take some time until they calls ++ * zs_unmap_object API so delegate the locking from class to zspage ++ * which is smaller granularity. ++ */ + migrate_read_lock(zspage); ++ read_unlock(&pool->migrate_lock); + +- get_zspage_mapping(zspage, &class_idx, &fg); +- class = pool->size_class[class_idx]; ++ class = zspage_class(pool, zspage); + off = (class->size * obj_idx) & ~PAGE_MASK; + +- area = &get_cpu_var(zs_map_area); ++ local_lock(&zs_map_area.lock); ++ area = this_cpu_ptr(&zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ +@@ -1290,7 +1290,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, + + ret = __zs_map_object(area, pages, off, class->size); + out: +- if (likely(!PageHugeObject(page))) ++ if (likely(!ZsHugePage(zspage))) + ret += ZS_HANDLE_SIZE; + + return ret; +@@ -1304,16 +1304,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) + unsigned long obj, off; + unsigned int obj_idx; + +- unsigned int class_idx; +- enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); + zspage = get_zspage(page); +- get_zspage_mapping(zspage, &class_idx, &fg); +- class = pool->size_class[class_idx]; ++ class = zspage_class(pool, zspage); + off = (class->size * obj_idx) & ~PAGE_MASK; + + area = this_cpu_ptr(&zs_map_area); +@@ -1328,10 +1325,9 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) + + __zs_unmap_object(area, pages, off, class->size); + } +- put_cpu_var(zs_map_area); ++ local_unlock(&zs_map_area.lock); + + migrate_read_unlock(zspage); +- unpin_tag(handle); + } + EXPORT_SYMBOL_GPL(zs_unmap_object); + +@@ -1354,17 +1350,19 @@ size_t zs_huge_class_size(struct zs_pool *pool) + } + EXPORT_SYMBOL_GPL(zs_huge_class_size); + +-static unsigned long obj_malloc(struct size_class *class, ++static unsigned long obj_malloc(struct zs_pool *pool, + struct zspage *zspage, unsigned long handle) + { + int i, nr_page, offset; + unsigned long obj; + struct link_free *link; ++ struct size_class *class; + + struct page *m_page; + unsigned long m_offset; + void *vaddr; + ++ class = pool->size_class[zspage->class]; + handle |= OBJ_ALLOCATED_TAG; + obj = get_freeobj(zspage); + +@@ -1379,7 +1377,7 @@ static unsigned long obj_malloc(struct size_class *class, + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); + set_freeobj(zspage, link->next >> OBJ_TAG_BITS); +- if (likely(!PageHugeObject(m_page))) ++ if (likely(!ZsHugePage(zspage))) + /* record handle in the header of allocated chunk */ + link->handle = handle; + else +@@ -1388,7 +1386,6 @@ static unsigned long obj_malloc(struct size_class *class, + + kunmap_atomic(vaddr); + mod_zspage_inuse(zspage, 1); +- zs_stat_inc(class, OBJ_USED, 1); + + obj = location_to_obj(m_page, obj); + +@@ -1424,13 +1421,15 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) + size += ZS_HANDLE_SIZE; + class = pool->size_class[get_size_class_index(size)]; + ++ /* class->lock effectively protects the zpage migration */ + spin_lock(&class->lock); + zspage = find_get_zspage(class); + if (likely(zspage)) { +- obj = obj_malloc(class, zspage, handle); ++ obj = obj_malloc(pool, zspage, handle); + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(class, zspage); + record_obj(handle, obj); ++ class_stat_inc(class, OBJ_USED, 1); + spin_unlock(&class->lock); + + return handle; +@@ -1445,14 +1444,15 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) + } + + spin_lock(&class->lock); +- obj = obj_malloc(class, zspage, handle); ++ obj = obj_malloc(pool, zspage, handle); + newfg = get_fullness_group(class, zspage); + insert_zspage(class, zspage, newfg); + set_zspage_mapping(zspage, class->index, newfg); + record_obj(handle, obj); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); +- zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); ++ class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); ++ class_stat_inc(class, OBJ_USED, 1); + + /* We completely set up zspage so mark them as movable */ + SetZsPageMovable(pool, zspage); +@@ -1462,7 +1462,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) + } + EXPORT_SYMBOL_GPL(zs_malloc); + +-static void obj_free(struct size_class *class, unsigned long obj) ++static void obj_free(int class_size, unsigned long obj) + { + struct link_free *link; + struct zspage *zspage; +@@ -1472,18 +1472,20 @@ static void obj_free(struct size_class *class, unsigned long obj) + void *vaddr; + + obj_to_location(obj, &f_page, &f_objidx); +- f_offset = (class->size * f_objidx) & ~PAGE_MASK; ++ f_offset = (class_size * f_objidx) & ~PAGE_MASK; + zspage = get_zspage(f_page); + + vaddr = kmap_atomic(f_page); + + /* Insert this object in containing zspage's freelist */ + link = (struct link_free *)(vaddr + f_offset); +- link->next = get_freeobj(zspage) << OBJ_TAG_BITS; ++ if (likely(!ZsHugePage(zspage))) ++ link->next = get_freeobj(zspage) << OBJ_TAG_BITS; ++ else ++ f_page->index = 0; + kunmap_atomic(vaddr); + set_freeobj(zspage, f_objidx); + mod_zspage_inuse(zspage, -1); +- zs_stat_dec(class, OBJ_USED, 1); + } + + void zs_free(struct zs_pool *pool, unsigned long handle) +@@ -1491,42 +1493,33 @@ void zs_free(struct zs_pool *pool, unsigned long handle) + struct zspage *zspage; + struct page *f_page; + unsigned long obj; +- unsigned int f_objidx; +- int class_idx; + struct size_class *class; + enum fullness_group fullness; +- bool isolated; + + if (unlikely(!handle)) + return; + +- pin_tag(handle); ++ /* ++ * The pool->migrate_lock protects the race with zpage's migration ++ * so it's safe to get the page from handle. ++ */ ++ read_lock(&pool->migrate_lock); + obj = handle_to_obj(handle); +- obj_to_location(obj, &f_page, &f_objidx); ++ obj_to_page(obj, &f_page); + zspage = get_zspage(f_page); +- +- migrate_read_lock(zspage); +- +- get_zspage_mapping(zspage, &class_idx, &fullness); +- class = pool->size_class[class_idx]; +- ++ class = zspage_class(pool, zspage); + spin_lock(&class->lock); +- obj_free(class, obj); ++ read_unlock(&pool->migrate_lock); ++ ++ obj_free(class->size, obj); ++ class_stat_dec(class, OBJ_USED, 1); + fullness = fix_fullness_group(class, zspage); +- if (fullness != ZS_EMPTY) { +- migrate_read_unlock(zspage); ++ if (fullness != ZS_EMPTY) + goto out; +- } + +- isolated = is_zspage_isolated(zspage); +- migrate_read_unlock(zspage); +- /* If zspage is isolated, zs_page_putback will free the zspage */ +- if (likely(!isolated)) +- free_zspage(pool, class, zspage); ++ free_zspage(pool, class, zspage); + out: +- + spin_unlock(&class->lock); +- unpin_tag(handle); + cache_free_handle(pool, handle); + } + EXPORT_SYMBOL_GPL(zs_free); +@@ -1601,7 +1594,6 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, + static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int *obj_idx) + { +- unsigned long head; + int offset = 0; + int index = *obj_idx; + unsigned long handle = 0; +@@ -1611,13 +1603,8 @@ static unsigned long find_alloced_obj(struct size_class *class, + offset += class->size * index; + + while (offset < PAGE_SIZE) { +- head = obj_to_head(page, addr + offset); +- if (head & OBJ_ALLOCATED_TAG) { +- handle = head & ~OBJ_ALLOCATED_TAG; +- if (trypin_tag(handle)) +- break; +- handle = 0; +- } ++ if (obj_allocated(page, addr + offset, &handle)) ++ break; + + offset += class->size; + index++; +@@ -1663,25 +1650,16 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, + + /* Stop if there is no more space */ + if (zspage_full(class, get_zspage(d_page))) { +- unpin_tag(handle); + ret = -ENOMEM; + break; + } + + used_obj = handle_to_obj(handle); +- free_obj = obj_malloc(class, get_zspage(d_page), handle); ++ free_obj = obj_malloc(pool, get_zspage(d_page), handle); + zs_object_copy(class, free_obj, used_obj); + obj_idx++; +- /* +- * record_obj updates handle's value to free_obj and it will +- * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which +- * breaks synchronization using pin_tag(e,g, zs_free) so +- * let's keep the lock bit. +- */ +- free_obj |= BIT(HANDLE_PIN_BIT); + record_obj(handle, free_obj); +- unpin_tag(handle); +- obj_free(class, used_obj); ++ obj_free(class->size, used_obj); + } + + /* Remember last position in this iteration */ +@@ -1706,7 +1684,6 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source) + zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], + struct zspage, list); + if (zspage) { +- VM_BUG_ON(is_zspage_isolated(zspage)); + remove_zspage(class, zspage, fg[i]); + return zspage; + } +@@ -1727,8 +1704,6 @@ static enum fullness_group putback_zspage(struct size_class *class, + { + enum fullness_group fullness; + +- VM_BUG_ON(is_zspage_isolated(zspage)); +- + fullness = get_fullness_group(class, zspage); + insert_zspage(class, zspage, fullness); + set_zspage_mapping(zspage, class->index, fullness); +@@ -1797,6 +1772,11 @@ static void migrate_write_lock(struct zspage *zspage) + write_lock(&zspage->lock); + } + ++static void migrate_write_lock_nested(struct zspage *zspage) ++{ ++ write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); +} -+#endif + - static void cache_free_handle(struct zs_pool *pool, unsigned long handle) + static void migrate_write_unlock(struct zspage *zspage) + { + write_unlock(&zspage->lock); +@@ -1810,35 +1790,10 @@ static void inc_zspage_isolation(struct zspage *zspage) + + static void dec_zspage_isolation(struct zspage *zspage) + { ++ VM_BUG_ON(zspage->isolated == 0); + zspage->isolated--; + } + +-static void putback_zspage_deferred(struct zs_pool *pool, +- struct size_class *class, +- struct zspage *zspage) +-{ +- enum fullness_group fg; +- +- fg = putback_zspage(class, zspage); +- if (fg == ZS_EMPTY) +- schedule_work(&pool->free_work); +- +-} +- +-static inline void zs_pool_dec_isolated(struct zs_pool *pool) +-{ +- VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); +- atomic_long_dec(&pool->isolated_pages); +- /* +- * Checking pool->destroying must happen after atomic_long_dec() +- * for pool->isolated_pages above. Paired with the smp_mb() in +- * zs_unregister_migration(). +- */ +- smp_mb__after_atomic(); +- if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) +- wake_up_all(&pool->migration_wait); +-} +- + static void replace_sub_page(struct size_class *class, struct zspage *zspage, + struct page *newpage, struct page *oldpage) + { +@@ -1857,19 +1812,14 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, + + create_page_chain(class, zspage, pages); + set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); +- if (unlikely(PageHugeObject(oldpage))) ++ if (unlikely(ZsHugePage(zspage))) + newpage->index = oldpage->index; + __SetPageMovable(newpage, page_mapping(oldpage)); + } + + static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { - kmem_cache_free(pool->handle_cachep, (void *)handle); -@@ -368,12 +401,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) +- struct zs_pool *pool; +- struct size_class *class; +- int class_idx; +- enum fullness_group fullness; + struct zspage *zspage; +- struct address_space *mapping; + + /* + * Page is locked so zspage couldn't be destroyed. For detail, look at +@@ -1879,41 +1829,9 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) + VM_BUG_ON_PAGE(PageIsolated(page), page); + + zspage = get_zspage(page); +- +- /* +- * Without class lock, fullness could be stale while class_idx is okay +- * because class_idx is constant unless page is freed so we should get +- * fullness again under class lock. +- */ +- get_zspage_mapping(zspage, &class_idx, &fullness); +- mapping = page_mapping(page); +- pool = mapping->private_data; +- class = pool->size_class[class_idx]; +- +- spin_lock(&class->lock); +- if (get_zspage_inuse(zspage) == 0) { +- spin_unlock(&class->lock); +- return false; +- } +- +- /* zspage is isolated for object migration */ +- if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { +- spin_unlock(&class->lock); +- return false; +- } +- +- /* +- * If this is first time isolation for the zspage, isolate zspage from +- * size_class to prevent further object allocation from the zspage. +- */ +- if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { +- get_zspage_mapping(zspage, &class_idx, &fullness); +- atomic_long_inc(&pool->isolated_pages); +- remove_zspage(class, zspage, fullness); +- } +- ++ migrate_write_lock(zspage); + inc_zspage_isolation(zspage); +- spin_unlock(&class->lock); ++ migrate_write_unlock(zspage); + + return true; + } +@@ -1923,16 +1841,13 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, + { + struct zs_pool *pool; + struct size_class *class; +- int class_idx; +- enum fullness_group fullness; + struct zspage *zspage; + struct page *dummy; + void *s_addr, *d_addr, *addr; +- int offset, pos; +- unsigned long handle, head; ++ int offset; ++ unsigned long handle; + unsigned long old_obj, new_obj; + unsigned int obj_idx; +- int ret = -EAGAIN; + + /* + * We cannot support the _NO_COPY case here, because copy needs to +@@ -1945,35 +1860,25 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); - static void record_obj(unsigned long handle, unsigned long obj) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); +- zspage = get_zspage(page); +- +- /* Concurrent compactor cannot migrate any subpage in zspage */ +- migrate_write_lock(zspage); +- get_zspage_mapping(zspage, &class_idx, &fullness); + pool = mapping->private_data; +- class = pool->size_class[class_idx]; +- offset = get_first_obj_offset(page); + ++ /* ++ * The pool migrate_lock protects the race between zpage migration ++ * and zs_free. ++ */ ++ write_lock(&pool->migrate_lock); ++ zspage = get_zspage(page); ++ class = zspage_class(pool, zspage); + -+ WRITE_ONCE(zh->addr, obj); -+#else ++ /* ++ * the class lock protects zpage alloc/free in the zspage. ++ */ + spin_lock(&class->lock); +- if (!get_zspage_inuse(zspage)) { +- /* +- * Set "offset" to end of the page so that every loops +- * skips unnecessary object scanning. +- */ +- offset = PAGE_SIZE; +- } ++ /* the migrate_write_lock protects zpage access via zs_map_object */ ++ migrate_write_lock(zspage); + +- pos = offset; ++ offset = get_first_obj_offset(page); + s_addr = kmap_atomic(page); +- while (pos < PAGE_SIZE) { +- head = obj_to_head(page, s_addr + pos); +- if (head & OBJ_ALLOCATED_TAG) { +- handle = head & ~OBJ_ALLOCATED_TAG; +- if (!trypin_tag(handle)) +- goto unpin_objects; +- } +- pos += class->size; +- } + /* - * lsb of @obj represents handle lock while other bits - * represent object value the handle is pointing so - * updating shouldn't do store tearing. - */ - WRITE_ONCE(*(unsigned long *)handle, obj); -+#endif - } + * Here, any user cannot access all objects in the zspage so let's move. +@@ -1982,42 +1887,30 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, + memcpy(d_addr, s_addr, PAGE_SIZE); + kunmap_atomic(d_addr); + +- for (addr = s_addr + offset; addr < s_addr + pos; ++ for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; + addr += class->size) { +- head = obj_to_head(page, addr); +- if (head & OBJ_ALLOCATED_TAG) { +- handle = head & ~OBJ_ALLOCATED_TAG; +- BUG_ON(!testpin_tag(handle)); ++ if (obj_allocated(page, addr, &handle)) { + + old_obj = handle_to_obj(handle); + obj_to_location(old_obj, &dummy, &obj_idx); + new_obj = (unsigned long)location_to_obj(newpage, + obj_idx); +- new_obj |= BIT(HANDLE_PIN_BIT); + record_obj(handle, new_obj); + } + } ++ kunmap_atomic(s_addr); - /* zpool driver */ -@@ -455,7 +494,9 @@ MODULE_ALIAS("zpool-zsmalloc"); - #endif /* CONFIG_ZPOOL */ + replace_sub_page(class, zspage, newpage, page); +- get_page(newpage); +- +- dec_zspage_isolation(zspage); +- + /* +- * Page migration is done so let's putback isolated zspage to +- * the list if @page is final isolated subpage in the zspage. ++ * Since we complete the data copy and set up new zspage structure, ++ * it's okay to release migration_lock. + */ +- if (!is_zspage_isolated(zspage)) { +- /* +- * We cannot race with zs_destroy_pool() here because we wait +- * for isolation to hit zero before we start destroying. +- * Also, we ensure that everyone can see pool->destroying before +- * we start waiting. +- */ +- putback_zspage_deferred(pool, class, zspage); +- zs_pool_dec_isolated(pool); +- } ++ write_unlock(&pool->migrate_lock); ++ spin_unlock(&class->lock); ++ dec_zspage_isolation(zspage); ++ migrate_write_unlock(zspage); + ++ get_page(newpage); + if (page_zone(newpage) != page_zone(page)) { + dec_zone_page_state(page, NR_ZSPAGES); + inc_zone_page_state(newpage, NR_ZSPAGES); +@@ -2025,55 +1918,21 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, + + reset_page(page); + put_page(page); +- page = newpage; +- +- ret = MIGRATEPAGE_SUCCESS; +-unpin_objects: +- for (addr = s_addr + offset; addr < s_addr + pos; +- addr += class->size) { +- head = obj_to_head(page, addr); +- if (head & OBJ_ALLOCATED_TAG) { +- handle = head & ~OBJ_ALLOCATED_TAG; +- BUG_ON(!testpin_tag(handle)); +- unpin_tag(handle); +- } +- } +- kunmap_atomic(s_addr); +- spin_unlock(&class->lock); +- migrate_write_unlock(zspage); - /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ --static DEFINE_PER_CPU(struct mapping_area, zs_map_area); -+static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { -+ .lock = INIT_LOCAL_LOCK(lock), -+}; +- return ret; ++ return MIGRATEPAGE_SUCCESS; + } - static bool is_zspage_isolated(struct zspage *zspage) + static void zs_page_putback(struct page *page) { -@@ -862,7 +903,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) +- struct zs_pool *pool; +- struct size_class *class; +- int class_idx; +- enum fullness_group fg; +- struct address_space *mapping; + struct zspage *zspage; - static unsigned long handle_to_obj(unsigned long handle) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); -+ -+ return zh->addr; -+#else - return *(unsigned long *)handle; -+#endif - } + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); - static unsigned long obj_to_head(struct page *page, void *obj) -@@ -876,22 +923,46 @@ static unsigned long obj_to_head(struct page *page, void *obj) + zspage = get_zspage(page); +- get_zspage_mapping(zspage, &class_idx, &fg); +- mapping = page_mapping(page); +- pool = mapping->private_data; +- class = pool->size_class[class_idx]; +- +- spin_lock(&class->lock); ++ migrate_write_lock(zspage); + dec_zspage_isolation(zspage); +- if (!is_zspage_isolated(zspage)) { +- /* +- * Due to page_lock, we cannot free zspage immediately +- * so let's defer. +- */ +- putback_zspage_deferred(pool, class, zspage); +- zs_pool_dec_isolated(pool); +- } +- spin_unlock(&class->lock); ++ migrate_write_unlock(zspage); + } - static inline int testpin_tag(unsigned long handle) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); -+ -+ return spin_is_locked(&zh->lock); -+#else - return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); -+#endif + static const struct address_space_operations zsmalloc_aops = { +@@ -2095,36 +1954,8 @@ static int zs_register_migration(struct zs_pool *pool) + return 0; } - static inline int trypin_tag(unsigned long handle) +-static bool pool_isolated_are_drained(struct zs_pool *pool) +-{ +- return atomic_long_read(&pool->isolated_pages) == 0; +-} +- +-/* Function for resolving migration */ +-static void wait_for_isolated_drain(struct zs_pool *pool) +-{ +- +- /* +- * We're in the process of destroying the pool, so there are no +- * active allocations. zs_page_isolate() fails for completely free +- * zspages, so we need only wait for the zs_pool's isolated +- * count to hit zero. +- */ +- wait_event(pool->migration_wait, +- pool_isolated_are_drained(pool)); +-} +- + static void zs_unregister_migration(struct zs_pool *pool) { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); -+ -+ return spin_trylock(&zh->lock); -+#else - return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); -+#endif +- pool->destroying = true; +- /* +- * We need a memory barrier here to ensure global visibility of +- * pool->destroying. Thus pool->isolated pages will either be 0 in which +- * case we don't care, or it will be > 0 and pool->destroying will +- * ensure that we wake up once isolation hits 0. +- */ +- smp_mb(); +- wait_for_isolated_drain(pool); /* This can block */ + flush_work(&pool->free_work); + iput(pool->inode); } +@@ -2154,7 +1985,6 @@ static void async_free_zspage(struct work_struct *work) + spin_unlock(&class->lock); + } - static void pin_tag(unsigned long handle) __acquires(bitlock) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); +- + list_for_each_entry_safe(zspage, tmp, &free_pages, list) { + list_del(&zspage->list); + lock_zspage(zspage); +@@ -2218,8 +2048,13 @@ static unsigned long __zs_compact(struct zs_pool *pool, + struct zspage *dst_zspage = NULL; + unsigned long pages_freed = 0; + ++ /* protect the race between zpage migration and zs_free */ ++ write_lock(&pool->migrate_lock); ++ /* protect zpage allocation/free */ + spin_lock(&class->lock); + while ((src_zspage = isolate_zspage(class, true))) { ++ /* protect someone accessing the zspage(i.e., zs_map_object) */ ++ migrate_write_lock(src_zspage); + + if (!zs_can_compact(class)) + break; +@@ -2228,6 +2063,8 @@ static unsigned long __zs_compact(struct zs_pool *pool, + cc.s_page = get_first_page(src_zspage); + + while ((dst_zspage = isolate_zspage(class, false))) { ++ migrate_write_lock_nested(dst_zspage); + -+ return spin_lock(&zh->lock); -+#else - bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); -+#endif - } + cc.d_page = get_first_page(dst_zspage); + /* + * If there is no more space in dst_page, resched +@@ -2237,6 +2074,10 @@ static unsigned long __zs_compact(struct zs_pool *pool, + break; + + putback_zspage(class, dst_zspage); ++ migrate_write_unlock(dst_zspage); ++ dst_zspage = NULL; ++ if (rwlock_is_contended(&pool->migrate_lock)) ++ break; + } - static void unpin_tag(unsigned long handle) __releases(bitlock) - { -+#ifdef CONFIG_PREEMPT_RT -+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + /* Stop if we couldn't find slot */ +@@ -2244,19 +2085,28 @@ static unsigned long __zs_compact(struct zs_pool *pool, + break; + + putback_zspage(class, dst_zspage); ++ migrate_write_unlock(dst_zspage); + -+ return spin_unlock(&zh->lock); -+#else - bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); -+#endif - } + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { ++ migrate_write_unlock(src_zspage); + free_zspage(pool, class, src_zspage); + pages_freed += class->pages_per_zspage; +- } ++ } else ++ migrate_write_unlock(src_zspage); + spin_unlock(&class->lock); ++ write_unlock(&pool->migrate_lock); + cond_resched(); ++ write_lock(&pool->migrate_lock); + spin_lock(&class->lock); + } - static void reset_page(struct page *page) -@@ -1274,7 +1345,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, - class = pool->size_class[class_idx]; - off = (class->size * obj_idx) & ~PAGE_MASK; +- if (src_zspage) ++ if (src_zspage) { + putback_zspage(class, src_zspage); ++ migrate_write_unlock(src_zspage); ++ } -- area = &get_cpu_var(zs_map_area); -+ local_lock(&zs_map_area.lock); -+ area = this_cpu_ptr(&zs_map_area); - area->vm_mm = mm; - if (off + class->size <= PAGE_SIZE) { - /* this object is contained entirely within a page */ -@@ -1328,7 +1400,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) + spin_unlock(&class->lock); ++ write_unlock(&pool->migrate_lock); - __zs_unmap_object(area, pages, off, class->size); - } -- put_cpu_var(zs_map_area); -+ local_unlock(&zs_map_area.lock); + return pages_freed; + } +@@ -2362,15 +2212,12 @@ struct zs_pool *zs_create_pool(const char *name) + return NULL; - migrate_read_unlock(zspage); - unpin_tag(handle); -diff --git a/net/Kconfig b/net/Kconfig -index fb13460c6dab..074472dfa94a 100644 ---- a/net/Kconfig -+++ b/net/Kconfig -@@ -294,7 +294,7 @@ config CGROUP_NET_CLASSID - - config NET_RX_BUSY_POLL - bool -- default y -+ default y if !PREEMPT_RT + init_deferred_free(pool); ++ rwlock_init(&pool->migrate_lock); + + pool->name = kstrdup(name, GFP_KERNEL); + if (!pool->name) + goto err; + +-#ifdef CONFIG_COMPACTION +- init_waitqueue_head(&pool->migration_wait); +-#endif +- + if (create_cache(pool)) + goto err; - config BQL - bool diff --git a/net/core/dev.c b/net/core/dev.c -index 91f53eeb0e79..f39077436e9d 100644 +index c4708e2487fb..daed8ce42db1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c -@@ -225,14 +225,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) - static inline void rps_lock(struct softnet_data *sd) +@@ -222,18 +222,38 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) + return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; + } + +-static inline void rps_lock(struct softnet_data *sd) ++static inline void rps_lock_irqsave(struct softnet_data *sd, ++ unsigned long *flags) { - #ifdef CONFIG_RPS +-#ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); -+ raw_spin_lock(&sd->input_pkt_queue.raw_lock); - #endif +-#endif ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_save(*flags); } - static inline void rps_unlock(struct softnet_data *sd) +-static inline void rps_unlock(struct softnet_data *sd) ++static inline void rps_lock_irq_disable(struct softnet_data *sd) { - #ifdef CONFIG_RPS +-#ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); -+ raw_spin_unlock(&sd->input_pkt_queue.raw_lock); - #endif +-#endif ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_lock_irq(&sd->input_pkt_queue.lock); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); ++} ++ ++static inline void rps_unlock_irq_restore(struct softnet_data *sd, ++ unsigned long *flags) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_restore(*flags); ++} ++ ++static inline void rps_unlock_irq_enable(struct softnet_data *sd) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_unlock_irq(&sd->input_pkt_queue.lock); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); + } + + static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, +@@ -371,12 +391,12 @@ static void list_netdevice(struct net_device *dev) + + ASSERT_RTNL(); + +- write_lock_bh(&dev_base_lock); ++ write_lock(&dev_base_lock); + list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); + netdev_name_node_add(net, dev->name_node); + hlist_add_head_rcu(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); +- write_unlock_bh(&dev_base_lock); ++ write_unlock(&dev_base_lock); + + dev_base_seq_inc(net); } +@@ -389,11 +409,11 @@ static void unlist_netdevice(struct net_device *dev) + ASSERT_RTNL(); -@@ -3042,6 +3042,7 @@ static void __netif_reschedule(struct Qdisc *q) + /* Unlink dev from the device chain */ +- write_lock_bh(&dev_base_lock); ++ write_lock(&dev_base_lock); + list_del_rcu(&dev->dev_list); + netdev_name_node_del(dev->name_node); + hlist_del_rcu(&dev->index_hlist); +- write_unlock_bh(&dev_base_lock); ++ write_unlock(&dev_base_lock); + + dev_base_seq_inc(dev_net(dev)); + } +@@ -1272,15 +1292,15 @@ int dev_change_name(struct net_device *dev, const char *newname) + + netdev_adjacent_rename_links(dev, oldname); + +- write_lock_bh(&dev_base_lock); ++ write_lock(&dev_base_lock); + netdev_name_node_del(dev->name_node); +- write_unlock_bh(&dev_base_lock); ++ write_unlock(&dev_base_lock); + + synchronize_rcu(); + +- write_lock_bh(&dev_base_lock); ++ write_lock(&dev_base_lock); + netdev_name_node_add(net, dev->name_node); +- write_unlock_bh(&dev_base_lock); ++ write_unlock(&dev_base_lock); + + ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); + ret = notifier_to_errno(ret); +@@ -3048,6 +3068,7 @@ static void __netif_reschedule(struct Qdisc *q) sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); @@ -9643,7 +11083,7 @@ index 91f53eeb0e79..f39077436e9d 100644 } void __netif_schedule(struct Qdisc *q) -@@ -3104,6 +3105,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) +@@ -3110,6 +3131,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); @@ -9651,41 +11091,80 @@ index 91f53eeb0e79..f39077436e9d 100644 } EXPORT_SYMBOL(__dev_kfree_skb_irq); -@@ -3831,7 +3833,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, +@@ -3836,8 +3858,12 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, + * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. ++ * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit ++ * and then other tasks will only enqueue packets. The packets will be ++ * sent after the qdisc owner is scheduled again. To prevent this ++ * scenario the task always serialize on the lock. */ -+#ifdef CONFIG_PREEMPT_RT -+ contended = true; -+#else - contended = qdisc_is_running(q); -+#endif +- contended = qdisc_is_running(q); ++ contended = IS_ENABLED(CONFIG_PREEMPT_RT) || qdisc_is_running(q); if (unlikely(contended)) spin_lock(&q->busylock); -@@ -4656,6 +4662,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, - rps_unlock(sd); +@@ -4640,9 +4666,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, - local_irq_restore(flags); -+ preempt_check_resched_rt(); + sd = &per_cpu(softnet_data, cpu); + +- local_irq_save(flags); +- +- rps_lock(sd); ++ rps_lock_irqsave(sd, &flags); + if (!netif_running(skb->dev)) + goto drop; + qlen = skb_queue_len(&sd->input_pkt_queue); +@@ -4651,26 +4675,30 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + enqueue: + __skb_queue_tail(&sd->input_pkt_queue, skb); + input_queue_tail_incr_save(sd, qtail); +- rps_unlock(sd); +- local_irq_restore(flags); ++ rps_unlock_irq_restore(sd, &flags); + return NET_RX_SUCCESS; + } + + /* Schedule NAPI for backlog device + * We can use non atomic operation since we own the queue lock ++ * PREEMPT_RT needs to disable interrupts here for ++ * synchronisation needed in napi_schedule. + */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); ++ + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { + if (!rps_ipi_queued(sd)) + ____napi_schedule(sd, &sd->backlog); + } ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); + goto enqueue; + } + + drop: + sd->dropped++; +- rps_unlock(sd); +- +- local_irq_restore(flags); ++ rps_unlock_irq_restore(sd, &flags); atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); -@@ -4896,7 +4903,7 @@ static int netif_rx_internal(struct sk_buff *skb) +@@ -4911,7 +4939,6 @@ static int netif_rx_internal(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); -+ migrate_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); -@@ -4906,14 +4913,14 @@ static int netif_rx_internal(struct sk_buff *skb) +@@ -4921,14 +4948,12 @@ static int netif_rx_internal(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); -+ migrate_enable(); } else #endif { @@ -9693,26 +11172,100 @@ index 91f53eeb0e79..f39077436e9d 100644 - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); -+ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); -+ put_cpu_light(); ++ ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } -@@ -4952,11 +4959,9 @@ int netif_rx_ni(struct sk_buff *skb) +@@ -4952,47 +4977,17 @@ int netif_rx(struct sk_buff *skb) + { + int ret; + ++ local_bh_disable(); + trace_netif_rx_entry(skb); - trace_netif_rx_ni_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); ++ local_bh_enable(); + + return ret; + } + EXPORT_SYMBOL(netif_rx); +-int netif_rx_ni(struct sk_buff *skb) +-{ +- int err; +- +- trace_netif_rx_ni_entry(skb); +- - preempt_disable(); -+ local_bh_disable(); - err = netif_rx_internal(skb); +- err = netif_rx_internal(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); -+ local_bh_enable(); - trace_netif_rx_ni_exit(err); +- trace_netif_rx_ni_exit(err); +- +- return err; +-} +-EXPORT_SYMBOL(netif_rx_ni); +- +-int netif_rx_any_context(struct sk_buff *skb) +-{ +- /* +- * If invoked from contexts which do not invoke bottom half +- * processing either at return from interrupt or when softrqs are +- * reenabled, use netif_rx_ni() which invokes bottomhalf processing +- * directly. +- */ +- if (in_interrupt()) +- return netif_rx(skb); +- else +- return netif_rx_ni(skb); +-} +-EXPORT_SYMBOL(netif_rx_any_context); +- + static __latent_entropy void net_tx_action(struct softirq_action *h) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); +@@ -5764,8 +5759,7 @@ static void flush_backlog(struct work_struct *work) + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); - return err; -@@ -6399,12 +6404,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) +- local_irq_disable(); +- rps_lock(sd); ++ rps_lock_irq_disable(sd); + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { + if (skb->dev->reg_state == NETREG_UNREGISTERING) { + __skb_unlink(skb, &sd->input_pkt_queue); +@@ -5773,8 +5767,7 @@ static void flush_backlog(struct work_struct *work) + input_queue_head_incr(sd); + } + } +- rps_unlock(sd); +- local_irq_enable(); ++ rps_unlock_irq_enable(sd); + + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { + if (skb->dev->reg_state == NETREG_UNREGISTERING) { +@@ -5792,16 +5785,14 @@ static bool flush_required(int cpu) + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + bool do_flush; + +- local_irq_disable(); +- rps_lock(sd); ++ rps_lock_irq_disable(sd); + + /* as insertion into process_queue happens with the rps lock held, + * process_queue access may race only with dequeue + */ + do_flush = !skb_queue_empty(&sd->input_pkt_queue) || + !skb_queue_empty_lockless(&sd->process_queue); +- rps_unlock(sd); +- local_irq_enable(); ++ rps_unlock_irq_enable(sd); + + return do_flush; + #endif +@@ -6415,12 +6406,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) sd->rps_ipi_list = NULL; local_irq_enable(); @@ -9720,493 +11273,124 @@ index 91f53eeb0e79..f39077436e9d 100644 /* Send pending IPI's to kick RPS processing on remote cpus. */ net_rps_send_ipi(remsd); - } else - #endif - local_irq_enable(); -+ preempt_check_resched_rt(); - } - - static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) -@@ -6482,6 +6489,7 @@ void __napi_schedule(struct napi_struct *n) - local_irq_save(flags); - ____napi_schedule(this_cpu_ptr(&softnet_data), n); - local_irq_restore(flags); -+ preempt_check_resched_rt(); - } - EXPORT_SYMBOL(__napi_schedule); - -@@ -11304,6 +11312,7 @@ static int dev_cpu_dead(unsigned int oldcpu) - - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_enable(); -+ preempt_check_resched_rt(); - - #ifdef CONFIG_RPS - remsd = oldsd->rps_ipi_list; -@@ -11317,7 +11326,7 @@ static int dev_cpu_dead(unsigned int oldcpu) - netif_rx_ni(skb); - input_queue_head_incr(oldsd); - } -- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { -+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { - netif_rx_ni(skb); - input_queue_head_incr(oldsd); - } -@@ -11633,7 +11642,7 @@ static int __init net_dev_init(void) - - INIT_WORK(flush, flush_backlog); - -- skb_queue_head_init(&sd->input_pkt_queue); -+ skb_queue_head_init_raw(&sd->input_pkt_queue); - skb_queue_head_init(&sd->process_queue); - #ifdef CONFIG_XFRM_OFFLOAD - skb_queue_head_init(&sd->xfrm_backlog); -diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c -index 8e582e29a41e..4fcbdd71c59f 100644 ---- a/net/core/gen_estimator.c -+++ b/net/core/gen_estimator.c -@@ -40,10 +40,10 @@ - */ - - struct net_rate_estimator { -- struct gnet_stats_basic_packed *bstats; -+ struct gnet_stats_basic_sync *bstats; - spinlock_t *stats_lock; -- seqcount_t *running; -- struct gnet_stats_basic_cpu __percpu *cpu_bstats; -+ bool running; -+ struct gnet_stats_basic_sync __percpu *cpu_bstats; - u8 ewma_log; - u8 intvl_log; /* period : (250ms << intvl_log) */ - -@@ -60,13 +60,13 @@ struct net_rate_estimator { - }; - - static void est_fetch_counters(struct net_rate_estimator *e, -- struct gnet_stats_basic_packed *b) -+ struct gnet_stats_basic_sync *b) - { -- memset(b, 0, sizeof(*b)); -+ gnet_stats_basic_sync_init(b); - if (e->stats_lock) - spin_lock(e->stats_lock); - -- __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats); -+ gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running); - - if (e->stats_lock) - spin_unlock(e->stats_lock); -@@ -76,14 +76,18 @@ static void est_fetch_counters(struct net_rate_estimator *e, - static void est_timer(struct timer_list *t) - { - struct net_rate_estimator *est = from_timer(est, t, timer); -- struct gnet_stats_basic_packed b; -+ struct gnet_stats_basic_sync b; -+ u64 b_bytes, b_packets; - u64 rate, brate; - - est_fetch_counters(est, &b); -- brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); -+ b_bytes = u64_stats_read(&b.bytes); -+ b_packets = u64_stats_read(&b.packets); -+ -+ brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log); - brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - -- rate = (b.packets - est->last_packets) << (10 - est->intvl_log); -+ rate = (b_packets - est->last_packets) << (10 - est->intvl_log); - rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); - - write_seqcount_begin(&est->seq); -@@ -91,8 +95,8 @@ static void est_timer(struct timer_list *t) - est->avpps += rate; - write_seqcount_end(&est->seq); - -- est->last_bytes = b.bytes; -- est->last_packets = b.packets; -+ est->last_bytes = b_bytes; -+ est->last_packets = b_packets; - - est->next_jiffies += ((HZ/4) << est->intvl_log); - -@@ -109,7 +113,9 @@ static void est_timer(struct timer_list *t) - * @cpu_bstats: bstats per cpu - * @rate_est: rate estimator statistics - * @lock: lock for statistics and control path -- * @running: qdisc running seqcount -+ * @running: true if @bstats represents a running qdisc, thus @bstats' -+ * internal values might change during basic reads. Only used -+ * if @bstats_cpu is NULL - * @opt: rate estimator configuration TLV - * - * Creates a new rate estimator with &bstats as source and &rate_est -@@ -121,16 +127,16 @@ static void est_timer(struct timer_list *t) - * Returns 0 on success or a negative error code. - * - */ --int gen_new_estimator(struct gnet_stats_basic_packed *bstats, -- struct gnet_stats_basic_cpu __percpu *cpu_bstats, -+int gen_new_estimator(struct gnet_stats_basic_sync *bstats, -+ struct gnet_stats_basic_sync __percpu *cpu_bstats, - struct net_rate_estimator __rcu **rate_est, - spinlock_t *lock, -- seqcount_t *running, -+ bool running, - struct nlattr *opt) - { - struct gnet_estimator *parm = nla_data(opt); - struct net_rate_estimator *old, *est; -- struct gnet_stats_basic_packed b; -+ struct gnet_stats_basic_sync b; - int intvl_log; - - if (nla_len(opt) < sizeof(*parm)) -@@ -164,8 +170,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - est_fetch_counters(est, &b); - if (lock) - local_bh_enable(); -- est->last_bytes = b.bytes; -- est->last_packets = b.packets; -+ est->last_bytes = u64_stats_read(&b.bytes); -+ est->last_packets = u64_stats_read(&b.packets); - - if (lock) - spin_lock_bh(lock); -@@ -214,7 +220,9 @@ EXPORT_SYMBOL(gen_kill_estimator); - * @cpu_bstats: bstats per cpu - * @rate_est: rate estimator statistics - * @lock: lock for statistics and control path -- * @running: qdisc running seqcount (might be NULL) -+ * @running: true if @bstats represents a running qdisc, thus @bstats' -+ * internal values might change during basic reads. Only used -+ * if @cpu_bstats is NULL - * @opt: rate estimator configuration TLV - * - * Replaces the configuration of a rate estimator by calling -@@ -222,11 +230,11 @@ EXPORT_SYMBOL(gen_kill_estimator); - * - * Returns 0 on success or a negative error code. - */ --int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, -- struct gnet_stats_basic_cpu __percpu *cpu_bstats, -+int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, -+ struct gnet_stats_basic_sync __percpu *cpu_bstats, - struct net_rate_estimator __rcu **rate_est, - spinlock_t *lock, -- seqcount_t *running, struct nlattr *opt) -+ bool running, struct nlattr *opt) - { - return gen_new_estimator(bstats, cpu_bstats, rate_est, - lock, running, opt); -diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c -index e491b083b348..a10335b4ba2d 100644 ---- a/net/core/gen_stats.c -+++ b/net/core/gen_stats.c -@@ -18,7 +18,7 @@ - #include <linux/gen_stats.h> - #include <net/netlink.h> - #include <net/gen_stats.h> -- -+#include <net/sch_generic.h> - - static inline int - gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) -@@ -114,63 +114,112 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, - } - EXPORT_SYMBOL(gnet_stats_start_copy); - --static void --__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, -- struct gnet_stats_basic_cpu __percpu *cpu) -+/* Must not be inlined, due to u64_stats seqcount_t lockdep key */ -+void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b) - { -+ u64_stats_set(&b->bytes, 0); -+ u64_stats_set(&b->packets, 0); -+ u64_stats_init(&b->syncp); -+} -+EXPORT_SYMBOL(gnet_stats_basic_sync_init); -+ -+static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, -+ struct gnet_stats_basic_sync __percpu *cpu) -+{ -+ u64 t_bytes = 0, t_packets = 0; - int i; - - for_each_possible_cpu(i) { -- struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); -+ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); - unsigned int start; - u64 bytes, packets; - - do { - start = u64_stats_fetch_begin_irq(&bcpu->syncp); -- bytes = bcpu->bstats.bytes; -- packets = bcpu->bstats.packets; -+ bytes = u64_stats_read(&bcpu->bytes); -+ packets = u64_stats_read(&bcpu->packets); - } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); - -- bstats->bytes += bytes; -- bstats->packets += packets; -+ t_bytes += bytes; -+ t_packets += packets; -+ } -+ _bstats_update(bstats, t_bytes, t_packets); -+} -+ -+void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, -+ struct gnet_stats_basic_sync __percpu *cpu, -+ struct gnet_stats_basic_sync *b, bool running) -+{ -+ unsigned int start; -+ u64 bytes = 0; -+ u64 packets = 0; -+ -+ WARN_ON_ONCE((cpu || running) && in_hardirq()); -+ -+ if (cpu) { -+ gnet_stats_add_basic_cpu(bstats, cpu); -+ return; - } -+ do { -+ if (running) -+ start = u64_stats_fetch_begin_irq(&b->syncp); -+ bytes = u64_stats_read(&b->bytes); -+ packets = u64_stats_read(&b->packets); -+ } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); -+ -+ _bstats_update(bstats, bytes, packets); - } -+EXPORT_SYMBOL(gnet_stats_add_basic); - --void --__gnet_stats_copy_basic(const seqcount_t *running, -- struct gnet_stats_basic_packed *bstats, -- struct gnet_stats_basic_cpu __percpu *cpu, -- struct gnet_stats_basic_packed *b) -+static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, -+ struct gnet_stats_basic_sync __percpu *cpu, -+ struct gnet_stats_basic_sync *b, bool running) - { -- unsigned int seq; -+ unsigned int start; - - if (cpu) { -- __gnet_stats_copy_basic_cpu(bstats, cpu); -+ u64 t_bytes = 0, t_packets = 0; -+ int i; -+ -+ for_each_possible_cpu(i) { -+ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); -+ unsigned int start; -+ u64 bytes, packets; -+ -+ do { -+ start = u64_stats_fetch_begin_irq(&bcpu->syncp); -+ bytes = u64_stats_read(&bcpu->bytes); -+ packets = u64_stats_read(&bcpu->packets); -+ } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); -+ -+ t_bytes += bytes; -+ t_packets += packets; -+ } -+ *ret_bytes = t_bytes; -+ *ret_packets = t_packets; - return; - } - do { - if (running) -- seq = read_seqcount_begin(running); -- bstats->bytes = b->bytes; -- bstats->packets = b->packets; -- } while (running && read_seqcount_retry(running, seq)); -+ start = u64_stats_fetch_begin_irq(&b->syncp); -+ *ret_bytes = u64_stats_read(&b->bytes); -+ *ret_packets = u64_stats_read(&b->packets); -+ } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); - } --EXPORT_SYMBOL(__gnet_stats_copy_basic); + } else + #endif + local_irq_enable(); ++ preempt_check_resched_rt(); + } - static int --___gnet_stats_copy_basic(const seqcount_t *running, -- struct gnet_dump *d, -- struct gnet_stats_basic_cpu __percpu *cpu, -- struct gnet_stats_basic_packed *b, -- int type) -+___gnet_stats_copy_basic(struct gnet_dump *d, -+ struct gnet_stats_basic_sync __percpu *cpu, -+ struct gnet_stats_basic_sync *b, -+ int type, bool running) - { -- struct gnet_stats_basic_packed bstats = {0}; -+ u64 bstats_bytes, bstats_packets; - -- __gnet_stats_copy_basic(running, &bstats, cpu, b); -+ gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running); - - if (d->compat_tc_stats && type == TCA_STATS_BASIC) { -- d->tc_stats.bytes = bstats.bytes; -- d->tc_stats.packets = bstats.packets; -+ d->tc_stats.bytes = bstats_bytes; -+ d->tc_stats.packets = bstats_packets; - } + static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +@@ -6460,8 +6453,7 @@ static int process_backlog(struct napi_struct *napi, int quota) - if (d->tail) { -@@ -178,24 +227,28 @@ ___gnet_stats_copy_basic(const seqcount_t *running, - int res; - - memset(&sb, 0, sizeof(sb)); -- sb.bytes = bstats.bytes; -- sb.packets = bstats.packets; -+ sb.bytes = bstats_bytes; -+ sb.packets = bstats_packets; - res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); -- if (res < 0 || sb.packets == bstats.packets) -+ if (res < 0 || sb.packets == bstats_packets) - return res; - /* emit 64bit stats only if needed */ -- return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, -- sizeof(bstats.packets), TCA_STATS_PAD); -+ return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets, -+ sizeof(bstats_packets), TCA_STATS_PAD); + } + +- local_irq_disable(); +- rps_lock(sd); ++ rps_lock_irq_disable(sd); + if (skb_queue_empty(&sd->input_pkt_queue)) { + /* + * Inline a custom version of __napi_complete(). +@@ -6477,8 +6469,7 @@ static int process_backlog(struct napi_struct *napi, int quota) + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + } +- rps_unlock(sd); +- local_irq_enable(); ++ rps_unlock_irq_enable(sd); } - return 0; + + return work; +@@ -6498,6 +6489,7 @@ void __napi_schedule(struct napi_struct *n) + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); ++ preempt_check_resched_rt(); } + EXPORT_SYMBOL(__napi_schedule); - /** - * gnet_stats_copy_basic - copy basic statistics into statistic TLV -- * @running: seqcount_t pointer - * @d: dumping handle - * @cpu: copy statistic per cpu - * @b: basic statistics -+ * @running: true if @b represents a running qdisc, thus @b's -+ * internal values might change during basic reads. -+ * Only used if @cpu is NULL -+ * -+ * Context: task; must not be run from IRQ or BH contexts - * - * Appends the basic statistics to the top level TLV created by - * gnet_stats_start_copy(). -@@ -204,22 +257,25 @@ ___gnet_stats_copy_basic(const seqcount_t *running, - * if the room in the socket buffer was not sufficient. - */ - int --gnet_stats_copy_basic(const seqcount_t *running, -- struct gnet_dump *d, -- struct gnet_stats_basic_cpu __percpu *cpu, -- struct gnet_stats_basic_packed *b) -+gnet_stats_copy_basic(struct gnet_dump *d, -+ struct gnet_stats_basic_sync __percpu *cpu, -+ struct gnet_stats_basic_sync *b, -+ bool running) - { -- return ___gnet_stats_copy_basic(running, d, cpu, b, -- TCA_STATS_BASIC); -+ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running); - } - EXPORT_SYMBOL(gnet_stats_copy_basic); +@@ -11329,6 +11321,7 @@ static int dev_cpu_dead(unsigned int oldcpu) - /** - * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV -- * @running: seqcount_t pointer - * @d: dumping handle - * @cpu: copy statistic per cpu - * @b: basic statistics -+ * @running: true if @b represents a running qdisc, thus @b's -+ * internal values might change during basic reads. -+ * Only used if @cpu is NULL -+ * -+ * Context: task; must not be run from IRQ or BH contexts - * - * Appends the basic statistics to the top level TLV created by - * gnet_stats_start_copy(). -@@ -228,13 +284,12 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); - * if the room in the socket buffer was not sufficient. - */ - int --gnet_stats_copy_basic_hw(const seqcount_t *running, -- struct gnet_dump *d, -- struct gnet_stats_basic_cpu __percpu *cpu, -- struct gnet_stats_basic_packed *b) -+gnet_stats_copy_basic_hw(struct gnet_dump *d, -+ struct gnet_stats_basic_sync __percpu *cpu, -+ struct gnet_stats_basic_sync *b, -+ bool running) - { -- return ___gnet_stats_copy_basic(running, d, cpu, b, -- TCA_STATS_BASIC_HW); -+ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running); - } - EXPORT_SYMBOL(gnet_stats_copy_basic_hw); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); ++ preempt_check_resched_rt(); + + #ifdef CONFIG_RPS + remsd = oldsd->rps_ipi_list; +diff --git a/net/core/link_watch.c b/net/core/link_watch.c +index 1a455847da54..9599afd0862d 100644 +--- a/net/core/link_watch.c ++++ b/net/core/link_watch.c +@@ -55,7 +55,7 @@ static void rfc2863_policy(struct net_device *dev) + if (operstate == dev->operstate) + return; + +- write_lock_bh(&dev_base_lock); ++ write_lock(&dev_base_lock); + + switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: +@@ -74,7 +74,7 @@ static void rfc2863_policy(struct net_device *dev) -@@ -282,16 +337,15 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, + dev->operstate = operstate; + +- write_unlock_bh(&dev_base_lock); ++ write_unlock(&dev_base_lock); } - EXPORT_SYMBOL(gnet_stats_copy_rate_est); --static void --__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, -- const struct gnet_stats_queue __percpu *q) -+static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, -+ const struct gnet_stats_queue __percpu *q) - { - int i; - for_each_possible_cpu(i) { - const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); +diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c +index 2af8aeeadadf..716be2f88cd7 100644 +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -842,9 +842,9 @@ static void set_operstate(struct net_device *dev, unsigned char transition) + } -- qstats->qlen = 0; -+ qstats->qlen += qcpu->backlog; - qstats->backlog += qcpu->backlog; - qstats->drops += qcpu->drops; - qstats->requeues += qcpu->requeues; -@@ -299,24 +353,21 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, + if (dev->operstate != operstate) { +- write_lock_bh(&dev_base_lock); ++ write_lock(&dev_base_lock); + dev->operstate = operstate; +- write_unlock_bh(&dev_base_lock); ++ write_unlock(&dev_base_lock); + netdev_state_change(dev); } } +@@ -2779,11 +2779,11 @@ static int do_setlink(const struct sk_buff *skb, + if (tb[IFLA_LINKMODE]) { + unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); + +- write_lock_bh(&dev_base_lock); ++ write_lock(&dev_base_lock); + if (dev->link_mode ^ value) + status |= DO_SETLINK_NOTIFY; + dev->link_mode = value; +- write_unlock_bh(&dev_base_lock); ++ write_unlock(&dev_base_lock); + } --void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, -- const struct gnet_stats_queue __percpu *cpu, -- const struct gnet_stats_queue *q, -- __u32 qlen) -+void gnet_stats_add_queue(struct gnet_stats_queue *qstats, -+ const struct gnet_stats_queue __percpu *cpu, -+ const struct gnet_stats_queue *q) - { - if (cpu) { -- __gnet_stats_copy_queue_cpu(qstats, cpu); -+ gnet_stats_add_queue_cpu(qstats, cpu); + if (tb[IFLA_VFINFO_LIST]) { +diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c +index 737e4f17e1c6..e57fdad9ef94 100644 +--- a/net/hsr/hsr_device.c ++++ b/net/hsr/hsr_device.c +@@ -30,13 +30,13 @@ static bool is_slave_up(struct net_device *dev) + + static void __hsr_set_operstate(struct net_device *dev, int transition) + { +- write_lock_bh(&dev_base_lock); ++ write_lock(&dev_base_lock); + if (dev->operstate != transition) { + dev->operstate = transition; +- write_unlock_bh(&dev_base_lock); ++ write_unlock(&dev_base_lock); + netdev_state_change(dev); } else { -- qstats->qlen = q->qlen; -- qstats->backlog = q->backlog; -- qstats->drops = q->drops; -- qstats->requeues = q->requeues; -- qstats->overlimits = q->overlimits; -+ qstats->qlen += q->qlen; -+ qstats->backlog += q->backlog; -+ qstats->drops += q->drops; -+ qstats->requeues += q->requeues; -+ qstats->overlimits += q->overlimits; +- write_unlock_bh(&dev_base_lock); ++ write_unlock(&dev_base_lock); } -- -- qstats->qlen = qlen; } --EXPORT_SYMBOL(__gnet_stats_copy_queue); -+EXPORT_SYMBOL(gnet_stats_add_queue); - - /** - * gnet_stats_copy_queue - copy queue statistics into statistics TLV -@@ -339,7 +390,8 @@ gnet_stats_copy_queue(struct gnet_dump *d, - { - struct gnet_stats_queue qstats = {0}; -- __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); -+ gnet_stats_add_queue(&qstats, cpu_q, q); -+ qstats.qlen = qlen; - - if (d->compat_tc_stats) { - d->tc_stats.drops = qstats.drops; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c -index 75737267746f..e460c84b1f8e 100644 +index 75737267746f..7bd1e10086f0 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -637,7 +637,9 @@ int __inet_hash(struct sock *sk, struct sock *osk) @@ -10219,7 +11403,7 @@ index 75737267746f..e460c84b1f8e 100644 return 0; } WARN_ON(!sk_unhashed(sk)); -@@ -669,11 +671,8 @@ int inet_hash(struct sock *sk) +@@ -669,45 +671,54 @@ int inet_hash(struct sock *sk) { int err = 0; @@ -10232,38 +11416,66 @@ index 75737267746f..e460c84b1f8e 100644 return err; } -@@ -684,17 +683,20 @@ void inet_unhash(struct sock *sk) - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - struct inet_listen_hashbucket *ilb = NULL; - spinlock_t *lock; -+ bool state_listen; + EXPORT_SYMBOL_GPL(inet_hash); +-void inet_unhash(struct sock *sk) ++static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) + { +- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; +- struct inet_listen_hashbucket *ilb = NULL; +- spinlock_t *lock; +- if (sk_unhashed(sk)) return; - if (sk->sk_state == TCP_LISTEN) { -+ state_listen = true; - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; +- if (sk->sk_state == TCP_LISTEN) { +- ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &ilb->lock; -+ spin_lock(&ilb->lock); - } else { -+ state_listen = false; - lock = inet_ehash_lockp(hashinfo, sk->sk_hash); -+ spin_lock_bh(lock); - } +- } else { +- lock = inet_ehash_lockp(hashinfo, sk->sk_hash); +- } - spin_lock_bh(lock); - if (sk_unhashed(sk)) - goto unlock; - -@@ -707,7 +709,10 @@ void inet_unhash(struct sock *sk) +- if (sk_unhashed(sk)) +- goto unlock; +- + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_stop_listen_sock(sk); + if (ilb) { ++ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; ++ + inet_unhash2(hashinfo, sk); + ilb->count--; + } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - unlock: +-unlock: - spin_unlock_bh(lock); -+ if (state_listen) ++} ++ ++void inet_unhash(struct sock *sk) ++{ ++ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; ++ ++ if (sk_unhashed(sk)) ++ return; ++ ++ if (sk->sk_state == TCP_LISTEN) { ++ struct inet_listen_hashbucket *ilb; ++ ++ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; ++ /* Don't disable bottom halves while acquiring the lock to ++ * avoid circular locking dependency on PREEMPT_RT. ++ */ ++ spin_lock(&ilb->lock); ++ __inet_unhash(sk, ilb); + spin_unlock(&ilb->lock); -+ else ++ } else { ++ spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); ++ ++ spin_lock_bh(lock); ++ __inet_unhash(sk, NULL); + spin_unlock_bh(lock); ++ } } EXPORT_SYMBOL_GPL(inet_unhash); @@ -10284,1137 +11496,8 @@ index 67c9114835c8..0a2e7f228391 100644 return err; } -diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c -index 0d5c422f8745..8aec1b529364 100644 ---- a/net/netfilter/xt_RATEEST.c -+++ b/net/netfilter/xt_RATEEST.c -@@ -94,11 +94,11 @@ static unsigned int - xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) - { - const struct xt_rateest_target_info *info = par->targinfo; -- struct gnet_stats_basic_packed *stats = &info->est->bstats; -+ struct gnet_stats_basic_sync *stats = &info->est->bstats; - - spin_lock_bh(&info->est->lock); -- stats->bytes += skb->len; -- stats->packets++; -+ u64_stats_add(&stats->bytes, skb->len); -+ u64_stats_inc(&stats->packets); - spin_unlock_bh(&info->est->lock); - - return XT_CONTINUE; -@@ -143,6 +143,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) - if (!est) - goto err1; - -+ gnet_stats_basic_sync_init(&est->bstats); - strlcpy(est->name, info->name, sizeof(est->name)); - spin_lock_init(&est->lock); - est->refcnt = 1; -diff --git a/net/sched/act_api.c b/net/sched/act_api.c -index 7dd3a2dc5fa4..3258da3d5bed 100644 ---- a/net/sched/act_api.c -+++ b/net/sched/act_api.c -@@ -480,16 +480,18 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, - atomic_set(&p->tcfa_bindcnt, 1); - - if (cpustats) { -- p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); -+ p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); - if (!p->cpu_bstats) - goto err1; -- p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); -+ p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); - if (!p->cpu_bstats_hw) - goto err2; - p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); - if (!p->cpu_qstats) - goto err3; - } -+ gnet_stats_basic_sync_init(&p->tcfa_bstats); -+ gnet_stats_basic_sync_init(&p->tcfa_bstats_hw); - spin_lock_init(&p->tcfa_lock); - p->tcfa_index = index; - p->tcfa_tm.install = jiffies; -@@ -499,7 +501,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, - if (est) { - err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, - &p->tcfa_rate_est, -- &p->tcfa_lock, NULL, est); -+ &p->tcfa_lock, false, est); - if (err) - goto err4; - } -@@ -1126,13 +1128,13 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, - u64 drops, bool hw) - { - if (a->cpu_bstats) { -- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); -+ _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - - this_cpu_ptr(a->cpu_qstats)->drops += drops; - - if (hw) -- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), -- bytes, packets); -+ _bstats_update(this_cpu_ptr(a->cpu_bstats_hw), -+ bytes, packets); - return; - } - -@@ -1171,9 +1173,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, - if (err < 0) - goto errout; - -- if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || -- gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw, -- &p->tcfa_bstats_hw) < 0 || -+ if (gnet_stats_copy_basic(&d, p->cpu_bstats, -+ &p->tcfa_bstats, false) < 0 || -+ gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, -+ &p->tcfa_bstats_hw, false) < 0 || - gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || - gnet_stats_copy_queue(&d, p->cpu_qstats, - &p->tcfa_qstats, -diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c -index 5c36013339e1..f2bf896331a5 100644 ---- a/net/sched/act_bpf.c -+++ b/net/sched/act_bpf.c -@@ -41,7 +41,7 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, - int action, filter_res; - - tcf_lastuse_update(&prog->tcf_tm); -- bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb); - - filter = rcu_dereference(prog->filter); - if (at_ingress) { -diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c -index 7064a365a1a9..b757f90a2d58 100644 ---- a/net/sched/act_ife.c -+++ b/net/sched/act_ife.c -@@ -718,7 +718,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, - u8 *tlv_data; - u16 metalen; - -- bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); - tcf_lastuse_update(&ife->tcf_tm); - - if (skb_at_tc_ingress(skb)) -@@ -806,7 +806,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, - exceed_mtu = true; - } - -- bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); - tcf_lastuse_update(&ife->tcf_tm); - - if (!metalen) { /* no metadata to send */ -diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c -index e4529b428cf4..8faa4c58305e 100644 ---- a/net/sched/act_mpls.c -+++ b/net/sched/act_mpls.c -@@ -59,7 +59,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, - int ret, mac_len; - - tcf_lastuse_update(&m->tcf_tm); -- bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); - - /* Ensure 'data' points at mac_header prior calling mpls manipulating - * functions. -diff --git a/net/sched/act_police.c b/net/sched/act_police.c -index 832157a840fc..9e77ba8401e5 100644 ---- a/net/sched/act_police.c -+++ b/net/sched/act_police.c -@@ -125,7 +125,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, - police->common.cpu_bstats, - &police->tcf_rate_est, - &police->tcf_lock, -- NULL, est); -+ false, est); - if (err) - goto failure; - } else if (tb[TCA_POLICE_AVRATE] && -@@ -248,7 +248,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, - int ret; - - tcf_lastuse_update(&police->tcf_tm); -- bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); - - ret = READ_ONCE(police->tcf_action); - p = rcu_dereference_bh(police->params); -diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c -index 230501eb9e06..ce859b0e0deb 100644 ---- a/net/sched/act_sample.c -+++ b/net/sched/act_sample.c -@@ -163,7 +163,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, - int retval; - - tcf_lastuse_update(&s->tcf_tm); -- bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb); - retval = READ_ONCE(s->tcf_action); - - psample_group = rcu_dereference_bh(s->psample_group); -diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c -index cbbe1861d3a2..e617ab4505ca 100644 ---- a/net/sched/act_simple.c -+++ b/net/sched/act_simple.c -@@ -36,7 +36,8 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, - * then it would look like "hello_3" (without quotes) - */ - pr_info("simple: %s_%llu\n", -- (char *)d->tcfd_defdata, d->tcf_bstats.packets); -+ (char *)d->tcfd_defdata, -+ u64_stats_read(&d->tcf_bstats.packets)); - spin_unlock(&d->tcf_lock); - return d->tcf_action; - } -diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c -index 605418538347..d30ecbfc8f84 100644 ---- a/net/sched/act_skbedit.c -+++ b/net/sched/act_skbedit.c -@@ -31,7 +31,7 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, - int action; - - tcf_lastuse_update(&d->tcf_tm); -- bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); - - params = rcu_dereference_bh(d->params); - action = READ_ONCE(d->tcf_action); -diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c -index ecb9ee666095..9b6b52c5e24e 100644 ---- a/net/sched/act_skbmod.c -+++ b/net/sched/act_skbmod.c -@@ -31,7 +31,7 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, - u64 flags; - - tcf_lastuse_update(&d->tcf_tm); -- bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); -+ bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); - - action = READ_ONCE(d->tcf_action); - if (unlikely(action == TC_ACT_SHOT)) -diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c -index 12f39a2dffd4..ad0bdefb3205 100644 ---- a/net/sched/sch_api.c -+++ b/net/sched/sch_api.c -@@ -884,7 +884,7 @@ static void qdisc_offload_graft_root(struct net_device *dev, - static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - u32 portid, u32 seq, u16 flags, int event) - { -- struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; -+ struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; - struct gnet_stats_queue __percpu *cpu_qstats = NULL; - struct tcmsg *tcm; - struct nlmsghdr *nlh; -@@ -942,8 +942,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - cpu_qstats = q->cpu_qstats; - } - -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), -- &d, cpu_bstats, &q->bstats) < 0 || -+ if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || - gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || - gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) - goto nla_put_failure; -@@ -1264,26 +1263,17 @@ static struct Qdisc *qdisc_create(struct net_device *dev, - rcu_assign_pointer(sch->stab, stab); - } - if (tca[TCA_RATE]) { -- seqcount_t *running; -- - err = -EOPNOTSUPP; - if (sch->flags & TCQ_F_MQROOT) { - NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); - goto err_out4; - } - -- if (sch->parent != TC_H_ROOT && -- !(sch->flags & TCQ_F_INGRESS) && -- (!p || !(p->flags & TCQ_F_MQROOT))) -- running = qdisc_root_sleeping_running(sch); -- else -- running = &sch->running; -- - err = gen_new_estimator(&sch->bstats, - sch->cpu_bstats, - &sch->rate_est, - NULL, -- running, -+ true, - tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); -@@ -1359,7 +1349,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, - sch->cpu_bstats, - &sch->rate_est, - NULL, -- qdisc_root_sleeping_running(sch), -+ true, - tca[TCA_RATE]); - } - out: -diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c -index 7d8518176b45..4c8e994cf0a5 100644 ---- a/net/sched/sch_atm.c -+++ b/net/sched/sch_atm.c -@@ -52,7 +52,7 @@ struct atm_flow_data { - struct atm_qdisc_data *parent; /* parent qdisc */ - struct socket *sock; /* for closing */ - int ref; /* reference count */ -- struct gnet_stats_basic_packed bstats; -+ struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - struct list_head list; - struct atm_flow_data *excess; /* flow for excess traffic; -@@ -548,6 +548,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, - pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); - INIT_LIST_HEAD(&p->flows); - INIT_LIST_HEAD(&p->link.list); -+ gnet_stats_basic_sync_init(&p->link.bstats); - list_add(&p->link.list, &p->flows); - p->link.q = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, sch->handle, extack); -@@ -652,8 +653,7 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, - { - struct atm_flow_data *flow = (struct atm_flow_data *)arg; - -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), -- d, NULL, &flow->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, NULL, &flow->bstats, true) < 0 || - gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) - return -1; - -diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c -index e0da15530f0e..02d9f0dfe356 100644 ---- a/net/sched/sch_cbq.c -+++ b/net/sched/sch_cbq.c -@@ -116,7 +116,7 @@ struct cbq_class { - long avgidle; - long deficit; /* Saved deficit for WRR */ - psched_time_t penalized; -- struct gnet_stats_basic_packed bstats; -+ struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - struct net_rate_estimator __rcu *rate_est; - struct tc_cbq_xstats xstats; -@@ -565,8 +565,7 @@ cbq_update(struct cbq_sched_data *q) - long avgidle = cl->avgidle; - long idle; - -- cl->bstats.packets++; -- cl->bstats.bytes += len; -+ _bstats_update(&cl->bstats, len, 1); - - /* - * (now - last) is total time between packet right edges. -@@ -1384,8 +1383,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, - if (cl->undertime != PSCHED_PASTPERFECT) - cl->xstats.undertime = cl->undertime - q->now; - -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), -- d, NULL, &cl->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) - return -1; -@@ -1519,7 +1517,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t - err = gen_replace_estimator(&cl->bstats, NULL, - &cl->rate_est, - NULL, -- qdisc_root_sleeping_running(sch), -+ true, - tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); -@@ -1611,6 +1609,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t - if (cl == NULL) - goto failure; - -+ gnet_stats_basic_sync_init(&cl->bstats); - err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); - if (err) { - kfree(cl); -@@ -1619,9 +1618,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t - - if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, -- NULL, -- qdisc_root_sleeping_running(sch), -- tca[TCA_RATE]); -+ NULL, true, tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); - tcf_block_put(cl->block); -diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c -index 642cd179b7a7..18e4f7a0b291 100644 ---- a/net/sched/sch_drr.c -+++ b/net/sched/sch_drr.c -@@ -19,7 +19,7 @@ struct drr_class { - struct Qdisc_class_common common; - unsigned int filter_cnt; - -- struct gnet_stats_basic_packed bstats; -+ struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - struct net_rate_estimator __rcu *rate_est; - struct list_head alist; -@@ -85,8 +85,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, NULL, - &cl->rate_est, -- NULL, -- qdisc_root_sleeping_running(sch), -+ NULL, true, - tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Failed to replace estimator"); -@@ -106,6 +105,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - if (cl == NULL) - return -ENOBUFS; - -+ gnet_stats_basic_sync_init(&cl->bstats); - cl->common.classid = classid; - cl->quantum = quantum; - cl->qdisc = qdisc_create_dflt(sch->dev_queue, -@@ -118,9 +118,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - - if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, -- NULL, -- qdisc_root_sleeping_running(sch), -- tca[TCA_RATE]); -+ NULL, true, tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Failed to replace estimator"); - qdisc_put(cl->qdisc); -@@ -267,8 +265,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, - if (qlen) - xstats.deficit = cl->deficit; - -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), -- d, NULL, &cl->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) - return -1; -diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c -index 92a686807971..e007fc75ef2f 100644 ---- a/net/sched/sch_ets.c -+++ b/net/sched/sch_ets.c -@@ -41,7 +41,7 @@ struct ets_class { - struct Qdisc *qdisc; - u32 quantum; - u32 deficit; -- struct gnet_stats_basic_packed bstats; -+ struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - }; - -@@ -325,8 +325,7 @@ static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, - struct ets_class *cl = ets_class_from_arg(sch, arg); - struct Qdisc *cl_q = cl->qdisc; - -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), -- d, NULL, &cl_q->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 || - qdisc_qstats_copy(d, cl_q) < 0) - return -1; - -@@ -661,7 +660,6 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, - - q->nbands = nbands; - for (i = nstrict; i < q->nstrict; i++) { -- INIT_LIST_HEAD(&q->classes[i].alist); - if (q->classes[i].qdisc->q.qlen) { - list_add_tail(&q->classes[i].alist, &q->active); - q->classes[i].deficit = quanta[i]; -@@ -689,7 +687,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, - ets_offload_change(sch); - for (i = q->nbands; i < oldbands; i++) { - qdisc_put(q->classes[i].qdisc); -- memset(&q->classes[i], 0, sizeof(q->classes[i])); -+ q->classes[i].qdisc = NULL; -+ q->classes[i].quantum = 0; -+ q->classes[i].deficit = 0; -+ gnet_stats_basic_sync_init(&q->classes[i].bstats); -+ memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); - } - return 0; - } -@@ -698,7 +700,7 @@ static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) - { - struct ets_sched *q = qdisc_priv(sch); -- int err; -+ int err, i; - - if (!opt) - return -EINVAL; -@@ -708,6 +710,9 @@ static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, - return err; - - INIT_LIST_HEAD(&q->active); -+ for (i = 0; i < TCQ_ETS_MAX_BANDS; i++) -+ INIT_LIST_HEAD(&q->classes[i].alist); -+ - return ets_qdisc_change(sch, opt, extack); - } - -diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c -index 66d2fbe9ef50..2b9c1a42dca8 100644 ---- a/net/sched/sch_generic.c -+++ b/net/sched/sch_generic.c -@@ -304,8 +304,8 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, - - /* - * Transmit possibly several skbs, and handle the return status as -- * required. Owning running seqcount bit guarantees that -- * only one CPU can execute this function. -+ * required. Owning qdisc running bit guarantees that only one CPU -+ * can execute this function. - * - * Returns to the caller: - * false - hardware queue frozen backoff -@@ -606,7 +606,6 @@ struct Qdisc noop_qdisc = { - .ops = &noop_qdisc_ops, - .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), - .dev_queue = &noop_netdev_queue, -- .running = SEQCNT_ZERO(noop_qdisc.running), - .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), - .gso_skb = { - .next = (struct sk_buff *)&noop_qdisc.gso_skb, -@@ -867,7 +866,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { - EXPORT_SYMBOL(pfifo_fast_ops); - - static struct lock_class_key qdisc_tx_busylock; --static struct lock_class_key qdisc_running_key; - - struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - const struct Qdisc_ops *ops, -@@ -892,11 +890,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - __skb_queue_head_init(&sch->gso_skb); - __skb_queue_head_init(&sch->skb_bad_txq); - qdisc_skb_head_init(&sch->q); -+ gnet_stats_basic_sync_init(&sch->bstats); - spin_lock_init(&sch->q.lock); - - if (ops->static_flags & TCQ_F_CPUSTATS) { - sch->cpu_bstats = -- netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); -+ netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); - if (!sch->cpu_bstats) - goto errout1; - -@@ -916,10 +915,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - lockdep_set_class(&sch->seqlock, - dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - -- seqcount_init(&sch->running); -- lockdep_set_class(&sch->running, -- dev->qdisc_running_key ?: &qdisc_running_key); -- - sch->ops = ops; - sch->flags = ops->static_flags; - sch->enqueue = ops->enqueue; -diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c -index 621dc6afde8f..1073c76d05c4 100644 ---- a/net/sched/sch_gred.c -+++ b/net/sched/sch_gred.c -@@ -56,6 +56,7 @@ struct gred_sched { - u32 DPs; - u32 def; - struct red_vars wred_set; -+ struct tc_gred_qopt_offload *opt; - }; - - static inline int gred_wred_mode(struct gred_sched *table) -@@ -311,48 +312,50 @@ static void gred_offload(struct Qdisc *sch, enum tc_gred_command command) - { - struct gred_sched *table = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); -- struct tc_gred_qopt_offload opt = { -- .command = command, -- .handle = sch->handle, -- .parent = sch->parent, -- }; -+ struct tc_gred_qopt_offload *opt = table->opt; - - if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) - return; - -+ memset(opt, 0, sizeof(*opt)); -+ opt->command = command; -+ opt->handle = sch->handle; -+ opt->parent = sch->parent; -+ - if (command == TC_GRED_REPLACE) { - unsigned int i; - -- opt.set.grio_on = gred_rio_mode(table); -- opt.set.wred_on = gred_wred_mode(table); -- opt.set.dp_cnt = table->DPs; -- opt.set.dp_def = table->def; -+ opt->set.grio_on = gred_rio_mode(table); -+ opt->set.wred_on = gred_wred_mode(table); -+ opt->set.dp_cnt = table->DPs; -+ opt->set.dp_def = table->def; - - for (i = 0; i < table->DPs; i++) { - struct gred_sched_data *q = table->tab[i]; - - if (!q) - continue; -- opt.set.tab[i].present = true; -- opt.set.tab[i].limit = q->limit; -- opt.set.tab[i].prio = q->prio; -- opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; -- opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; -- opt.set.tab[i].is_ecn = gred_use_ecn(q); -- opt.set.tab[i].is_harddrop = gred_use_harddrop(q); -- opt.set.tab[i].probability = q->parms.max_P; -- opt.set.tab[i].backlog = &q->backlog; -+ opt->set.tab[i].present = true; -+ opt->set.tab[i].limit = q->limit; -+ opt->set.tab[i].prio = q->prio; -+ opt->set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; -+ opt->set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; -+ opt->set.tab[i].is_ecn = gred_use_ecn(q); -+ opt->set.tab[i].is_harddrop = gred_use_harddrop(q); -+ opt->set.tab[i].probability = q->parms.max_P; -+ opt->set.tab[i].backlog = &q->backlog; - } -- opt.set.qstats = &sch->qstats; -+ opt->set.qstats = &sch->qstats; - } - -- dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt); -+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, opt); - } - - static int gred_offload_dump_stats(struct Qdisc *sch) - { - struct gred_sched *table = qdisc_priv(sch); - struct tc_gred_qopt_offload *hw_stats; -+ u64 bytes = 0, packets = 0; - unsigned int i; - int ret; - -@@ -364,9 +367,11 @@ static int gred_offload_dump_stats(struct Qdisc *sch) - hw_stats->handle = sch->handle; - hw_stats->parent = sch->parent; - -- for (i = 0; i < MAX_DPs; i++) -+ for (i = 0; i < MAX_DPs; i++) { -+ gnet_stats_basic_sync_init(&hw_stats->stats.bstats[i]); - if (table->tab[i]) - hw_stats->stats.xstats[i] = &table->tab[i]->stats; -+ } - - ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); - /* Even if driver returns failure adjust the stats - in case offload -@@ -375,19 +380,19 @@ static int gred_offload_dump_stats(struct Qdisc *sch) - for (i = 0; i < MAX_DPs; i++) { - if (!table->tab[i]) - continue; -- table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets; -- table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes; -+ table->tab[i]->packetsin += u64_stats_read(&hw_stats->stats.bstats[i].packets); -+ table->tab[i]->bytesin += u64_stats_read(&hw_stats->stats.bstats[i].bytes); - table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; - -- _bstats_update(&sch->bstats, -- hw_stats->stats.bstats[i].bytes, -- hw_stats->stats.bstats[i].packets); -+ bytes += u64_stats_read(&hw_stats->stats.bstats[i].bytes); -+ packets += u64_stats_read(&hw_stats->stats.bstats[i].packets); - sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; - sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; - sch->qstats.drops += hw_stats->stats.qstats[i].drops; - sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; - sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; - } -+ _bstats_update(&sch->bstats, bytes, packets); - - kfree(hw_stats); - return ret; -@@ -728,6 +733,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, - static int gred_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) - { -+ struct gred_sched *table = qdisc_priv(sch); - struct nlattr *tb[TCA_GRED_MAX + 1]; - int err; - -@@ -751,6 +757,12 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, - sch->limit = qdisc_dev(sch)->tx_queue_len - * psched_mtu(qdisc_dev(sch)); - -+ if (qdisc_dev(sch)->netdev_ops->ndo_setup_tc) { -+ table->opt = kzalloc(sizeof(*table->opt), GFP_KERNEL); -+ if (!table->opt) -+ return -ENOMEM; -+ } -+ - return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); - } - -@@ -907,6 +919,7 @@ static void gred_destroy(struct Qdisc *sch) - gred_destroy_vq(table->tab[i]); - } - gred_offload(sch, TC_GRED_DESTROY); -+ kfree(table->opt); - } - - static struct Qdisc_ops gred_qdisc_ops __read_mostly = { -diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c -index b7ac30cca035..d3979a6000e7 100644 ---- a/net/sched/sch_hfsc.c -+++ b/net/sched/sch_hfsc.c -@@ -111,7 +111,7 @@ enum hfsc_class_flags { - struct hfsc_class { - struct Qdisc_class_common cl_common; - -- struct gnet_stats_basic_packed bstats; -+ struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - struct net_rate_estimator __rcu *rate_est; - struct tcf_proto __rcu *filter_list; /* filter list */ -@@ -965,7 +965,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - err = gen_replace_estimator(&cl->bstats, NULL, - &cl->rate_est, - NULL, -- qdisc_root_sleeping_running(sch), -+ true, - tca[TCA_RATE]); - if (err) - return err; -@@ -1033,9 +1033,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - - if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, -- NULL, -- qdisc_root_sleeping_running(sch), -- tca[TCA_RATE]); -+ NULL, true, tca[TCA_RATE]); - if (err) { - tcf_block_put(cl->block); - kfree(cl); -@@ -1328,7 +1326,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, - xstats.work = cl->cl_total; - xstats.rtwork = cl->cl_cumul; - -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) - return -1; -@@ -1406,6 +1404,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt, - if (err) - return err; - -+ gnet_stats_basic_sync_init(&q->root.bstats); - q->root.cl_common.classid = sch->handle; - q->root.sched = q; - q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, -diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c -index 5067a6e5d4fd..cf1d45db4e84 100644 ---- a/net/sched/sch_htb.c -+++ b/net/sched/sch_htb.c -@@ -113,8 +113,8 @@ struct htb_class { - /* - * Written often fields - */ -- struct gnet_stats_basic_packed bstats; -- struct gnet_stats_basic_packed bstats_bias; -+ struct gnet_stats_basic_sync bstats; -+ struct gnet_stats_basic_sync bstats_bias; - struct tc_htb_xstats xstats; /* our special stats */ - - /* token bucket parameters */ -@@ -1308,10 +1308,11 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, - static void htb_offload_aggregate_stats(struct htb_sched *q, - struct htb_class *cl) - { -+ u64 bytes = 0, packets = 0; - struct htb_class *c; - unsigned int i; - -- memset(&cl->bstats, 0, sizeof(cl->bstats)); -+ gnet_stats_basic_sync_init(&cl->bstats); - - for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) { -@@ -1323,14 +1324,15 @@ static void htb_offload_aggregate_stats(struct htb_sched *q, - if (p != cl) - continue; - -- cl->bstats.bytes += c->bstats_bias.bytes; -- cl->bstats.packets += c->bstats_bias.packets; -+ bytes += u64_stats_read(&c->bstats_bias.bytes); -+ packets += u64_stats_read(&c->bstats_bias.packets); - if (c->level == 0) { -- cl->bstats.bytes += c->leaf.q->bstats.bytes; -- cl->bstats.packets += c->leaf.q->bstats.packets; -+ bytes += u64_stats_read(&c->leaf.q->bstats.bytes); -+ packets += u64_stats_read(&c->leaf.q->bstats.packets); - } - } - } -+ _bstats_update(&cl->bstats, bytes, packets); - } - - static int -@@ -1357,16 +1359,16 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) - if (cl->leaf.q) - cl->bstats = cl->leaf.q->bstats; - else -- memset(&cl->bstats, 0, sizeof(cl->bstats)); -- cl->bstats.bytes += cl->bstats_bias.bytes; -- cl->bstats.packets += cl->bstats_bias.packets; -+ gnet_stats_basic_sync_init(&cl->bstats); -+ _bstats_update(&cl->bstats, -+ u64_stats_read(&cl->bstats_bias.bytes), -+ u64_stats_read(&cl->bstats_bias.packets)); - } else { - htb_offload_aggregate_stats(q, cl); - } - } - -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), -- d, NULL, &cl->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) - return -1; -@@ -1578,8 +1580,9 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, - WARN_ON(old != q); - - if (cl->parent) { -- cl->parent->bstats_bias.bytes += q->bstats.bytes; -- cl->parent->bstats_bias.packets += q->bstats.packets; -+ _bstats_update(&cl->parent->bstats_bias, -+ u64_stats_read(&q->bstats.bytes), -+ u64_stats_read(&q->bstats.packets)); - } - - offload_opt = (struct tc_htb_qopt_offload) { -@@ -1849,6 +1852,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, - if (!cl) - goto failure; - -+ gnet_stats_basic_sync_init(&cl->bstats); -+ gnet_stats_basic_sync_init(&cl->bstats_bias); -+ - err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); - if (err) { - kfree(cl); -@@ -1858,7 +1864,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, - err = gen_new_estimator(&cl->bstats, NULL, - &cl->rate_est, - NULL, -- qdisc_root_sleeping_running(sch), -+ true, - tca[TCA_RATE] ? : &est.nla); - if (err) - goto err_block_put; -@@ -1922,8 +1928,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, - htb_graft_helper(dev_queue, old_q); - goto err_kill_estimator; - } -- parent->bstats_bias.bytes += old_q->bstats.bytes; -- parent->bstats_bias.packets += old_q->bstats.packets; -+ _bstats_update(&parent->bstats_bias, -+ u64_stats_read(&old_q->bstats.bytes), -+ u64_stats_read(&old_q->bstats.packets)); - qdisc_put(old_q); - } - new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, -@@ -1983,7 +1990,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, - err = gen_replace_estimator(&cl->bstats, NULL, - &cl->rate_est, - NULL, -- qdisc_root_sleeping_running(sch), -+ true, - tca[TCA_RATE]); - if (err) - return err; -diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c -index db18d8a860f9..24c5d97d88dd 100644 ---- a/net/sched/sch_mq.c -+++ b/net/sched/sch_mq.c -@@ -153,10 +153,9 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) - struct net_device *dev = qdisc_dev(sch); - struct Qdisc *qdisc; - unsigned int ntx; -- __u32 qlen = 0; - - sch->q.qlen = 0; -- memset(&sch->bstats, 0, sizeof(sch->bstats)); -+ gnet_stats_basic_sync_init(&sch->bstats); - memset(&sch->qstats, 0, sizeof(sch->qstats)); - - /* MQ supports lockless qdiscs. However, statistics accounting needs -@@ -168,25 +167,11 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) - qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; - spin_lock_bh(qdisc_lock(qdisc)); - -- if (qdisc_is_percpu_stats(qdisc)) { -- qlen = qdisc_qlen_sum(qdisc); -- __gnet_stats_copy_basic(NULL, &sch->bstats, -- qdisc->cpu_bstats, -- &qdisc->bstats); -- __gnet_stats_copy_queue(&sch->qstats, -- qdisc->cpu_qstats, -- &qdisc->qstats, qlen); -- sch->q.qlen += qlen; -- } else { -- sch->q.qlen += qdisc->q.qlen; -- sch->bstats.bytes += qdisc->bstats.bytes; -- sch->bstats.packets += qdisc->bstats.packets; -- sch->qstats.qlen += qdisc->qstats.qlen; -- sch->qstats.backlog += qdisc->qstats.backlog; -- sch->qstats.drops += qdisc->qstats.drops; -- sch->qstats.requeues += qdisc->qstats.requeues; -- sch->qstats.overlimits += qdisc->qstats.overlimits; -- } -+ gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, -+ &qdisc->bstats, false); -+ gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, -+ &qdisc->qstats); -+ sch->q.qlen += qdisc_qlen(qdisc); - - spin_unlock_bh(qdisc_lock(qdisc)); - } -@@ -269,8 +254,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, - struct netdev_queue *dev_queue = mq_queue_get(sch, cl); - - sch = dev_queue->qdisc_sleeping; -- if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, -- &sch->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || - qdisc_qstats_copy(d, sch) < 0) - return -1; - return 0; -diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c -index 50e15add6068..42d4101e4f3d 100644 ---- a/net/sched/sch_mqprio.c -+++ b/net/sched/sch_mqprio.c -@@ -412,7 +412,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) - unsigned int ntx, tc; - - sch->q.qlen = 0; -- memset(&sch->bstats, 0, sizeof(sch->bstats)); -+ gnet_stats_basic_sync_init(&sch->bstats); - memset(&sch->qstats, 0, sizeof(sch->qstats)); - - /* MQ supports lockless qdiscs. However, statistics accounting needs -@@ -424,25 +424,11 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) - qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; - spin_lock_bh(qdisc_lock(qdisc)); - -- if (qdisc_is_percpu_stats(qdisc)) { -- __u32 qlen = qdisc_qlen_sum(qdisc); -- -- __gnet_stats_copy_basic(NULL, &sch->bstats, -- qdisc->cpu_bstats, -- &qdisc->bstats); -- __gnet_stats_copy_queue(&sch->qstats, -- qdisc->cpu_qstats, -- &qdisc->qstats, qlen); -- sch->q.qlen += qlen; -- } else { -- sch->q.qlen += qdisc->q.qlen; -- sch->bstats.bytes += qdisc->bstats.bytes; -- sch->bstats.packets += qdisc->bstats.packets; -- sch->qstats.backlog += qdisc->qstats.backlog; -- sch->qstats.drops += qdisc->qstats.drops; -- sch->qstats.requeues += qdisc->qstats.requeues; -- sch->qstats.overlimits += qdisc->qstats.overlimits; -- } -+ gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, -+ &qdisc->bstats, false); -+ gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, -+ &qdisc->qstats); -+ sch->q.qlen += qdisc_qlen(qdisc); - - spin_unlock_bh(qdisc_lock(qdisc)); - } -@@ -534,12 +520,13 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, - { - if (cl >= TC_H_MIN_PRIORITY) { - int i; -- __u32 qlen = 0; -+ __u32 qlen; - struct gnet_stats_queue qstats = {0}; -- struct gnet_stats_basic_packed bstats = {0}; -+ struct gnet_stats_basic_sync bstats; - struct net_device *dev = qdisc_dev(sch); - struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; - -+ gnet_stats_basic_sync_init(&bstats); - /* Drop lock here it will be reclaimed before touching - * statistics this is required because the d->lock we - * hold here is the look on dev_queue->qdisc_sleeping -@@ -554,40 +541,28 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, - - spin_lock_bh(qdisc_lock(qdisc)); - -- if (qdisc_is_percpu_stats(qdisc)) { -- qlen = qdisc_qlen_sum(qdisc); -- -- __gnet_stats_copy_basic(NULL, &bstats, -- qdisc->cpu_bstats, -- &qdisc->bstats); -- __gnet_stats_copy_queue(&qstats, -- qdisc->cpu_qstats, -- &qdisc->qstats, -- qlen); -- } else { -- qlen += qdisc->q.qlen; -- bstats.bytes += qdisc->bstats.bytes; -- bstats.packets += qdisc->bstats.packets; -- qstats.backlog += qdisc->qstats.backlog; -- qstats.drops += qdisc->qstats.drops; -- qstats.requeues += qdisc->qstats.requeues; -- qstats.overlimits += qdisc->qstats.overlimits; -- } -+ gnet_stats_add_basic(&bstats, qdisc->cpu_bstats, -+ &qdisc->bstats, false); -+ gnet_stats_add_queue(&qstats, qdisc->cpu_qstats, -+ &qdisc->qstats); -+ sch->q.qlen += qdisc_qlen(qdisc); -+ - spin_unlock_bh(qdisc_lock(qdisc)); - } -+ qlen = qdisc_qlen(sch) + qstats.qlen; - - /* Reclaim root sleeping lock before completing stats */ - if (d->lock) - spin_lock_bh(d->lock); -- if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 || -+ if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0 || - gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) - return -1; - } else { - struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); - - sch = dev_queue->qdisc_sleeping; -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, -- sch->cpu_bstats, &sch->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, sch->cpu_bstats, -+ &sch->bstats, true) < 0 || - qdisc_qstats_copy(d, sch) < 0) - return -1; - } -diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c -index e282e7382117..cd8ab90c4765 100644 ---- a/net/sched/sch_multiq.c -+++ b/net/sched/sch_multiq.c -@@ -338,8 +338,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, - struct Qdisc *cl_q; - - cl_q = q->queues[cl - 1]; -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), -- d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || - qdisc_qstats_copy(d, cl_q) < 0) - return -1; - -diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c -index 03fdf31ccb6a..3b8d7197c06b 100644 ---- a/net/sched/sch_prio.c -+++ b/net/sched/sch_prio.c -@@ -361,8 +361,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, - struct Qdisc *cl_q; - - cl_q = q->queues[cl - 1]; -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), -- d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, -+ &cl_q->bstats, true) < 0 || - qdisc_qstats_copy(d, cl_q) < 0) - return -1; - -diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c -index 58a9d42b52b8..0b7f9ba28deb 100644 ---- a/net/sched/sch_qfq.c -+++ b/net/sched/sch_qfq.c -@@ -131,7 +131,7 @@ struct qfq_class { - - unsigned int filter_cnt; - -- struct gnet_stats_basic_packed bstats; -+ struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - struct net_rate_estimator __rcu *rate_est; - struct Qdisc *qdisc; -@@ -451,7 +451,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - err = gen_replace_estimator(&cl->bstats, NULL, - &cl->rate_est, - NULL, -- qdisc_root_sleeping_running(sch), -+ true, - tca[TCA_RATE]); - if (err) - return err; -@@ -465,6 +465,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - if (cl == NULL) - return -ENOBUFS; - -+ gnet_stats_basic_sync_init(&cl->bstats); - cl->common.classid = classid; - cl->deficit = lmax; - -@@ -477,7 +478,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - err = gen_new_estimator(&cl->bstats, NULL, - &cl->rate_est, - NULL, -- qdisc_root_sleeping_running(sch), -+ true, - tca[TCA_RATE]); - if (err) - goto destroy_class; -@@ -639,8 +640,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, - xstats.weight = cl->agg->class_weight; - xstats.lmax = cl->agg->lmax; - -- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), -- d, NULL, &cl->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - qdisc_qstats_copy(d, cl->qdisc) < 0) - return -1; -@@ -1234,8 +1234,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, - return err; - } - -- cl->bstats.bytes += len; -- cl->bstats.packets += gso_segs; -+ _bstats_update(&cl->bstats, len, gso_segs); - sch->qstats.backlog += len; - ++sch->q.qlen; - -diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c -index a66398fb2d6d..377f896bdedc 100644 ---- a/net/sched/sch_taprio.c -+++ b/net/sched/sch_taprio.c -@@ -1984,7 +1984,7 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); - - sch = dev_queue->qdisc_sleeping; -- if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || -+ if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 || - qdisc_qstats_copy(d, sch) < 0) - return -1; - return 0; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c -index 6316bd2b8f37..dfc9d12caef8 100644 +index 1e99ba1b9d72..9b20e4d6bfe4 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -441,7 +441,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) @@ -11435,202 +11518,6 @@ index 6316bd2b8f37..dfc9d12caef8 100644 trace_svc_xprt_do_enqueue(xprt, rqstp); } EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); -diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c -index 5a90aa527877..642d0748c169 100644 ---- a/samples/kfifo/bytestream-example.c -+++ b/samples/kfifo/bytestream-example.c -@@ -22,10 +22,10 @@ - #define PROC_FIFO "bytestream-fifo" - - /* lock for procfs read access */ --static DEFINE_MUTEX(read_lock); -+static DEFINE_MUTEX(read_access); - - /* lock for procfs write access */ --static DEFINE_MUTEX(write_lock); -+static DEFINE_MUTEX(write_access); - - /* - * define DYNAMIC in this example for a dynamically allocated fifo. -@@ -116,12 +116,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, - int ret; - unsigned int copied; - -- if (mutex_lock_interruptible(&write_lock)) -+ if (mutex_lock_interruptible(&write_access)) - return -ERESTARTSYS; - - ret = kfifo_from_user(&test, buf, count, &copied); - -- mutex_unlock(&write_lock); -+ mutex_unlock(&write_access); - if (ret) - return ret; - -@@ -134,12 +134,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, - int ret; - unsigned int copied; - -- if (mutex_lock_interruptible(&read_lock)) -+ if (mutex_lock_interruptible(&read_access)) - return -ERESTARTSYS; - - ret = kfifo_to_user(&test, buf, count, &copied); - -- mutex_unlock(&read_lock); -+ mutex_unlock(&read_access); - if (ret) - return ret; - -diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c -index e5403d8c971a..c61482ba94f4 100644 ---- a/samples/kfifo/inttype-example.c -+++ b/samples/kfifo/inttype-example.c -@@ -22,10 +22,10 @@ - #define PROC_FIFO "int-fifo" - - /* lock for procfs read access */ --static DEFINE_MUTEX(read_lock); -+static DEFINE_MUTEX(read_access); - - /* lock for procfs write access */ --static DEFINE_MUTEX(write_lock); -+static DEFINE_MUTEX(write_access); - - /* - * define DYNAMIC in this example for a dynamically allocated fifo. -@@ -109,12 +109,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, - int ret; - unsigned int copied; - -- if (mutex_lock_interruptible(&write_lock)) -+ if (mutex_lock_interruptible(&write_access)) - return -ERESTARTSYS; - - ret = kfifo_from_user(&test, buf, count, &copied); - -- mutex_unlock(&write_lock); -+ mutex_unlock(&write_access); - if (ret) - return ret; - -@@ -127,12 +127,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, - int ret; - unsigned int copied; - -- if (mutex_lock_interruptible(&read_lock)) -+ if (mutex_lock_interruptible(&read_access)) - return -ERESTARTSYS; - - ret = kfifo_to_user(&test, buf, count, &copied); - -- mutex_unlock(&read_lock); -+ mutex_unlock(&read_access); - if (ret) - return ret; - -diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c -index f64f3d62d6c2..e4087b2d3fc4 100644 ---- a/samples/kfifo/record-example.c -+++ b/samples/kfifo/record-example.c -@@ -22,10 +22,10 @@ - #define PROC_FIFO "record-fifo" - - /* lock for procfs read access */ --static DEFINE_MUTEX(read_lock); -+static DEFINE_MUTEX(read_access); - - /* lock for procfs write access */ --static DEFINE_MUTEX(write_lock); -+static DEFINE_MUTEX(write_access); - - /* - * define DYNAMIC in this example for a dynamically allocated fifo. -@@ -123,12 +123,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, - int ret; - unsigned int copied; - -- if (mutex_lock_interruptible(&write_lock)) -+ if (mutex_lock_interruptible(&write_access)) - return -ERESTARTSYS; - - ret = kfifo_from_user(&test, buf, count, &copied); - -- mutex_unlock(&write_lock); -+ mutex_unlock(&write_access); - if (ret) - return ret; - -@@ -141,12 +141,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, - int ret; - unsigned int copied; - -- if (mutex_lock_interruptible(&read_lock)) -+ if (mutex_lock_interruptible(&read_access)) - return -ERESTARTSYS; - - ret = kfifo_to_user(&test, buf, count, &copied); - -- mutex_unlock(&read_lock); -+ mutex_unlock(&read_access); - if (ret) - return ret; - -diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c -index 21a0e7c3b8de..e7dd316da551 100644 ---- a/security/smack/smack_lsm.c -+++ b/security/smack/smack_lsm.c -@@ -51,8 +51,10 @@ - #define SMK_RECEIVING 1 - #define SMK_SENDING 2 - -+#ifdef SMACK_IPV6_PORT_LABELING - static DEFINE_MUTEX(smack_ipv6_lock); - static LIST_HEAD(smk_ipv6_port_list); -+#endif - struct kmem_cache *smack_rule_cache; - int smack_enabled __initdata; - -@@ -2603,7 +2605,6 @@ static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address) - mutex_unlock(&smack_ipv6_lock); - return; - } --#endif - - /** - * smk_ipv6_port_check - check Smack port access -@@ -2666,6 +2667,7 @@ static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address, - - return smk_ipv6_check(skp, object, address, act); - } -+#endif - - /** - * smack_inode_setsecurity - set smack xattrs -@@ -2852,8 +2854,9 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap, - rc = smk_ipv6_check(ssp->smk_out, rsp, sip, - SMK_CONNECTING); - } -- if (__is_defined(SMACK_IPV6_PORT_LABELING)) -- rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING); -+#ifdef SMACK_IPV6_PORT_LABELING -+ rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING); -+#endif - - return rc; - } -diff --git a/sound/soc/mediatek/common/mtk-afe-fe-dai.c b/sound/soc/mediatek/common/mtk-afe-fe-dai.c -index e95c7c018e7d..4f2c2379531b 100644 ---- a/sound/soc/mediatek/common/mtk-afe-fe-dai.c -+++ b/sound/soc/mediatek/common/mtk-afe-fe-dai.c -@@ -288,7 +288,6 @@ const struct snd_soc_dai_ops mtk_afe_fe_ops = { - }; - EXPORT_SYMBOL_GPL(mtk_afe_fe_ops); - --static DEFINE_MUTEX(irqs_lock); - int mtk_dynamic_irq_acquire(struct mtk_base_afe *afe) - { - int i; -- 2.30.2 diff --git a/patches/soc/ti/beagleboard_dtbs/0001-Add-BeagleBoard.org-Device-Tree-Changes.patch b/patches/soc/ti/beagleboard_dtbs/0001-Add-BeagleBoard.org-Device-Tree-Changes.patch index ff017cbf7..38e97090a 100644 --- a/patches/soc/ti/beagleboard_dtbs/0001-Add-BeagleBoard.org-Device-Tree-Changes.patch +++ b/patches/soc/ti/beagleboard_dtbs/0001-Add-BeagleBoard.org-Device-Tree-Changes.patch @@ -1,6 +1,6 @@ -From 347366afcfb36f7ba0b5f0d4cb1a55f81bb8439e Mon Sep 17 00:00:00 2001 +From 3be1a61a6e5ce784c29e0b89e4053ba6494d86a8 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Mon, 10 Jan 2022 15:17:52 -0600 +Date: Tue, 25 Jan 2022 09:33:07 -0600 Subject: [PATCH] Add BeagleBoard.org Device Tree Changes https://github.com/beagleboard/BeagleBoard-DeviceTrees/tree/v5.16.x diff --git a/patches/wireless_regdb/0001-Add-wireless-regdb-regulatory-database-file.patch b/patches/wireless_regdb/0001-Add-wireless-regdb-regulatory-database-file.patch index 9489224f7..5091d4d17 100644 --- a/patches/wireless_regdb/0001-Add-wireless-regdb-regulatory-database-file.patch +++ b/patches/wireless_regdb/0001-Add-wireless-regdb-regulatory-database-file.patch @@ -1,6 +1,6 @@ -From 34d1cf39e5943f8840730ece14b337c94b8dd455 Mon Sep 17 00:00:00 2001 +From 4b48e2028eaf39c5088b0ba2ce8c910d897314dc Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Mon, 10 Jan 2022 15:17:27 -0600 +Date: Tue, 25 Jan 2022 09:32:42 -0600 Subject: [PATCH] Add wireless-regdb regulatory database file https://git.kernel.org/pub/scm/linux/kernel/git/sforshee/wireless-regdb.git/commit/?id=2ce78ed90f71955f7b223c17b5cda6c8a7708efe diff --git a/patches/wpanusb/0001-merge-wpanusb-https-github.com-statropy-wpanusb.patch b/patches/wpanusb/0001-merge-wpanusb-https-github.com-statropy-wpanusb.patch index d5f84dce0..adcc2aabf 100644 --- a/patches/wpanusb/0001-merge-wpanusb-https-github.com-statropy-wpanusb.patch +++ b/patches/wpanusb/0001-merge-wpanusb-https-github.com-statropy-wpanusb.patch @@ -1,6 +1,6 @@ -From f4ef195954278e9107b9a7cd114991daee91b22b Mon Sep 17 00:00:00 2001 +From 0381e77e70e9a9714bcdc5da3b8e5d43bc0d0246 Mon Sep 17 00:00:00 2001 From: Robert Nelson <robertcnelson@gmail.com> -Date: Mon, 10 Jan 2022 15:00:23 -0600 +Date: Tue, 25 Jan 2022 09:29:52 -0600 Subject: [PATCH] merge: wpanusb: https://github.com/statropy/wpanusb https://github.com/statropy/wpanusb/commit/251f0167545bf2dcaa3cad991a59dbf5ab05490a diff --git a/version.sh b/version.sh index 2331d1556..72b09d456 100644 --- a/version.sh +++ b/version.sh @@ -38,10 +38,10 @@ toolchain="gcc_11_arm" #Kernel KERNEL_REL=5.16 -KERNEL_TAG=${KERNEL_REL} -kernel_rt=".X-rtY" +KERNEL_TAG=${KERNEL_REL}.2 +kernel_rt=".2-rt19" #Kernel Build -BUILD=${build_prefix}10 +BUILD=${build_prefix}10.1 #v5.X-rcX + upto SHA #prev_KERNEL_SHA="" -- GitLab