Patches contributed by Eötvös Lorand University
commit c68ed7945701a38f2121ed74e23ff19c2052b4c2
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Sep 8 15:17:57 2021 -0700
mm/vmstat: protect per cpu variables with preempt disable on RT
Disable preemption on -RT for the vmstat code. On vanila the code runs in
IRQ-off regions while on -RT it may not when stats are updated under a
local_lock. "preempt_disable" ensures that the same resources is not
updated in parallel due to preemption.
This patch differs from the preempt-rt version where __count_vm_event and
__count_vm_events are also protected. The counters are explicitly
"allowed to be to be racy" so there is no need to protect them from
preemption. Only the accurate page stats that are updated by a
read-modify-write need protection. This patch also differs in that a
preempt_[en|dis]able_rt helper is not used. As vmstat is the only user of
the helper, it was suggested that it be open-coded in vmstat.c instead of
risking the helper being used in unnecessary contexts.
Link: https://lkml.kernel.org/r/20210805160019.1137-2-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0885a34197b7..8ce2620344b2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -319,6 +319,16 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
long x;
long t;
+ /*
+ * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
+ * atomicity is provided by IRQs being disabled -- either explicitly
+ * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
+ * CPU migrations and preemption potentially corrupts a counter so
+ * disable preemption.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -328,6 +338,9 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
x = 0;
}
__this_cpu_write(*p, x);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -350,6 +363,10 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
delta >>= PAGE_SHIFT;
}
+ /* See __mod_node_page_state */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -359,6 +376,9 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
x = 0;
}
__this_cpu_write(*p, x);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
}
EXPORT_SYMBOL(__mod_node_page_state);
@@ -391,6 +411,10 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ /* See __mod_node_page_state */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -399,6 +423,9 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v + overstep, zone, item);
__this_cpu_write(*p, -overstep);
}
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -409,6 +436,10 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+ /* See __mod_node_page_state */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -417,6 +448,9 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
node_page_state_add(v + overstep, pgdat, item);
__this_cpu_write(*p, -overstep);
}
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -437,6 +471,10 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ /* See __mod_node_page_state */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -445,6 +483,9 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v - overstep, zone, item);
__this_cpu_write(*p, overstep);
}
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -455,6 +496,10 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+ /* See __mod_node_page_state */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -463,6 +508,9 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
node_page_state_add(v - overstep, pgdat, item);
__this_cpu_write(*p, overstep);
}
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
commit b34fbaa9289328c7aec67d2b8b8b7d02bc61c67d
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Jul 22 10:51:50 2018 -0400
random: remove preempt disabled region
No need to keep preemption disabled across the whole function.
mix_pool_bytes() uses a spin_lock() to protect the pool and there are
other places like write_pool() whhich invoke mix_pool_bytes() without
disabling preemption.
credit_entropy_bits() is invoked from other places like
add_hwgenerator_randomness() without disabling preemption.
Before commit 95b709b6be49 ("random: drop trickle mode") the function
used __this_cpu_inc_return() which would require disabled preemption.
The preempt_disable() section was added in commit 43d5d3018c37 ("[PATCH]
random driver preempt robustness", history tree). It was claimed that
the code relied on "vt_ioctl() being called under BKL".
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: enhance the commit message]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
diff --git a/drivers/char/random.c b/drivers/char/random.c
index f4013b8a711b..561082d46a82 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1131,8 +1131,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
} sample;
long delta, delta2, delta3;
- preempt_disable();
-
sample.jiffies = jiffies;
sample.cycles = random_get_entropy();
sample.num = num;
@@ -1170,8 +1168,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
* and limit entropy entimate to 12 bits.
*/
credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
-
- preempt_enable();
}
void add_input_randomness(unsigned int type, unsigned int code,
commit af3e0fcf78879f718c5f73df0814951bd7057d34
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed May 2 13:30:57 2018 +0200
8139too: Use disable_irq_nosync() in rtl8139_poll_controller()
Use disable_irq_nosync() instead of disable_irq() as this might be
called in atomic context with netpoll.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
index d24b47b8e0b2..d118da5a10a2 100644
--- a/drivers/net/ethernet/realtek/8139too.c
+++ b/drivers/net/ethernet/realtek/8139too.c
@@ -2224,7 +2224,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
struct rtl8139_private *tp = netdev_priv(dev);
const int irq = tp->pci_dev->irq;
- disable_irq(irq);
+ disable_irq_nosync(irq);
rtl8139_interrupt(irq, dev);
enable_irq(irq);
}
commit ebade5e833eda30f648d9bf4954f7d47a920b60f
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Mar 7 14:58:39 2014 +0100
serial: 8250: Clean up the locking for -rt
In -RT the spin_lock_irqsave() does not spin but sleep if the lock is
taken. Before that, local_irq_save() is invoked which disables
interrupts even on -RT. Therefore local_irq_save() + spin_lock() does not
work.
In the ->sysrq and oops_in_progress case it is save to trylock the lock
i.e. this is what we do now anyway except for ->sysrq where we assume
that the lock is already taken.
The spin_lock_irqsave() grabs the lock and disables the interrupts on
vanilla (the same behavior) and on -RT it won't disable interrupts.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: add a patch description]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index 747073b8c38a..81f909c2101f 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -2886,14 +2886,10 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count)
touch_nmi_watchdog();
- local_irq_save(flags);
- if (port->sysrq) {
- /* serial8250_handle_irq() already took the lock */
- locked = 0;
- } else if (oops_in_progress) {
- locked = spin_trylock(&port->lock);
- } else
- spin_lock(&port->lock);
+ if (port->sysrq || oops_in_progress)
+ locked = spin_trylock_irqsave(&port->lock, flags);
+ else
+ spin_lock_irqsave(&port->lock, flags);
/*
* First save the IER then disable the interrupts
@@ -2925,8 +2921,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count)
serial8250_modem_status(up);
if (locked)
- spin_unlock(&port->lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&port->lock, flags);
}
static int __init serial8250_console_setup(struct console *co, char *options)
commit c321f7d7c87cdc623c87845f6378620573e57512
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Feb 14 15:32:20 2014 +0100
drivers/net: tulip_remove_one needs to call pci_disable_device()
Otherwise the device is not completely shut down.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c
index add05f14b38b..1642de78aac8 100644
--- a/drivers/net/ethernet/dec/tulip/tulip_core.c
+++ b/drivers/net/ethernet/dec/tulip/tulip_core.c
@@ -1939,6 +1939,7 @@ static void tulip_remove_one(struct pci_dev *pdev)
pci_iounmap(pdev, tp->base_addr);
free_netdev (dev);
pci_release_regions (pdev);
+ pci_disable_device(pdev);
/* pci_power_off (pdev, -1); */
}
commit 5042afe7fe32390e79910ecd0a1f0563d4bca38c
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri Jul 3 08:29:30 2009 -0500
generic: Use raw local irq variant for generic cmpxchg
The interrupt disabled region is extremly tiny and therefor not
latency relevant. Avoid cluttering the traces with those pointless
entries.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
diff --git a/include/asm-generic/cmpxchg-local.h b/include/asm-generic/cmpxchg-local.h
index 2533fddd34a6..d8d4c898c1bb 100644
--- a/include/asm-generic/cmpxchg-local.h
+++ b/include/asm-generic/cmpxchg-local.h
@@ -21,7 +21,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr,
if (size == 8 && sizeof(unsigned long) != 8)
wrong_size_cmpxchg(ptr);
- local_irq_save(flags);
+ raw_local_irq_save(flags);
switch (size) {
case 1: prev = *(u8 *)ptr;
if (prev == old)
@@ -42,7 +42,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr,
default:
wrong_size_cmpxchg(ptr);
}
- local_irq_restore(flags);
+ raw_local_irq_restore(flags);
return prev;
}
@@ -55,11 +55,11 @@ static inline u64 __cmpxchg64_local_generic(volatile void *ptr,
u64 prev;
unsigned long flags;
- local_irq_save(flags);
+ raw_local_irq_save(flags);
prev = *(u64 *)ptr;
if (prev == old)
*(u64 *)ptr = new;
- local_irq_restore(flags);
+ raw_local_irq_restore(flags);
return prev;
}
commit 8ed92e51f99c2199c64cb33b4ba95ab12940a94c
Author: Ingo Molnar <mingo@elte.hu>
Date: Sun Oct 14 14:28:50 2012 +0200
sched: Add WAKEUP_PREEMPTION feature flag, on by default
As per the recent discussion with Mike and Linus, make it easier to
test with/without this feature. No change in default behavior.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-izoxq4haeg4mTognnDbwcevt@git.kernel.org
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..f936552b3db1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2907,7 +2907,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
* Batch and idle tasks do not preempt non-idle tasks (their preemption
* is driven by the tick):
*/
- if (unlikely(p->policy != SCHED_NORMAL))
+ if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
return;
find_matching_se(&se, &pse);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..e68e69ab917d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -31,6 +31,11 @@ SCHED_FEAT(LAST_BUDDY, true)
*/
SCHED_FEAT(CACHE_HOT_BUDDY, true)
+/*
+ * Allow wakeup-time preemption of the current task:
+ */
+SCHED_FEAT(WAKEUP_PREEMPTION, true)
+
/*
* Use arch dependent cpu power functions
*/
commit 89c5bd08df5841326abbf167d136bcf14cf759ed
Merge: 762ad8a53303 eae7a755ee81
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Mar 14 18:49:05 2012 +0100
Merge tag 'perf-urgent-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/urgent
Some corner case fixes.
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
commit eae7a755ee81129370c8f555b0d5672e6673735d
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Mar 14 12:42:34 2012 -0300
perf tools, x86: Build perf on older user-space as well
On ancient systems I get this build failure:
util/../../../arch/x86/include/asm/unistd.h:67:29: error: asm/unistd_64.h: No such file or directory
In file included from util/cache.h:7,
from builtin-test.c:8:
util/../perf.h: In function ‘sys_perf_event_open’:In file included from util/../perf.h:16
perf.h:170: error: ‘__NR_perf_event_open’ undeclared (first use in this function)
The reason is that this old system does not have the split
unistd.h headers yet, from which to pick up the syscall
definitions.
Add the syscall numbers to the already existing i386 and x86_64
blocks in perf.h, and also provide empty include file stubs.
With this patch perf builds and works fine on 5 years old
user-space as well.
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Link: http://lkml.kernel.org/n/tip-jctwg64le1w47tuaoeyftsg9@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 7c12650165ae..8a4b9bccf8b2 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -249,6 +249,8 @@ LIB_H += util/include/asm/uaccess.h
LIB_H += util/include/dwarf-regs.h
LIB_H += util/include/asm/dwarf2.h
LIB_H += util/include/asm/cpufeature.h
+LIB_H += util/include/asm/unistd_32.h
+LIB_H += util/include/asm/unistd_64.h
LIB_H += perf.h
LIB_H += util/annotate.h
LIB_H += util/cache.h
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 16e7d20eee83..3afa39ac1d40 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -10,6 +10,9 @@ void get_term_dimensions(struct winsize *ws);
#define rmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
#define cpu_relax() asm volatile("rep; nop" ::: "memory");
#define CPUINFO_PROC "model name"
+#ifndef __NR_perf_event_open
+# define __NR_perf_event_open 336
+#endif
#endif
#if defined(__x86_64__)
@@ -17,6 +20,9 @@ void get_term_dimensions(struct winsize *ws);
#define rmb() asm volatile("lfence" ::: "memory")
#define cpu_relax() asm volatile("rep; nop" ::: "memory");
#define CPUINFO_PROC "model name"
+#ifndef __NR_perf_event_open
+# define __NR_perf_event_open 298
+#endif
#endif
#ifdef __powerpc__
diff --git a/tools/perf/util/include/asm/unistd_32.h b/tools/perf/util/include/asm/unistd_32.h
new file mode 100644
index 000000000000..8b137891791f
--- /dev/null
+++ b/tools/perf/util/include/asm/unistd_32.h
@@ -0,0 +1 @@
+
diff --git a/tools/perf/util/include/asm/unistd_64.h b/tools/perf/util/include/asm/unistd_64.h
new file mode 100644
index 000000000000..8b137891791f
--- /dev/null
+++ b/tools/perf/util/include/asm/unistd_64.h
@@ -0,0 +1 @@
+
commit c96a9876696d30783ad54399351a0bf3660db53f
Merge: d1f42e314c9c fde7d9049e55
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Mar 14 09:48:16 2012 +0100
Merge tag 'v3.3-rc7' into x86/platform
Merge reason: Update to the almost-final v3.3 kernel.
Signed-off-by: Ingo Molnar <mingo@elte.hu>