Patches contributed by Eötvös Loránd University


commit 28e7d82d6325734c7ecd1f223b892701f7a8c171
Merge: c99dbbe9f8f6 cef30b3a84e1
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jan 19 17:12:20 2009 +0100

    Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/travis/linux-2.6-cpus4096-for-ingo into cpus4096

commit 7890ba8c87604ca4c2c73f7de846bf5305d743e4
Merge: 99937d6455ce b2b062b81633
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jan 19 12:36:09 2009 +0100

    Merge branch 'stackprotector' into core/percpu

commit 4092762aebfe55c1f8e31440b80a053c2dbe519b
Merge: 745b1626dd71 1de9e8e70f5a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Jan 18 20:15:05 2009 +0100

    Merge branch 'tracing/ftrace'; commit 'v2.6.29-rc2' into tracing/core

commit 5662a2f8e7313f78d6b17ab383f3e4f04971c335
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Jan 18 19:37:21 2009 +0100

    x86, rdc321x: remove/move leftover files
    
    Impact: cleanup
    
    Move/remove leftover RDC321x files. Now that it's not a subarch anymore,
    arch/x86/mach-rdc321x and arch/x86/include/asm/mach-rdc321x/ are not
    needed.
    
    One include file was still in use: rdc321x_defs.h; move that to the
    generic x86 asm header directory.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/mach-rdc321x/gpio.h b/arch/x86/include/asm/mach-rdc321x/gpio.h
deleted file mode 100644
index c210ab5788b0..000000000000
--- a/arch/x86/include/asm/mach-rdc321x/gpio.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef _ASM_X86_MACH_RDC321X_GPIO_H
-#define _ASM_X86_MACH_RDC321X_GPIO_H
-
-#include <linux/kernel.h>
-
-extern int rdc_gpio_get_value(unsigned gpio);
-extern void rdc_gpio_set_value(unsigned gpio, int value);
-extern int rdc_gpio_direction_input(unsigned gpio);
-extern int rdc_gpio_direction_output(unsigned gpio, int value);
-extern int rdc_gpio_request(unsigned gpio, const char *label);
-extern void rdc_gpio_free(unsigned gpio);
-extern void __init rdc321x_gpio_setup(void);
-
-/* Wrappers for the arch-neutral GPIO API */
-
-static inline int gpio_request(unsigned gpio, const char *label)
-{
-	return rdc_gpio_request(gpio, label);
-}
-
-static inline void gpio_free(unsigned gpio)
-{
-	might_sleep();
-	rdc_gpio_free(gpio);
-}
-
-static inline int gpio_direction_input(unsigned gpio)
-{
-	return rdc_gpio_direction_input(gpio);
-}
-
-static inline int gpio_direction_output(unsigned gpio, int value)
-{
-	return rdc_gpio_direction_output(gpio, value);
-}
-
-static inline int gpio_get_value(unsigned gpio)
-{
-	return rdc_gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned gpio, int value)
-{
-	rdc_gpio_set_value(gpio, value);
-}
-
-static inline int gpio_to_irq(unsigned gpio)
-{
-	return gpio;
-}
-
-static inline int irq_to_gpio(unsigned irq)
-{
-	return irq;
-}
-
-/* For cansleep */
-#include <asm-generic/gpio.h>
-
-#endif /* _ASM_X86_MACH_RDC321X_GPIO_H */
diff --git a/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h b/arch/x86/include/asm/rdc321x_defs.h
similarity index 100%
rename from arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
rename to arch/x86/include/asm/rdc321x_defs.h
diff --git a/arch/x86/mach-rdc321x/Makefile b/arch/x86/mach-rdc321x/Makefile
deleted file mode 100644
index 8325b4ca431c..000000000000
--- a/arch/x86/mach-rdc321x/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for the RDC321x specific parts of the kernel
-#
-obj-$(CONFIG_X86_RDC321X)        := gpio.o platform.o
-
diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c
deleted file mode 100644
index 247f33d3a407..000000000000
--- a/arch/x86/mach-rdc321x/gpio.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- *  GPIO support for RDC SoC R3210/R8610
- *
- *  Copyright (C) 2007, Florian Fainelli <florian@openwrt.org>
- *  Copyright (C) 2008, Volker Weiss <dev@tintuc.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-
-#include <linux/spinlock.h>
-#include <linux/io.h>
-#include <linux/types.h>
-#include <linux/module.h>
-
-#include <asm/gpio.h>
-#include <asm/mach-rdc321x/rdc321x_defs.h>
-
-
-/* spin lock to protect our private copy of GPIO data register plus
-   the access to PCI conf registers. */
-static DEFINE_SPINLOCK(gpio_lock);
-
-/* copy of GPIO data registers */
-static u32 gpio_data_reg1;
-static u32 gpio_data_reg2;
-
-static u32 gpio_request_data[2];
-
-
-static inline void rdc321x_conf_write(unsigned addr, u32 value)
-{
-	outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
-	outl(value, RDC3210_CFGREG_DATA);
-}
-
-static inline void rdc321x_conf_or(unsigned addr, u32 value)
-{
-	outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
-	value |= inl(RDC3210_CFGREG_DATA);
-	outl(value, RDC3210_CFGREG_DATA);
-}
-
-static inline u32 rdc321x_conf_read(unsigned addr)
-{
-	outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
-
-	return inl(RDC3210_CFGREG_DATA);
-}
-
-/* configure pin as GPIO */
-static void rdc321x_configure_gpio(unsigned gpio)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&gpio_lock, flags);
-	rdc321x_conf_or(gpio < 32
-		? RDC321X_GPIO_CTRL_REG1 : RDC321X_GPIO_CTRL_REG2,
-		1 << (gpio & 0x1f));
-	spin_unlock_irqrestore(&gpio_lock, flags);
-}
-
-/* initially setup the 2 copies of the gpio data registers.
-   This function must be called by the platform setup code. */
-void __init rdc321x_gpio_setup()
-{
-	/* this might not be, what others (BIOS, bootloader, etc.)
-	   wrote to these registers before, but it's a good guess. Still
-	   better than just using 0xffffffff. */
-
-	gpio_data_reg1 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG1);
-	gpio_data_reg2 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG2);
-}
-
-/* determine, if gpio number is valid */
-static inline int rdc321x_is_gpio(unsigned gpio)
-{
-	return gpio <= RDC321X_MAX_GPIO;
-}
-
-/* request GPIO */
-int rdc_gpio_request(unsigned gpio, const char *label)
-{
-	unsigned long flags;
-
-	if (!rdc321x_is_gpio(gpio))
-		return -EINVAL;
-
-	spin_lock_irqsave(&gpio_lock, flags);
-	if (gpio_request_data[(gpio & 0x20) ? 1 : 0] & (1 << (gpio & 0x1f)))
-		goto inuse;
-	gpio_request_data[(gpio & 0x20) ? 1 : 0] |= (1 << (gpio & 0x1f));
-	spin_unlock_irqrestore(&gpio_lock, flags);
-
-	return 0;
-inuse:
-	spin_unlock_irqrestore(&gpio_lock, flags);
-	return -EINVAL;
-}
-EXPORT_SYMBOL(rdc_gpio_request);
-
-/* release previously-claimed GPIO */
-void rdc_gpio_free(unsigned gpio)
-{
-	unsigned long flags;
-
-	if (!rdc321x_is_gpio(gpio))
-		return;
-
-	spin_lock_irqsave(&gpio_lock, flags);
-	gpio_request_data[(gpio & 0x20) ? 1 : 0] &= ~(1 << (gpio & 0x1f));
-	spin_unlock_irqrestore(&gpio_lock, flags);
-}
-EXPORT_SYMBOL(rdc_gpio_free);
-
-/* read GPIO pin */
-int rdc_gpio_get_value(unsigned gpio)
-{
-	u32 reg;
-	unsigned long flags;
-
-	spin_lock_irqsave(&gpio_lock, flags);
-	reg = rdc321x_conf_read(gpio < 32
-		? RDC321X_GPIO_DATA_REG1 : RDC321X_GPIO_DATA_REG2);
-	spin_unlock_irqrestore(&gpio_lock, flags);
-
-	return (1 << (gpio & 0x1f)) & reg ? 1 : 0;
-}
-EXPORT_SYMBOL(rdc_gpio_get_value);
-
-/* set GPIO pin to value */
-void rdc_gpio_set_value(unsigned gpio, int value)
-{
-	unsigned long flags;
-	u32 reg;
-
-	reg = 1 << (gpio & 0x1f);
-	if (gpio < 32) {
-		spin_lock_irqsave(&gpio_lock, flags);
-		if (value)
-			gpio_data_reg1 |= reg;
-		else
-			gpio_data_reg1 &= ~reg;
-		rdc321x_conf_write(RDC321X_GPIO_DATA_REG1, gpio_data_reg1);
-		spin_unlock_irqrestore(&gpio_lock, flags);
-	} else {
-		spin_lock_irqsave(&gpio_lock, flags);
-		if (value)
-			gpio_data_reg2 |= reg;
-		else
-			gpio_data_reg2 &= ~reg;
-		rdc321x_conf_write(RDC321X_GPIO_DATA_REG2, gpio_data_reg2);
-		spin_unlock_irqrestore(&gpio_lock, flags);
-	}
-}
-EXPORT_SYMBOL(rdc_gpio_set_value);
-
-/* configure GPIO pin as input */
-int rdc_gpio_direction_input(unsigned gpio)
-{
-	if (!rdc321x_is_gpio(gpio))
-		return -EINVAL;
-
-	rdc321x_configure_gpio(gpio);
-
-	return 0;
-}
-EXPORT_SYMBOL(rdc_gpio_direction_input);
-
-/* configure GPIO pin as output and set value */
-int rdc_gpio_direction_output(unsigned gpio, int value)
-{
-	if (!rdc321x_is_gpio(gpio))
-		return -EINVAL;
-
-	gpio_set_value(gpio, value);
-	rdc321x_configure_gpio(gpio);
-
-	return 0;
-}
-EXPORT_SYMBOL(rdc_gpio_direction_output);
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
deleted file mode 100644
index 4f4e50c3ad3b..000000000000
--- a/arch/x86/mach-rdc321x/platform.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- *  Generic RDC321x platform devices
- *
- *  Copyright (C) 2007 Florian Fainelli <florian@openwrt.org>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version 2
- *  of the License, or (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the
- *  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- *  Boston, MA  02110-1301, USA.
- *
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/leds.h>
-
-#include <asm/gpio.h>
-
-/* LEDS */
-static struct gpio_led default_leds[] = {
-	{ .name = "rdc:dmz", .gpio = 1, },
-};
-
-static struct gpio_led_platform_data rdc321x_led_data = {
-	.num_leds = ARRAY_SIZE(default_leds),
-	.leds = default_leds,
-};
-
-static struct platform_device rdc321x_leds = {
-	.name = "leds-gpio",
-	.id = -1,
-	.dev = {
-		.platform_data = &rdc321x_led_data,
-	}
-};
-
-/* Watchdog */
-static struct platform_device rdc321x_wdt = {
-	.name = "rdc321x-wdt",
-	.id = -1,
-	.num_resources = 0,
-};
-
-static struct platform_device *rdc321x_devs[] = {
-	&rdc321x_leds,
-	&rdc321x_wdt
-};
-
-static int __init rdc_board_setup(void)
-{
-	rdc321x_gpio_setup();
-
-	return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs));
-}
-
-arch_initcall(rdc_board_setup);
diff --git a/drivers/watchdog/rdc321x_wdt.c b/drivers/watchdog/rdc321x_wdt.c
index bf92802f2bbe..36e221beedcd 100644
--- a/drivers/watchdog/rdc321x_wdt.c
+++ b/drivers/watchdog/rdc321x_wdt.c
@@ -37,7 +37,7 @@
 #include <linux/io.h>
 #include <linux/uaccess.h>
 
-#include <asm/mach-rdc321x/rdc321x_defs.h>
+#include <asm/rdc321x_defs.h>
 
 #define RDC_WDT_MASK	0x80000000 /* Mask */
 #define RDC_WDT_EN	0x00800000 /* Enable bit */

commit b2b062b8163391c42b3219d466ca1ac9742b9c7b
Merge: a9de18eb761f 99937d6455ce
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Jan 18 18:37:14 2009 +0100

    Merge branch 'core/percpu' into stackprotector
    
    Conflicts:
            arch/x86/include/asm/pda.h
            arch/x86/include/asm/system.h
    
    Also, moved include/asm-x86/stackprotector.h to arch/x86/include/asm.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --cc arch/x86/include/asm/pda.h
index 3fea2fdb3302,c31ca048a901..5976cd803e9a
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@@ -9,114 -11,30 +11,28 @@@
  
  /* Per processor datastructure. %gs points to it while the kernel runs */
  struct x8664_pda {
- 	struct task_struct *pcurrent;	/* 0  Current process */
- 	unsigned long data_offset;	/* 8 Per cpu data offset from linker
- 					   address */
- 	unsigned long kernelstack;	/* 16 top of kernel stack for current */
- 	unsigned long oldrsp;		/* 24 user rsp for system call */
- 	int irqcount;			/* 32 Irq nesting counter. Starts -1 */
- 	unsigned int cpunumber;		/* 36 Logical CPU number */
+ 	unsigned long unused1;
+ 	unsigned long unused2;
+ 	unsigned long unused3;
+ 	unsigned long unused4;
+ 	int unused5;
+ 	unsigned int unused6;		/* 36 was cpunumber */
 -#ifdef CONFIG_CC_STACKPROTECTOR
  	unsigned long stack_canary;	/* 40 stack canary value */
  					/* gcc-ABI: this canary MUST be at
  					   offset 40!!! */
- 	char *irqstackptr;
- 	short nodenumber;		/* number of current node (32k max) */
 -#endif
  	short in_bootmem;		/* pda lives in bootmem */
- 	unsigned int __softirq_pending;
- 	unsigned int __nmi_count;	/* number of NMI on this CPUs */
- 	short mmu_state;
- 	short isidle;
- 	struct mm_struct *active_mm;
- 	unsigned apic_timer_irqs;
- 	unsigned irq0_irqs;
- 	unsigned irq_resched_count;
- 	unsigned irq_call_count;
- 	unsigned irq_tlb_count;
- 	unsigned irq_thermal_count;
- 	unsigned irq_threshold_count;
- 	unsigned irq_spurious_count;
  } ____cacheline_aligned_in_smp;
  
- extern struct x8664_pda **_cpu_pda;
+ DECLARE_PER_CPU(struct x8664_pda, __pda);
  extern void pda_init(int);
  
- #define cpu_pda(i) (_cpu_pda[i])
+ #define cpu_pda(cpu)		(&per_cpu(__pda, cpu))
  
- /*
-  * There is no fast way to get the base address of the PDA, all the accesses
-  * have to mention %fs/%gs.  So it needs to be done this Torvaldian way.
-  */
- extern void __bad_pda_field(void) __attribute__((noreturn));
- 
- /*
-  * proxy_pda doesn't actually exist, but tell gcc it is accessed for
-  * all PDA accesses so it gets read/write dependencies right.
-  */
- extern struct x8664_pda _proxy_pda;
- 
- #define pda_offset(field) offsetof(struct x8664_pda, field)
- 
- #define pda_to_op(op, field, val)					\
- do {									\
- 	typedef typeof(_proxy_pda.field) T__;				\
- 	if (0) { T__ tmp__; tmp__ = (val); }	/* type checking */	\
- 	switch (sizeof(_proxy_pda.field)) {				\
- 	case 2:								\
- 		asm(op "w %1,%%gs:%c2" :				\
- 		    "+m" (_proxy_pda.field) :				\
- 		    "ri" ((T__)val),					\
- 		    "i"(pda_offset(field)));				\
- 		break;							\
- 	case 4:								\
- 		asm(op "l %1,%%gs:%c2" :				\
- 		    "+m" (_proxy_pda.field) :				\
- 		    "ri" ((T__)val),					\
- 		    "i" (pda_offset(field)));				\
- 		break;							\
- 	case 8:								\
- 		asm(op "q %1,%%gs:%c2":					\
- 		    "+m" (_proxy_pda.field) :				\
- 		    "ri" ((T__)val),					\
- 		    "i"(pda_offset(field)));				\
- 		break;							\
- 	default:							\
- 		__bad_pda_field();					\
- 	}								\
- } while (0)
- 
- #define pda_from_op(op, field)			\
- ({						\
- 	typeof(_proxy_pda.field) ret__;		\
- 	switch (sizeof(_proxy_pda.field)) {	\
- 	case 2:					\
- 		asm(op "w %%gs:%c1,%0" :	\
- 		    "=r" (ret__) :		\
- 		    "i" (pda_offset(field)),	\
- 		    "m" (_proxy_pda.field));	\
- 		break;				\
- 	case 4:					\
- 		asm(op "l %%gs:%c1,%0":		\
- 		    "=r" (ret__):		\
- 		    "i" (pda_offset(field)),	\
- 		    "m" (_proxy_pda.field));	\
- 		break;				\
- 	case 8:					\
- 		asm(op "q %%gs:%c1,%0":		\
- 		    "=r" (ret__) :		\
- 		    "i" (pda_offset(field)),	\
- 		    "m" (_proxy_pda.field));	\
- 		break;				\
- 	default:				\
- 		__bad_pda_field();		\
- 	}					\
- 	ret__;					\
- })
- 
- #define read_pda(field)		pda_from_op("mov", field)
- #define write_pda(field, val)	pda_to_op("mov", field, val)
- #define add_pda(field, val)	pda_to_op("add", field, val)
- #define sub_pda(field, val)	pda_to_op("sub", field, val)
- #define or_pda(field, val)	pda_to_op("or", field, val)
+ #define read_pda(field)		percpu_read(__pda.field)
+ #define write_pda(field, val)	percpu_write(__pda.field, val)
+ #define add_pda(field, val)	percpu_add(__pda.field, val)
+ #define sub_pda(field, val)	percpu_sub(__pda.field, val)
+ #define or_pda(field, val)	percpu_or(__pda.field, val)
  
  /* This is not atomic against other CPUs -- CPU preemption needs to be off */
  #define test_and_clear_bit_pda(bit, field)				\
@@@ -130,8 -42,4 +40,6 @@@
  
  #endif
  
- #define PDA_STACKOFFSET (5*8)
- 
 +#define refresh_stack_canary() write_pda(stack_canary, current->stack_canary)
 +
  #endif /* _ASM_X86_PDA_H */
diff --cc arch/x86/include/asm/stackprotector.h
index 3baf7ad89be1,000000000000..c7f0d10bae7b
mode 100644,000000..100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@@ -1,38 -1,0 +1,39 @@@
 +#ifndef _ASM_STACKPROTECTOR_H
 +#define _ASM_STACKPROTECTOR_H 1
 +
 +#include <asm/tsc.h>
++#include <asm/pda.h>
 +
 +/*
 + * Initialize the stackprotector canary value.
 + *
 + * NOTE: this must only be called from functions that never return,
 + * and it must always be inlined.
 + */
 +static __always_inline void boot_init_stack_canary(void)
 +{
 +	u64 canary;
 +	u64 tsc;
 +
 +	/*
 +	 * If we're the non-boot CPU, nothing set the PDA stack
 +	 * canary up for us - and if we are the boot CPU we have
 +	 * a 0 stack canary. This is a good place for updating
 +	 * it, as we wont ever return from this function (so the
 +	 * invalid canaries already on the stack wont ever
 +	 * trigger).
 +	 *
 +	 * We both use the random pool and the current TSC as a source
 +	 * of randomness. The TSC only matters for very early init,
 +	 * there it already has some randomness on most systems. Later
 +	 * on during the bootup the random pool has true entropy too.
 +	 */
 +	get_random_bytes(&canary, sizeof(canary));
 +	tsc = __native_read_tsc();
 +	canary += tsc + (tsc << 32UL);
 +
 +	current->stack_canary = canary;
 +	write_pda(stack_canary, canary);
 +}
 +
 +#endif
diff --cc arch/x86/include/asm/system.h
index 2f6340a44291,d1dc27dba36d..8cadfe9b1194
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@@ -94,9 -94,7 +94,9 @@@ do {									
  	     "call __switch_to\n\t"					  \
  	     ".globl thread_return\n"					  \
  	     "thread_return:\n\t"					  \
- 	     "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"			  \
+ 	     "movq "__percpu_arg([current_task])",%%rsi\n\t"		  \
 +	     "movq %P[task_canary](%%rsi),%%r8\n\t"			  \
 +	     "movq %%r8,%%gs:%P[pda_canary]\n\t"			  \
  	     "movq %P[thread_info](%%rsi),%%r8\n\t"			  \
  	     LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"	  \
  	     "movq %%rax,%%rdi\n\t" 					  \
@@@ -108,9 -106,7 +108,9 @@@
  	       [ti_flags] "i" (offsetof(struct thread_info, flags)),	  \
  	       [tif_fork] "i" (TIF_FORK),			  	  \
  	       [thread_info] "i" (offsetof(struct task_struct, stack)),   \
 -	       [current_task] "m" (per_cpu_var(current_task))		  \
 +	       [task_canary] "i" (offsetof(struct task_struct, stack_canary)),\
- 	       [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)), \
++	       [current_task] "m" (per_cpu_var(current_task)),		  \
 +	       [pda_canary] "i" (offsetof(struct x8664_pda, stack_canary))\
  	     : "memory", "cc" __EXTRA_CLOBBER)
  #endif
  
diff --cc arch/x86/kernel/process_64.c
index efb0396e19bf,4523ff88a69d..aa89eabf09e0
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@@ -625,14 -619,15 +631,14 @@@ __switch_to(struct task_struct *prev_p
  	/*
  	 * Switch the PDA and FPU contexts.
  	 */
- 	prev->usersp = read_pda(oldrsp);
- 	write_pda(oldrsp, next->usersp);
- 	write_pda(pcurrent, next_p);
+ 	prev->usersp = percpu_read(old_rsp);
+ 	percpu_write(old_rsp, next->usersp);
+ 	percpu_write(current_task, next_p);
  
- 	write_pda(kernelstack,
+ 	percpu_write(kernel_stack,
  		  (unsigned long)task_stack_page(next_p) +
- 		  THREAD_SIZE - PDA_STACKOFFSET);
+ 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
  #ifdef CONFIG_CC_STACKPROTECTOR
 -	write_pda(stack_canary, next_p->stack_canary);
  	/*
  	 * Build time only check to make sure the stack_canary is at
  	 * offset 40 in the pda; this is a gcc ABI requirement
diff --cc init/main.c
index 07da4dea50c3,844209453c02..bfe4fb0c9842
--- a/init/main.c
+++ b/init/main.c
@@@ -561,15 -537,8 +538,14 @@@ asmlinkage void __init start_kernel(voi
  	 * Need to run as early as possible, to initialize the
  	 * lockdep hash:
  	 */
- 	unwind_init();
  	lockdep_init();
  	debug_objects_early_init();
 +
 +	/*
 +	 * Set up the the initial canary ASAP:
 +	 */
 +	boot_init_stack_canary();
 +
  	cgroup_init_early();
  
  	local_irq_disable();
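
The stack canary setup this merge carries over mixes the random pool with the
TSC inside boot_init_stack_canary(). A minimal userspace sketch of just that
mixing step (assumptions: x86 only, rand() standing in for get_random_bytes()
and a raw rdtsc for __native_read_tsc()):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    static inline uint64_t rdtsc(void)
    {
            uint32_t lo, hi;

            asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
            return ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
            uint64_t canary, tsc;

            srand(time(NULL));
            /* stand-in for get_random_bytes(&canary, sizeof(canary)) */
            canary = ((uint64_t)rand() << 32) ^ (uint64_t)rand();

            /* same mixing as boot_init_stack_canary(): fold the TSC
               into both halves of the random value */
            tsc = rdtsc();
            canary += tsc + (tsc << 32);

            printf("canary = %#018llx\n", (unsigned long long)canary);
            return 0;
    }

Per the comment in the patch, the TSC term matters mainly during very early
init; later in bootup the random pool itself carries true entropy.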

commit af37501c792107c2bde1524bdae38d9a247b841a
Merge: d859e29fe34c 99937d6455ce
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Jan 18 18:15:49 2009 +0100

    Merge branch 'core/percpu' into perfcounters/core
    
    Conflicts:
            arch/x86/include/asm/pda.h
    
    We merge tip/core/percpu into tip/perfcounters/core because of a
    semantic and contextual conflict: the former eliminates the PDA,
    while the latter extends it with the apic_perf_irqs field.
    
    Resolve the conflict by moving the new field to the irq_cpustat
    structure on 64-bit too.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --cc arch/x86/include/asm/hardirq_64.h
index b5a6b5d56704,a65bab20f6ce..42930b279215
--- a/arch/x86/include/asm/hardirq_64.h
+++ b/arch/x86/include/asm/hardirq_64.h
@@@ -3,9 -3,23 +3,24 @@@
  
  #include <linux/threads.h>
  #include <linux/irq.h>
- #include <asm/pda.h>
  #include <asm/apic.h>
  
+ typedef struct {
+ 	unsigned int __softirq_pending;
+ 	unsigned int __nmi_count;	/* arch dependent */
+ 	unsigned int apic_timer_irqs;	/* arch dependent */
++	unsigned int apic_perf_irqs;	/* arch dependent */
+ 	unsigned int irq0_irqs;
+ 	unsigned int irq_resched_count;
+ 	unsigned int irq_call_count;
+ 	unsigned int irq_tlb_count;
+ 	unsigned int irq_thermal_count;
+ 	unsigned int irq_spurious_count;
+ 	unsigned int irq_threshold_count;
+ } ____cacheline_aligned irq_cpustat_t;
+ 
+ DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
+ 
  /* We can have at most NR_VECTORS irqs routed to a cpu at a time */
  #define MAX_HARDIRQS_PER_CPU NR_VECTORS
  

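With the counters moved into a per-cpu irq_cpustat_t, an interrupt path bumps
its own field through the per-cpu accessors instead of through PDA fields. A
hedged sketch of the pattern (the inc_irq_stat() helper name below is
illustrative, not something this merge adds):

    /* bump one field of this CPU's irq_cpustat_t */
    #define inc_irq_stat(member)  percpu_add(irq_stat.member, 1)

    /* e.g. from the performance counter interrupt handler: */
    inc_irq_stat(apic_perf_irqs);
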
commit 99937d6455cea95405ac681c86a857d0fcd530bd
Merge: 74e7904559a1 87b264065880
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Jan 18 17:41:32 2009 +0100

    Merge branch 'tj-percpu' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc into core/percpu

commit 74296a8ed6aa3c5bf672808ada690de7ba323ecc
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Jan 16 17:43:50 2009 +0100

    irq: provide debug_poll_all_shared_irqs() method under CONFIG_DEBUG_SHIRQ
    
    Provide a shared interrupt debug facility under CONFIG_DEBUG_SHIRQ:
    it uses the existing irqpoll facilities to iterate through all
    registered interrupt handlers and call those which can handle shared
    IRQ lines.
    
    This can be handy for suspend/resume debugging: if we call this function
    early during resume, we can trigger crashes in those drivers which have
    incorrect assumptions about when exactly their ISRs will be called
    during suspend/resume.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b51a39..468e3a25a4a1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -462,6 +462,12 @@ static inline void init_irq_proc(void)
 }
 #endif
 
+#if defined(CONFIG_GENERIC_HARDIRQS) && defined(CONFIG_DEBUG_SHIRQ)
+extern void debug_poll_all_shared_irqs(void);
+#else
+static inline void debug_poll_all_shared_irqs(void) { }
+#endif
+
 int show_interrupts(struct seq_file *p, void *v);
 
 struct irq_desc;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56e..4d568294de3e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
 	return ok;
 }
 
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_all_shared_irqs(void)
 {
 	struct irq_desc *desc;
 	int i;
@@ -123,11 +123,23 @@ static void poll_spurious_irqs(unsigned long dummy)
 
 		try_one_irq(i, desc);
 	}
+}
+
+static void poll_spurious_irqs(unsigned long dummy)
+{
+	poll_all_shared_irqs();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
 
+#ifdef CONFIG_DEBUG_SHIRQ
+void debug_poll_all_shared_irqs(void)
+{
+	poll_all_shared_irqs();
+}
+#endif
+
 /*
  * If 99,900 of the previous 100,000 interrupts have not been handled
  * then assume that the IRQ is stuck in some manner. Drop a diagnostic
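
One plausible way to use the new hook while chasing a resume bug; the driver
and its callback are hypothetical, only debug_poll_all_shared_irqs() comes
from this patch:

    #include <linux/device.h>
    #include <linux/interrupt.h>

    static int mydrv_resume(struct device *dev)
    {
            /*
             * Fire every registered shared-IRQ handler before touching
             * the hardware: a handler with wrong assumptions about
             * suspend/resume ordering crashes here, right next to the
             * cause, instead of at some unrelated interrupt later.
             */
            debug_poll_all_shared_irqs();

            return 0;
    }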

commit 5a2dd72abdae75ea2960145e0549635ce4e0be96
Merge: efdc64f0c792 7cb36b6ccdca
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Jan 16 17:46:22 2009 +0100

    Merge branch 'linus' into irq/genirq

commit 6dbde3530850d4d8bfc1b6bd4006d92786a2787f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Jan 15 22:15:53 2009 +0900

    percpu: add optimized generic percpu accessors
    
    It is an optimization and a cleanup, and adds the following new
    generic percpu methods:
    
      percpu_read()
      percpu_write()
      percpu_add()
      percpu_sub()
      percpu_and()
      percpu_or()
      percpu_xor()
    
    and implements support for them on x86 (other architectures will fall
    back to a default implementation).
    
    The advantage is that, for example, to read a local percpu variable,
    instead of this sequence:
    
     return __get_cpu_var(var);
    
     ffffffff8102ca2b:      48 8b 14 fd 80 09 74    mov    -0x7e8bf680(,%rdi,8),%rdx
     ffffffff8102ca32:      81
     ffffffff8102ca33:      48 c7 c0 d8 59 00 00    mov    $0x59d8,%rax
     ffffffff8102ca3a:      48 8b 04 10             mov    (%rax,%rdx,1),%rax
    
    we can get a single instruction by using the optimized variants:
    
     return percpu_read(var);
    
     ffffffff8102ca3f:      65 48 8b 05 91 8f fd    mov    %gs:0x7efd8f91(%rip),%rax
    
    I also cleaned up the x86-specific APIs and made the x86 code use
    these new generic percpu primitives.
    
    tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out
        * added percpu_and() for completeness's sake
        * made generic percpu ops atomic against preemption
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Tejun Heo <tj@kernel.org>
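
A short usage sketch of the new accessors; the variable and function below
are made up for illustration:

    #include <linux/percpu.h>

    static DEFINE_PER_CPU(unsigned long, nr_widget_events);

    static void account_widget_event(void)
    {
            /* each access below is atomic w.r.t. preemption on its own;
               on x86 each compiles to one %gs/%fs-prefixed instruction */
            percpu_add(nr_widget_events, 1);

            if (percpu_read(nr_widget_events) > 1024)
                    percpu_write(nr_widget_events, 0);
    }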

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 0930b4f8d672..0728480f5c56 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -10,7 +10,7 @@ struct task_struct;
 DECLARE_PER_CPU(struct task_struct *, current_task);
 static __always_inline struct task_struct *get_current(void)
 {
-	return x86_read_percpu(current_task);
+	return percpu_read(current_task);
 }
 
 #else /* X86_32 */
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
index 86afd7473457..d7ed33ee94e9 100644
--- a/arch/x86/include/asm/irq_regs_32.h
+++ b/arch/x86/include/asm/irq_regs_32.h
@@ -15,7 +15,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs);
 
 static inline struct pt_regs *get_irq_regs(void)
 {
-	return x86_read_percpu(irq_regs);
+	return percpu_read(irq_regs);
 }
 
 static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
@@ -23,7 +23,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
 	struct pt_regs *old_regs;
 
 	old_regs = get_irq_regs();
-	x86_write_percpu(irq_regs, new_regs);
+	percpu_write(irq_regs, new_regs);
 
 	return old_regs;
 }
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
index 7e98ce1d2c0e..08b53454f831 100644
--- a/arch/x86/include/asm/mmu_context_32.h
+++ b/arch/x86/include/asm/mmu_context_32.h
@@ -4,8 +4,8 @@
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 #ifdef CONFIG_SMP
-	if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
-		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+		percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
 #endif
 }
 
@@ -19,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev,
 		/* stop flush ipis for the previous mm */
 		cpu_clear(cpu, prev->cpu_vm_mask);
 #ifdef CONFIG_SMP
-		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
-		x86_write_percpu(cpu_tlbstate.active_mm, next);
+		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+		percpu_write(cpu_tlbstate.active_mm, next);
 #endif
 		cpu_set(cpu, next->cpu_vm_mask);
 
@@ -35,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev,
 	}
 #ifdef CONFIG_SMP
 	else {
-		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
-		BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
+		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+		BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
 
 		if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
 			/* We were in lazy tlb mode and leave_mm disabled
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index e3d3a081d798..47f274fe6953 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -45,11 +45,11 @@ extern void pda_init(int);
 
 #define cpu_pda(cpu)		(&per_cpu(__pda, cpu))
 
-#define read_pda(field)		x86_read_percpu(__pda.field)
-#define write_pda(field, val)	x86_write_percpu(__pda.field, val)
-#define add_pda(field, val)	x86_add_percpu(__pda.field, val)
-#define sub_pda(field, val)	x86_sub_percpu(__pda.field, val)
-#define or_pda(field, val)	x86_or_percpu(__pda.field, val)
+#define read_pda(field)		percpu_read(__pda.field)
+#define write_pda(field, val)	percpu_write(__pda.field, val)
+#define add_pda(field, val)	percpu_add(__pda.field, val)
+#define sub_pda(field, val)	percpu_sub(__pda.field, val)
+#define or_pda(field, val)	percpu_or(__pda.field, val)
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
 #define test_and_clear_bit_pda(bit, field)				\
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 328b31a429d7..03aa4b00a1c3 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -40,16 +40,11 @@
 
 #ifdef CONFIG_SMP
 #define __percpu_seg_str	"%%"__stringify(__percpu_seg)":"
-#define __my_cpu_offset		x86_read_percpu(this_cpu_off)
+#define __my_cpu_offset		percpu_read(this_cpu_off)
 #else
 #define __percpu_seg_str
 #endif
 
-#include <asm-generic/percpu.h>
-
-/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
-
 /* For arch-specific code, we can use direct single-insn ops (they
  * don't give an lvalue though). */
 extern void __bad_percpu_size(void);
@@ -115,11 +110,13 @@ do {							\
 	ret__;						\
 })
 
-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
-#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_read(var)	percpu_from_op("mov", per_cpu__##var)
+#define percpu_write(var, val)	percpu_to_op("mov", per_cpu__##var, val)
+#define percpu_add(var, val)	percpu_to_op("add", per_cpu__##var, val)
+#define percpu_sub(var, val)	percpu_to_op("sub", per_cpu__##var, val)
+#define percpu_and(var, val)	percpu_to_op("and", per_cpu__##var, val)
+#define percpu_or(var, val)	percpu_to_op("or", per_cpu__##var, val)
+#define percpu_xor(var, val)	percpu_to_op("xor", per_cpu__##var, val)
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
 #define x86_test_and_clear_bit_percpu(bit, var)				\
@@ -131,6 +128,11 @@ do {							\
 	old__;								\
 })
 
+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
 #ifdef CONFIG_X86_64
 extern void load_pda_offset(int cpu);
 #else
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 127415402ea1..c7bbbbe65d3f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -160,7 +160,7 @@ extern unsigned disabled_cpus __cpuinitdata;
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define raw_smp_processor_id() (percpu_read(cpu_number))
 extern int safe_smp_processor_id(void);
 
 #elif defined(CONFIG_X86_64_SMP)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a546f55c77b4..77d546817d94 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -591,7 +591,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (prev->gs | next->gs)
 		loadsegment(gs, next->gs);
 
-	x86_write_percpu(current_task, next_p);
+	percpu_write(current_task, next_p);
 
 	return prev_p;
 }
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index ec53818f4e38..e65449d0f7d9 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -34,8 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock);
  */
 void leave_mm(int cpu)
 {
-	BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
-	cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
+	BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK);
+	cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
 	load_cr3(swapper_pg_dir);
 }
 EXPORT_SYMBOL_GPL(leave_mm);
@@ -103,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
 		 * BUG();
 		 */
 
-	if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
-		if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
+	if (flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
 			if (flush_va == TLB_FLUSH_ALL)
 				local_flush_tlb();
 			else
@@ -222,7 +222,7 @@ static void do_flush_tlb_all(void *info)
 	unsigned long cpu = smp_processor_id();
 
 	__flush_tlb_all();
-	if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
 		leave_mm(cpu);
 }
 
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 1a48368acb09..96f15b09a4c5 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -402,7 +402,7 @@ void __init find_smp_config(void)
 	     VOYAGER_SUS_IN_CONTROL_PORT);
 
 	current_thread_info()->cpu = boot_cpu_id;
-	x86_write_percpu(cpu_number, boot_cpu_id);
+	percpu_write(cpu_number, boot_cpu_id);
 }
 
 /*
@@ -1782,7 +1782,7 @@ static void __init voyager_smp_cpus_done(unsigned int max_cpus)
 void __init smp_setup_processor_id(void)
 {
 	current_thread_info()->cpu = hard_smp_processor_id();
-	x86_write_percpu(cpu_number, hard_smp_processor_id());
+	percpu_write(cpu_number, hard_smp_processor_id());
 }
 
 static void voyager_send_call_func(cpumask_t callmask)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 312414ef9365..75b94139e1f2 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -695,17 +695,17 @@ static void xen_write_cr0(unsigned long cr0)
 
 static void xen_write_cr2(unsigned long cr2)
 {
-	x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+	percpu_read(xen_vcpu)->arch.cr2 = cr2;
 }
 
 static unsigned long xen_read_cr2(void)
 {
-	return x86_read_percpu(xen_vcpu)->arch.cr2;
+	return percpu_read(xen_vcpu)->arch.cr2;
 }
 
 static unsigned long xen_read_cr2_direct(void)
 {
-	return x86_read_percpu(xen_vcpu_info.arch.cr2);
+	return percpu_read(xen_vcpu_info.arch.cr2);
 }
 
 static void xen_write_cr4(unsigned long cr4)
@@ -718,12 +718,12 @@ static void xen_write_cr4(unsigned long cr4)
 
 static unsigned long xen_read_cr3(void)
 {
-	return x86_read_percpu(xen_cr3);
+	return percpu_read(xen_cr3);
 }
 
 static void set_current_cr3(void *v)
 {
-	x86_write_percpu(xen_current_cr3, (unsigned long)v);
+	percpu_write(xen_current_cr3, (unsigned long)v);
 }
 
 static void __xen_write_cr3(bool kernel, unsigned long cr3)
@@ -748,7 +748,7 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 
 	if (kernel) {
-		x86_write_percpu(xen_cr3, cr3);
+		percpu_write(xen_cr3, cr3);
 
 		/* Update xen_current_cr3 once the batch has actually
 		   been submitted. */
@@ -764,7 +764,7 @@ static void xen_write_cr3(unsigned long cr3)
 
 	/* Update while interrupts are disabled, so its atomic with
 	   respect to ipis */
-	x86_write_percpu(xen_cr3, cr3);
+	percpu_write(xen_cr3, cr3);
 
 	__xen_write_cr3(true, cr3);
 
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb042608c602..2e8271431e1a 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -39,7 +39,7 @@ static unsigned long xen_save_fl(void)
 	struct vcpu_info *vcpu;
 	unsigned long flags;
 
-	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu = percpu_read(xen_vcpu);
 
 	/* flag has opposite sense of mask */
 	flags = !vcpu->evtchn_upcall_mask;
@@ -62,7 +62,7 @@ static void xen_restore_fl(unsigned long flags)
 	   make sure we're don't switch CPUs between getting the vcpu
 	   pointer and updating the mask. */
 	preempt_disable();
-	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu = percpu_read(xen_vcpu);
 	vcpu->evtchn_upcall_mask = flags;
 	preempt_enable_no_resched();
 
@@ -83,7 +83,7 @@ static void xen_irq_disable(void)
 	   make sure we're don't switch CPUs between getting the vcpu
 	   pointer and updating the mask. */
 	preempt_disable();
-	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+	percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
 	preempt_enable_no_resched();
 }
 
@@ -96,7 +96,7 @@ static void xen_irq_enable(void)
 	   the caller is confused and is trying to re-enable interrupts
 	   on an indeterminate processor. */
 
-	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu = percpu_read(xen_vcpu);
 	vcpu->evtchn_upcall_mask = 0;
 
 	/* Doesn't matter if we get preempted here, because any
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240e26c7..7bc7852cc5c4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1074,7 +1074,7 @@ static void drop_other_mm_ref(void *info)
 
 	/* If this cpu still has a stale cr3 reference, then make sure
 	   it has been flushed. */
-	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+	if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
 		load_cr3(swapper_pg_dir);
 		arch_flush_lazy_cpu_mode();
 	}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 858938241616..e786fa7f2615 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -39,7 +39,7 @@ static inline void xen_mc_issue(unsigned mode)
 		xen_mc_flush();
 
 	/* restore flags saved in xen_mc_batch */
-	local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+	local_irq_restore(percpu_read(xen_mc_irq_flags));
 }
 
 /* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 83fa4236477d..3bfd6dd0b47c 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -78,7 +78,7 @@ static __cpuinit void cpu_bringup(void)
 	xen_setup_cpu_clockevents();
 
 	cpu_set(cpu, cpu_online_map);
-	x86_write_percpu(cpu_state, CPU_ONLINE);
+	percpu_write(cpu_state, CPU_ONLINE);
 	wmb();
 
 	/* We can take interrupts now: we're officially "up". */
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index b0e63c672ebd..00f45ff081a6 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
 #define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
 					__typeof__(type) per_cpu_var(name)
 
+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access.
+ *
+ * @var can be a percpu variable or a field of it and its size should
+ * equal char, int or long.  percpu_read() evaluates to a lvalue and
+ * all others to void.
+ *
+ * These operations are guaranteed to be atomic w.r.t. preemption.
+ * The generic versions use plain get/put_cpu_var().  Archs are
+ * encouraged to implement single-instruction alternatives which don't
+ * require preemption protection.
+ */
+#ifndef percpu_read
+# define percpu_read(var)						\
+  ({									\
+	typeof(per_cpu_var(var)) __tmp_var__;				\
+	__tmp_var__ = get_cpu_var(var);					\
+	put_cpu_var(var);						\
+	__tmp_var__;							\
+  })
+#endif
+
+#define __percpu_generic_to_op(var, val, op)				\
+do {									\
+	get_cpu_var(var) op val;					\
+	put_cpu_var(var);						\
+} while (0)
+
+#ifndef percpu_write
+# define percpu_write(var, val)		__percpu_generic_to_op(var, (val), =)
+#endif
+
+#ifndef percpu_add
+# define percpu_add(var, val)		__percpu_generic_to_op(var, (val), +=)
+#endif
+
+#ifndef percpu_sub
+# define percpu_sub(var, val)		__percpu_generic_to_op(var, (val), -=)
+#endif
+
+#ifndef percpu_and
+# define percpu_and(var, val)		__percpu_generic_to_op(var, (val), &=)
+#endif
+
+#ifndef percpu_or
+# define percpu_or(var, val)		__percpu_generic_to_op(var, (val), |=)
+#endif
+
+#ifndef percpu_xor
+# define percpu_xor(var, val)		__percpu_generic_to_op(var, (val), ^=)
+#endif
+
 #endif /* _ASM_GENERIC_PERCPU_H_ */
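
On an architecture without single-instruction variants, an accessor call such
as percpu_add(var, 1) expands through __percpu_generic_to_op() into roughly:

    do {
            get_cpu_var(var) += 1;  /* get_cpu_var() disables preemption, */
            put_cpu_var(var);       /* put_cpu_var() re-enables it */
    } while (0)

which is what makes even the fallback versions atomic against preemption, as
the comment block above promises.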