diff options
Diffstat (limited to 'arch/x86/lib')
-rw-r--r-- | arch/x86/lib/copy_user_64.S | 25 | ||||
-rw-r--r-- | arch/x86/lib/copy_user_nocache_64.S | 25 | ||||
-rw-r--r-- | arch/x86/lib/delay_32.c | 31 | ||||
-rw-r--r-- | arch/x86/lib/delay_64.c | 30 |
4 files changed, 75 insertions, 36 deletions
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 70bebd310408..ee1c3f635157 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -217,19 +217,19 @@ ENTRY(copy_user_generic_unrolled) /* table sorted by exception address */ .section __ex_table,"a" .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e + .quad .Ls1,.Ls1e /* Ls1-Ls4 have copied zero bytes */ + .quad .Ls2,.Ls1e + .quad .Ls3,.Ls1e + .quad .Ls4,.Ls1e + .quad .Ld1,.Ls1e /* Ld1-Ld4 have copied 0-24 bytes */ .quad .Ld2,.Ls2e .quad .Ld3,.Ls3e .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e + .quad .Ls5,.Ls5e /* Ls5-Ls8 have copied 32 bytes */ + .quad .Ls6,.Ls5e + .quad .Ls7,.Ls5e + .quad .Ls8,.Ls5e + .quad .Ld5,.Ls5e /* Ld5-Ld8 have copied 32-56 bytes */ .quad .Ld6,.Ls6e .quad .Ld7,.Ls7e .quad .Ld8,.Ls8e @@ -244,11 +244,8 @@ ENTRY(copy_user_generic_unrolled) .quad .Le5,.Le_zero .previous - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax +.Ls1e: addl $8,%eax /* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */ .Ls2e: addl $8,%eax .Ls3e: addl $8,%eax .Ls4e: addl $8,%eax diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index 5196762b3b0e..9d3d1ab83763 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -145,19 +145,19 @@ ENTRY(__copy_user_nocache) /* table sorted by exception address */ .section __ex_table,"a" .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e + .quad .Ls1,.Ls1e /* .Ls[1-4] - 0 bytes copied */ + .quad .Ls2,.Ls1e + .quad .Ls3,.Ls1e + .quad .Ls4,.Ls1e + .quad .Ld1,.Ls1e /* .Ld[1-4] - 0..24 bytes coped */ .quad .Ld2,.Ls2e .quad .Ld3,.Ls3e .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e + .quad .Ls5,.Ls5e /* .Ls[5-8] - 32 bytes copied */ + .quad .Ls6,.Ls5e + .quad .Ls7,.Ls5e + .quad .Ls8,.Ls5e + .quad .Ld5,.Ls5e /* .Ld[5-8] - 32..56 bytes copied */ .quad .Ld6,.Ls6e .quad .Ld7,.Ls7e .quad .Ld8,.Ls8e @@ -172,11 +172,8 @@ ENTRY(__copy_user_nocache) .quad .Le5,.Le_zero .previous - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax +.Ls1e: addl $8,%eax /* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */ .Ls2e: addl $8,%eax .Ls3e: addl $8,%eax .Ls4e: addl $8,%eax diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c index 4535e6d147ad..d710f2d167bb 100644 --- a/arch/x86/lib/delay_32.c +++ b/arch/x86/lib/delay_32.c @@ -44,13 +44,36 @@ static void delay_loop(unsigned long loops) static void delay_tsc(unsigned long loops) { unsigned long bclock, now; + int cpu; - preempt_disable(); /* TSC's are per-cpu */ + preempt_disable(); + cpu = smp_processor_id(); rdtscl(bclock); - do { - rep_nop(); + for (;;) { rdtscl(now); - } while ((now-bclock) < loops); + if ((now - bclock) >= loops) + break; + + /* Allow RT tasks to run */ + preempt_enable(); + rep_nop(); + preempt_disable(); + + /* + * It is possible that we moved to another CPU, and + * since TSC's are per-cpu we need to calculate + * that. The delay must guarantee that we wait "at + * least" the amount of time. Being moved to another + * CPU could make the wait longer but we just need to + * make sure we waited long enough. Rebalance the + * counter for this CPU. + */ + if (unlikely(cpu != smp_processor_id())) { + loops -= (now - bclock); + cpu = smp_processor_id(); + rdtscl(bclock); + } + } preempt_enable(); } diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c index bbc610518516..4c441be92641 100644 --- a/arch/x86/lib/delay_64.c +++ b/arch/x86/lib/delay_64.c @@ -31,14 +31,36 @@ int __devinit read_current_timer(unsigned long *timer_value) void __delay(unsigned long loops) { unsigned bclock, now; + int cpu; - preempt_disable(); /* TSC's are pre-cpu */ + preempt_disable(); + cpu = smp_processor_id(); rdtscl(bclock); - do { - rep_nop(); + for (;;) { rdtscl(now); + if ((now - bclock) >= loops) + break; + + /* Allow RT tasks to run */ + preempt_enable(); + rep_nop(); + preempt_disable(); + + /* + * It is possible that we moved to another CPU, and + * since TSC's are per-cpu we need to calculate + * that. The delay must guarantee that we wait "at + * least" the amount of time. Being moved to another + * CPU could make the wait longer but we just need to + * make sure we waited long enough. Rebalance the + * counter for this CPU. + */ + if (unlikely(cpu != smp_processor_id())) { + loops -= (now - bclock); + cpu = smp_processor_id(); + rdtscl(bclock); + } } - while ((now-bclock) < loops); preempt_enable(); } EXPORT_SYMBOL(__delay); |