/* ** The time to take a cube root from a 32 bits uint was: < 26 ns (mean: < 22 ns) new time: < 14 ns (mean: < 13 ns) The trick used for small square roots is used again: "unroll the loop". First step: A binary search, the "if" statements, large values ( >= 1u << 24 ) are found relatively faster, small values ( < 64 ) are handled during the search. Second step: A jump into the unrolled loop, the "labels". |--------------------------------| | cro32(x) | |-----|-----||------------|------| | x | ns || x | ns | |-----|-----||------------|------| | 0 | 2,4 || 511 | 4,6 | | 1 | 2,3 || 1023 | 5,3 | | 3 | 2,3 || 2047 | 5,3 | | 7 | 2,3 || 4095 | 5,0 | | 15 | 3,6 || 32768 | 7,2 | | 31 | 3,0 || 1048576 | 8,5 | | 63 | 3,0 || 268435456 | 12,0 | | 64 | 5,0 || 536870912 | 12,3 | heaviest case, | 65 | 5,0 || 1070599167 | 13,7 | <== 1070599167 is the | 100 | 5,0 || 1070599168 | 13,7 | 3th power of 1023, | 127 | 5,0 || 1073741824 | 12,3 | binary: 1111111111 | 128 | 5,0 || 1073774592 | 12,0 | | 255 | 4,6 || 2147483648 | 12,3 | | 256 | 4,6 || 4294967295 | 12,0 | |-----|-----||------------|------| 4294967295 roots in 53884 ms, mean time 12,55 ns, faster than square root! UPDATE 2013/11/04 50686 ms, mean time 11,80 ns, see: part 2 ** Original version: Figure 2: Architecture of the iteration (alternative A) A digit-by-digit algorithm for radix-2 cube root and its implementation (2004) J.-A.Pineiro, J.D.Bruguera, L.Ciminiera, P.Montuschi */ using System; using System.Diagnostics; class cro_32 { private static uint cro32(uint x) { uint y = 4u, z = 16u, b = 0u; if (x < 1u << 24) if (x < 1u << 12) if (x < 1u << 06) if (x < 1u << 03) return x == 0u ? 0u : 1u; else return x < 27u ? 2u : 3u; else if (x < 1u << 09) goto L8; else goto L7; else if (x < 1u << 18) if (x < 1u << 15) goto L6; else goto L5; else if (x < 1u << 21) goto L4; else goto L3; else if (x < 1u << 30) if (x < 1u << 27) goto L2; else goto L1; if (x >= 27u << 27) { x -= 27u << 27; z = 36u; y = 6u; } else { x -= 1u << 30; } goto M0; L1: if (x >= 27u << 24) { x -= 27u << 24; z = 36u; y = 6u; } else { x -= 1u << 27; } goto M1; L2: if (x >= 27u << 21) { x -= 27u << 21; z = 36u; y = 6u; } else { x -= 1u << 24; } goto M2; L3: if (x >= 27u << 18) { x -= 27u << 18; z = 36u; y = 6u; } else { x -= 1u << 21; } goto M3; L4: if (x >= 27u << 15) { x -= 27u << 15; z = 36u; y = 6u; } else { x -= 1u << 18; } goto M4; L5: if (x >= 27u << 12) { x -= 27u << 12; z = 36u; y = 6u; } else { x -= 1u << 15; } goto M5; L6: if (x >= 27u << 09) { x -= 27u << 09; z = 36u; y = 6u; } else { x -= 1u << 12; } goto M6; L7: if (x >= 27u << 06) { x -= 27u << 06; z = 36u; y = 6u; } else { x -= 1u << 09; } goto M7; L8: if (x >= 27u << 03) { x -= 27u << 03; z = 36u; y = 6u; } else { x -= 1u << 06; } goto M8; M0: if (x >= 61u << 24) { x -= 61u << 24; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M1: b = (y + z) * 3 + 1u << 21; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M2: b = (y + z) * 3 + 1u << 18; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M3: b = (y + z) * 3 + 1u << 15; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M4: b = (y + z) * 3 + 1u << 12; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M5: b = (y + z) * 3 + 1u << 09; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M6: b = (y + z) * 3 + 1u << 06; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M7: b = (y + z) * 3 + 1u << 03; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M8: return x <= (y + z) * 3 ? y : y + 1u; } private static Stopwatch sw = new Stopwatch(); static void Main() { cro32(225); cro32(1070599167); check_perfect_cubes32(); time_cro32(); cro32_all(); Console.ReadLine(); } private static void check_perfect_cubes32() { uint n = 0, a = 0, b = 1, c = 6; while (n <= 1625) { if (n != cro32(a)) Console.WriteLine("WRONG"); a += b; b += c; c += 6; n++; } Console.WriteLine("CHECKED PERFECT CUBES"); } private static void time_cro32() { int i, j; uint x; double t; uint[] a = { 0, 1, 3, 7, 15, 31, 63, 64, 65, 100, 127, 128, 255, 256, 511, 1023, 2047, 4095, 1u << 15, 1u << 20, 1u << 28 ,1u << 29 , 1070599167, 1070599168,1u << 30,(1u << 30) + (1u << 15), 1u << 31, ~0u }; Console.WriteLine("|------------|------|"); Console.WriteLine("| x | ns |"); Console.WriteLine("|------------|------|"); for (j = 0; j < a.Length; j++) { x = a[j]; cro32(x); sw.Restart(); for (i = 0; i < 10000000; i++) cro32(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (i = 0; i < 10000000; i++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.WriteLine("| {0,10} | {1:00.0} |", x, t / 10); } Console.WriteLine("|------------|------|"); Console.WriteLine(); } private static void cro32_all() { uint x; double t; cro32(~0u); sw.Restart(); for (x = 0; x < ~0u; x++) cro32(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (x = 0; x < ~0u; x++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.Write(x + " roots in " + t + " ms, "); Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x); // 4294967295 roots in 53884 ms, mean time 12,55 ns } }
C# , System.Numerics, Multiplication, Karatsuba, Toom Cook, Division, Burnikel, Ziegler, Factorial, Luschny, Square Root, Zimmermann, Choose, Binomial Coefficient, Permutation, Combination, Eratosthenes, Primes, Fibonacci, Lucas, Pell, Catalan, Fast Random Number Generator, Overton
2013/09/26
Fast Cube Root ( < 2^32 )
2013/09/02
Fast Square Root ( < 2^32 )
/* The time to take a square root from a 32 bits uint was: ~35 ns, new time: <16 ns (mean: <14 ns). It's based on: A square root hardware algorithm from Guy L. Steele,Jr. The slightly optimized C# version below is nearly twice as fast. A few "goto Label" statements are used, they are fairly easy to remove, but I doubt it will be any faster (at all). |--------------------------------| | Times "cro32(x)" | | Athlon X4, XP, 2 GB | |-----|-----||------------|------| | x | ns || x | ns | |-----|-----||------------|------| | 0 | 1,6 || 511 | 4,3 | | 1 | 1,6 || 1023 | 4,6 | | 3 | 1,6 || 2047 | 5,0 | | 7 | 2,3 || 4095 | 5,6 | 4294967295 roots in 56962 ms, | 15 | 2,3 || 32768 | 6,6 | mean time 13,26 ns. | 31 | 3,6 || 1048576 | 10,0 | | 63 | 3,0 || 268435456 | 13,7 | | 64 | 4,3 || 536870912 | 11,4 | | 65 | 4,3 || 1070599167 | 13,0 | | 100 | 5,0 || 1073741824 | 15,1 | | 127 | 4,3 || 1073741824 | 15,1 | | 128 | 4,3 || 1073774592 | 15,1 | | 255 | 4,0 || 2147483648 | 13,1 | | 256 | 4,7 || 4294967295 | 15,7 | |-----|-----||------------|------| */ using System; using System.Diagnostics; class Square_Root { private static uint sro32(uint x) { uint y, z; if (x < 1u << 16) if (x < 1u << 08) if (x < 1u << 04) return x < 1u << 02 ? x + 3u >> 2 : x + 15u >> 3; else if (x < 1u << 06) { y = 1u << 03; x -= 1u << 04; if (x >= 5u << 02) { x -= 5u << 02; y |= 1u << 02; } goto L0; } else { y = 1u << 05; x -= 1u << 06; if (x >= 5u << 04) { x -= 5u << 04; y |= 1u << 04; } goto L1; } else // slower (on my pc): .... y = 3u << 04; } goto L1; } if (x < 1u << 12) if (x < 1u << 10) { y = 1u << 07; x -= 1u << 08; if (x >= 5u << 06) { x -= 5u << 06; y |= 1u << 06; } goto L2; } else { y = 1u << 09; x -= 1u << 10; if (x >= 5u << 08) { x -= 5u << 08; y |= 1u << 08; } goto L3; } else if (x < 1u << 14) { y = 1u << 11; x -= 1u << 12; if (x >= 5u << 10) { x -= 5u << 10; y |= 1u << 10; } goto L4; } else { y = 1u << 13; x -= 1u << 14; if (x >= 5u << 12) { x -= 5u << 12; y |= 1u << 12; } goto L5; } else if (x < 1u << 24) if (x < 1u << 20) if (x < 1u << 18) { y = 1u << 15; x -= 1u << 16; if (x >= 5u << 14) { x -= 5u << 14; y |= 1u << 14; } goto L6; } else { y = 1u << 17; x -= 1u << 18; if (x >= 5u << 16) { x -= 5u << 16; y |= 1u << 16; } goto L7; } else if (x < 1u << 22) { y = 1u << 19; x -= 1u << 20; if (x >= 5u << 18) { x -= 5u << 18; y |= 1u << 18; } goto L8; } else { y = 1u << 21; x -= 1u << 22; if (x >= 5u << 20) { x -= 5u << 20; y |= 1u << 20; } goto L9; } else if (x < 1u << 28) if (x < 1u << 26) { y = 1u << 23; x -= 1u << 24; if (x >= 5u << 22) { x -= 5u << 22; y |= 1u << 22; } goto La; } else { y = 1u << 25; x -= 1u << 26; if (x >= 5u << 24) { x -= 5u << 24; y |= 1u << 24; } goto Lb; } else if (x < 1u << 30) { y = 1u << 27; x -= 1u << 28; if (x >= 5u << 26) { x -= 5u << 26; y |= 1u << 26; } goto Lc; } else { y = 1u << 29; x -= 1u << 30; if (x >= 5u << 28) { x -= 5u << 28; y |= 1u << 28; } } z = y | 1u << 26; y /= 2; if (x >= z) { x -= z; y |= 1u << 26; } Lc: z = y | 1u << 24; y /= 2; if (x >= z) { x -= z; y |= 1u << 24; } Lb: z = y | 1u << 22; y /= 2; if (x >= z) { x -= z; y |= 1u << 22; } La: z = y | 1u << 20; y /= 2; if (x >= z) { x -= z; y |= 1u << 20; } L9: z = y | 1u << 18; y /= 2; if (x >= z) { x -= z; y |= 1u << 18; } L8: z = y | 1u << 16; y /= 2; if (x >= z) { x -= z; y |= 1u << 16; } L7: z = y | 1u << 14; y /= 2; if (x >= z) { x -= z; y |= 1u << 14; } L6: z = y | 1u << 12; y /= 2; if (x >= z) { x -= z; y |= 1u << 12; } L5: z = y | 1u << 10; y /= 2; if (x >= z) { x -= z; y |= 1u << 10; } L4: z = y | 1u << 08; y /= 2; if (x >= z) { x -= z; y |= 1u << 08; } L3: z = y | 1u << 06; y /= 2; if (x >= z) { x -= z; y |= 1u << 06; } L2: z = y | 1u << 04; y /= 2; if (x >= z) { x -= z; y |= 1u << 04; } L1: z = y | 1u << 02; y /= 2; if (x >= z) { x -= z; y |= 1u << 02; } L0: return x > y ? y / 2 | 1u : y / 2; } private static Stopwatch sw = new Stopwatch(); static void Main() { Console.WriteLine(sro32(4)); Console.WriteLine(sro32(40)); Console.WriteLine(sro32(400)); Console.WriteLine(sro32(4000)); Console.WriteLine(sro32(40000)); Console.WriteLine(sro32(400000)); Console.WriteLine(sro32(4000000)); Console.WriteLine(sro32(40000000)); Console.WriteLine(sro32(400000000)); Console.WriteLine(sro32(4000000000)); test_perfect_squares(); time_sro32(); sro32_all(); // ~1 minute Console.ReadLine(); } private static void test_perfect_squares() { uint n = 0, a = 0, b = 1; do { if (sro32(a) != n) Console.WriteLine("WRONG"); a += b; b += 2; } while (n++ < 65535); Console.WriteLine("Perfect squares tested"); } private static void time_sro32() { int i, j; uint x; double t; //uint[] a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; uint[] a = { 0, 1, 3, 7, 15, 31, 63, 64, 65, 100, 127, 128, 255, 256, 511, 1023, 2047, 4095, 1u << 15, 1u << 20, 1u << 28 ,1u << 29 , 1070599167, 1u << 30,1u << 30,(1u << 30) + (1u << 15), 1u << 31, 0xfffe0001u, ~0u }; Console.WriteLine("|------------|------|"); Console.WriteLine("| x | ns |"); Console.WriteLine("|------------|------|"); for (j = 0; j < a.Length; j++) { x = a[j]; sro32(x); sw.Restart(); for (i = 0; i < 10000000; i++) sro32(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (i = 0; i < 10000000; i++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.WriteLine("| {0,10} | {1:00.0} |", x, t / 10); } Console.WriteLine("|------------|------|"); Console.WriteLine(); } private static void sro32_all() { uint x; double t; sro32(~0u); sw.Restart(); for (x = 0; x < ~0u; x++) sro32(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (x = 0; x < ~0u; x++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.Write(x + " roots in " + t + " ms, "); Console.WriteLine("mean time {0:.00} ns", (double)(t) * 1000000 / x); // 4294967295 roots in 56962 ms, mean time 13,26 ns } }
//------------------------------------------------------------ // Another one, slower, the line "t = ..." is done many times. // // private static uint sro32s(uint x) // uint y1 = 0, y2 = 0, t; int k = 15; // for (; k >= 0; k--) // { // t = y2 + (y1 << k + 1) | (1u << k * 2); // if (x >= t) { y2 = t; y1 |= 1u << k; } // } // return y1; //-----------------------------------------------------------
Subscribe to:
Posts (Atom)