/* Source code of System.Random is available at: CodePlex/Random.cs System.Random is fast, basically a new random number is generated by a subtraction of two numbers. A straightforward copy of the source code is two times slower compared to using rand.Next directly. A way to speed it up: the codeplex code uses a multiplication by a double type. The multiplication can be replaced by a shift and an "if" statement when 30 random bits are needed, very occasionally an addition "+ 1" (and an "& 1") are necessary, that's what "Next30()" does in, voila: Fast Random Range (CodePlex) Other improvements over the previous version: -a 64 bits buffer, instead of a 32 bits one. Operations like "uint + uint" are as fast as "ulong + uint", same for: " - , ^ , | , << , >> " -Random numbers were computed bit by bit, now the bits are taken at once, some bit twiddling recycles unused random bits. Finally: A probably very true scaled (pseudo) random number generator, based on the .Net4 RNGCryptoServiceProvider class: ".. rng = new RNGCryptoServiceProvider". The "rng.Getbytes(..)" method fills a byte array, one byte as fast as 40 bytes, 40 bytes seems to be an optimum. "Buffer.BlockCopy" copies the byte array to a uint array. The uint array is used to fill a 64 random bits buffer, etc. "uint x = g_uint_c(uint u)" gives a random value for x, such that: 0 <= x <= u <= uint.MaxValue Results: 10^7 times rand.Next() takes: 107 ms (~11 ns per random number) 10^7 times rand.Next(int someValue) takes: 236 ms (~24 ns ...) |--------|-------------------------| rand.Next() 11 ns | | ms 10^7 * | rand.Next(i) 24 ns | |-----------|-------------| gen_uint(u) 50 ns, mean ~35 ns | u |gen_uint(u)| g_uint_c(u) | g_uint_c(u) 280 ns, mean ~250 ns |--------|-----|-----|------|------| | 0 | 24 | | 24 | | | 1 | | 73 | | 139 | | 2 | 124 | | 296 | | | 3 | | 78 | | 211 | | 4 | 242 | | 518 | | | 7 | | 83 | | 282 | | 8 | 290 | | 680 | | | 2^7-1 | | 102 | | 562 | | 2^7-0 | 380 | | 1079 | | | 2^15-1 | | 130 | | 1123 | | 2^15-0 | 414 | | 1634 | | | 2^30-1 | | 193 | | 2164 | | 2^30-0 | 494 | | 2672 | | | 2^31-1 | | 210 | | 2251 | | 2^31-0 | 461 | | 2747 | | | 2^32-1 | | 211 | | 2307 | |--------|-----|-----|------|------| http://www.cacert.at/cgi-bin/rngresults find: "P_P" , hover mouse over "Net4 System.Random" for the code. Random ~ gen_uint(u) CSP ~ g_uint_c(u) |--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------| | Gene- | Speed | Upload | Entropy | Birthday | Matrix | 6*8 Ma- | Minimum | Random | Squeeze | Overlap- | | rator | (MiB/s) | Size | (->8) | Spacing | Ranks | trix R | Distance | Spheres | | ping Sums| |--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------| | Random | 370 | 17 MiB | 7.999989 | 0.000000 | 0.392 | 0.136 | 0.063920 | 0.905097 | 0.098440 | 0.005467 | | CSP | 34 | 15 MiB | 7.999987 | 0.032102 | 0.084 | 0.942 | 0.894277 | 0.036221 | 0.706538 | 0.023523 | |--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------| */ using System; using System.Diagnostics; using System.Security.Cryptography; class scaled_random_uint { private static ulong buf = 0; // 64 random bits buffer private static int av = 0; // available bits in buf private static uint g_u = 0; // copy of u private static int g_nd = 0; // nd (needed) bits = highest set bit of u private static int g_s = 0; // shift private static bool ones = false; // true if u = 10, 110, 1110, ... private static uint g_uint_c(uint u) { uint x; int s; if (u == 0) return 0; if (u != g_u) { g_u = u; g_nd = nob(u); g_s = 64 - g_nd; ones = (u + 1 & u + 2) == 0; } L0: if (av < g_nd) { av += 32; buf <<= 32; buf |= next32(); } x = (uint)buf << g_s >> g_s; buf >>= g_nd; av -= g_nd; if (x <= u) return x; if (ones) goto L0; // faster when u = 2, 6, 14, ... s = nob(x ^ u) - 1; buf <<= s; buf ^= x; av += s; goto L0; } private static RNGCryptoServiceProvider rng = new RNGCryptoServiceProvider(); private static byte[] r08 = new byte[40]; // ... [400] ... ~15% faster private static uint[] r32 = new uint[10]; // ... [100] ... ,, ,, private static int i_r32 = 10; // ... = 100 ... ,, ,, private static uint next32() { if (i_r32 != 10) return r32[i_r32++]; // ... != 100... ,, ,, i_r32 = 1; rng.GetBytes(r08); Buffer.BlockCopy(r08, 0, r32, 0, 40); // ... , 400)... ,, ,, return r32[0]; } private static int nob(uint u) // number of bits, position of the highest set bit, u > 0 { return u < 1u << 16 ? u < 1u << 08 ? u < 1u << 04 ? u < 1u << 02 ? u < 1u << 01 ? 01 : 02 : u < 1u << 03 ? 03 : 04 : u < 1u << 06 ? u < 1u << 05 ? 05 : 06 : u < 1u << 07 ? 07 : 08 : u < 1u << 12 ? u < 1u << 10 ? u < 1u << 09 ? 09 : 10 : u < 1u << 11 ? 11 : 12 : u < 1u << 14 ? u < 1u << 13 ? 13 : 14 : u < 1u << 15 ? 15 : 16 : u < 1u << 24 ? u < 1u << 20 ? u < 1u << 18 ? u < 1u << 17 ? 17 : 18 : u < 1u << 19 ? 19 : 20 : u < 1u << 22 ? u < 1u << 21 ? 21 : 22 : u < 1u << 23 ? 23 : 24 : u < 1u << 28 ? u < 1u << 26 ? u < 1u << 25 ? 25 : 26 : u < 1u << 27 ? 27 : 28 : u < 1u << 30 ? u < 1u << 29 ? 29 : 30 : u < 1u << 31 ? 31 : 32; } private static Stopwatch sw = new Stopwatch(); static void Main() { speed((1u << 31) - 0); Console.ReadLine(); } private static void speed(uint u) { int i, j = 10000000; double t; sw.Restart(); for (i = 0; i < j; i++) g_uint_c(u); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (i = 0; i < j; i++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.WriteLine("u: " + u); Console.WriteLine("gen_uint(u)"); Console.WriteLine(" ms: " + t); Console.WriteLine(" bits: " + (double)(nob(u)) * j); Console.WriteLine(" bits/s: {0:.000}", (double)((nob(u)) * j) / t / 1000000); Console.WriteLine(); } }
C# , System.Numerics, Multiplication, Karatsuba, Toom Cook, Division, Burnikel, Ziegler, Factorial, Luschny, Square Root, Zimmermann, Choose, Binomial Coefficient, Permutation, Combination, Eratosthenes, Primes, Fibonacci, Lucas, Pell, Catalan, Fast Random Number Generator, Overton
2013/11/17
Uniform random numbers in a range, part 2
2013/11/04
Fast Cube Root ( < 2^32 ), part 2
![]() |
/* The mean time to take a cube root from a 32 bits uint was: < 13 ns new mean time: < 12 ns It is a bit faster, especially for large values, because variables are initialized properly. |--------------------------------| | cro32(x) | |-----|-----||------------|------| | x | ns || x | ns | |-----|-----||------------|------| | 0 | 3,0 || 511 | 4,6 | | 1 | 2,3 || 1023 | 5,3 | | 3 | 2,3 || 2047 | 5,6 | ___ | 7 | 2,3 || 4095 | 5,0 | \3 / | 15 | 3,3 || 32768 | 7,6 | \/ x = y | 31 | 2,6 || 1048576 | 9,0 | | 63 | 2,6 || 268435456 | 11,0 | Input: 0 <= x < 2^32 | 64 | 5,0 || 536870912 | 11,3 | Output: y, such that y^3 <= x < (y+1)^3 | 65 | 5,0 || 1070599167 | 13,0 | | 100 | 5,0 || 1070599168 | 13,0 | | 127 | 5,0 || 1073741824 | 11,3 | | 128 | 5,3 || 1073774592 | 11,3 | | 255 | 4,6 || 2147483648 | 11,0 | | 256 | 4,6 || 4294967295 | 11,7 | |-----|-----||------------|------| 4294967295 roots in 50686 ms, mean time 11,80 ns (before 12,55 ns) Another way to find roots is using a look-up table. Below "fill_cubes" fills a "cubes" array, "cro32t" does a binary search for a root. Mean time: 30 ns. I have not tried to optimize it further, I see no reason why it should be much faster. */ using System; using System.Diagnostics; class cube_root { private static uint cro32(uint x) { uint y = 4u, z = 16u, b = 61u << 21; if (x < 1u << 24) if (x < 1u << 12) if (x < 1u << 06) if (x < 1u << 03) return x == 0u ? 0u : 1u; else return x < 27u ? 2u : 3u; else if (x < 1u << 09) goto L8; else goto L7; else if (x < 1u << 18) if (x < 1u << 15) goto L6; else goto L5; else if (x < 1u << 21) goto L4; else goto L3; else if (x < 1u << 30) if (x < 1u << 27) goto L2; else { if (x >= 27u << 24) { x -= 27u << 24; z = 36u; y = 6u; b = 127u << 21; } else { x -= 1u << 27; } } else { if (x >= 27u << 27) { x -= 27u << 27; z = 144u; y = 12u; b = 469u << 21; } else { if (x >= 125u << 24) { x -= 125u << 24; z = 100u; y = 10u; b = 331u << 21; } else { x -= 1u << 30; z = 64u; y = 8u; b = 217u << 21; } } } goto M1; L2: if (x >= 27u << 21) { x -= 27u << 21; z = 36u; y = 6u; } else { x -= 1u << 24; } goto M2; L3: if (x >= 27u << 18) { x -= 27u << 18; z = 36u; y = 6u; } else { x -= 1u << 21; } goto M3; L4: if (x >= 27u << 15) { x -= 27u << 15; z = 36u; y = 6u; } else { x -= 1u << 18; } goto M4; L5: if (x >= 27u << 12) { x -= 27u << 12; z = 36u; y = 6u; } else { x -= 1u << 15; } goto M5; L6: if (x >= 27u << 09) { x -= 27u << 09; z = 36u; y = 6u; } else { x -= 1u << 12; } goto M6; L7: if (x >= 27u << 06) { x -= 27u << 06; z = 36u; y = 6u; } else { x -= 1u << 09; } goto M7; L8: if (x >= 27u << 03) { x -= 27u << 03; z = 36u; y = 6u; } else { x -= 1u << 06; } goto M8; M1: if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M2: b = (y + z) * 3 + 1u << 18; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M3: b = (y + z) * 3 + 1u << 15; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M4: b = (y + z) * 3 + 1u << 12; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M5: b = (y + z) * 3 + 1u << 09; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M6: b = (y + z) * 3 + 1u << 06; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M7: b = (y + z) * 3 + 1u << 03; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M8: return x <= (y + z) * 3 ? y : y + 1u; } private static uint[] cubes = new uint[2048]; private static void fill_cubes() { int i = 0; uint a = 0u, b = 1u, c = 6u; do { cubes[i++] = a; a += b; b += c; c += 6u; } while (i < 1626); do cubes[i++] = ~0u; while (i < 2048); } private static uint cro32t(uint x) { uint i = 1u << 10, j = 1u << 9, u = 1u << 30; for (; j > 0; j >>= 1) { if (x < u) i -= j; else if (x > u) i += j; else return i < 1625u ? i : 1625u; u = cubes[i]; } return x < u ? i - 1 : i; } private static Stopwatch sw = new Stopwatch(); static void Main(string[] args) { cro32_all(); // ~1 minute, comment out "cro32t" , uint[] cubes, etc. // Or try seperate executables. fill_cubes(); cro32t_all(); // ~2 minutes Console.ReadLine(); } private static void cro32_all() { uint x; double t; cro32(~0u); sw.Restart(); for (x = 0; x < ~0u; x++) cro32(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (x = 0; x < ~0u; x++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.Write(x + " roots in " + t + " ms, "); Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x); // 4294967295 roots in 50686 ms, mean time 11,80 ns } private static void cro32t_all() { uint x; double t; cro32t(~0u); sw.Restart(); for (x = 0; x < ~0u; x++) cro32t(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (x = 0; x < ~0u; x++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.Write(x + " roots in " + t + " ms, "); Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x); // 4294967295 roots in 129254 ms, mean time 30,09 ns } }
2013/10/19
Uniform pseudo random numbers in a range
/* Random number generators (RNG) are divided in two categories, hardware RNG, that provide “true” random numbers, and algorithmic RNG, that generate pseudo random numbers (PRNG). Both types usually generate random numbers "X" as independent uniform samples in a range 0, . . . 2^b − 1, with b = 8, 16, 32 or b = 64. In applications, it is instead sometimes desirable to draw random numbers as independent uniform samples "U" in a range " 1, . . . M ", where moreover "M" may change between drawings. Transforming "X" to "U" , or many "X" to many "U" is sometimes known as scaling. Some text is copied from other sources. You find them in "References". I found it quite hard to explain some of the underlying ideas. I am grateful to the original authors. Their words saved me a lot of time. Limits: 0 <= M <= 2^32 - 1 = uint.MaxValue 0 <= U <= M UPDATE: part 2 Example:
Suppose a PRNG produces one random bit each time it is called, and we have: M = 5 , binary: M = 101. Call the generator three times to get three successive bits. Result: 0 <= U <= 7, binary: 000 <= U <= 111. If "U > M" , than we take the next three bits from the generator, and repeat to do so until "U <= M". Another way to find a valid value for "U". -Take the first, the most significant, bit from the generator. If it's zero, take two other bits from the generator, and return a valid value for "U": 000 or 001 or 010 or 011. If the first bit isn't zero, so it's one, take the second bit from the generator. If the second bit is zero, take the third bit from the generator, and return a valid value for "U": 100 or 101. Finally: the first and the second bit are both set, no need to take a third bit from the generator, a random bit is saved, we have to try again. -Take the first, the most significant, bit from the generator... / U = 0.. return 000 or 001 or 010 or 011 / ---< U = 10. return 100 or 101 | \ | \ U = 11 --try again-- | | | | ----<<--------<<--------<<---- The PRNG used: .Net 4.0 System.Random It might have a bug: The period is not guaranteed to be longer than: 2^55-1 (I do have some doubts, but no proof) Some important properties of System.Random: Random Constructors: Random() Initializes a new instance of the Random class, using a time-dependent value. Random(int seed) Initializes a new instance of the Random class, using the specified seed. To improve performance, create one Random object to generate many random numbers over time, instead of repeatedly creating new Random objects to generate one random number, so: private static Random rand = new Random() Random Methods: rand.Next() Returns a positive random number, greater than or equal to zero, less than int.MaxValue rand.Next(int max) Returns a positive random number, greater than or equal to zero, less than max ( max >= 0 ) , ........ rand.Next(0) returns zero rand.Next(int min, int max) Returns a positive random number, greater than or equal to min, less than max ( max >= min ) , ...... rand.Next() is about twice as fast as rand.Next(int max) , rand.Next(int max) is almost as fast as rand.Next(int min, int max), especially when you use something like: return min + rand.Next(max - min) It's impossible to create more than 30 random bits at once. To create those 30 bits, use: rand.Next(1 << 30) Various sources warn about the lower bits of numbers generated by PRNG's, they shouldn't be random. One way to examine a PRNG is to create a visualisation of the numbers it produces. Humans are really good at spotting patterns, and visualisation allows you to use your eyes and brain directly for this purpose. While you shouldn't consider this type of approach an exhaustive or formal analysis, it is a nice and quick way to get a rough impression of a given generator's performance. The background of the image at the top looks pretty random, no recognizable patterns. It was created with the code at the very bottom: "random_bitmap()" Description of the code: gen_nd_bits(int nd) returns the needed amount of random bits. It uses a 32 bits buffer. The buffer is replenished when necessary. gen_uint_fast(uint u) returns a random value between zero (incl.) and u (incl.) It calls gen_nd_bits(int nd) . When the value is too large another call is done. gen_uint_eff(uint u) uses random bits more efficiently. It works bit by bit, from most to least significant bit. When it becomes clear that the (partial) formed value is smaller than the upper limit ( uint u ) , remaining needed random bits are added relatively fast. When the (partial) formed value is too large another call is done. On my trustworthy PC the average (full range) results are: |-----------------------|---------------| | function | * 10^9 bits/s | |-----------------------|---------------| | rand.Next() | 2.9 | <-- 1 CD in 2 seconds! | rand.Next(1 << 30) | 1.5 | | gen_nd_bits(int nd) | 1.2 | | gen_uint_fast(uint u) | 0.9 | | gen_uint_eff(uint u) | 0.7 | |-----------------------|---------------| The efficiency, random bits used per bit: gen_uint_fast(uint u) 1.386 bits/bit, remarkably close to ln(4) gen_uint_eff(uint u) 1.044 bits/bit References: José's Daylight Dices RANDOM.ORG Ask Dr. Math, Probability and Random Numbers Bit recycling for scaling random number generators Math.NET Numerics MSDN Bug ? System.Random (january 2011) */ using System; using System.Drawing; // add a ref! class scaled_random { private static Random rand = new Random(); private static uint buf = 0; // random bits buffer private static int av = 0; // available bits in buf private static uint g_u = 0; // copy of u private static int g_nd = 0; // copy of nd (needed) = highest set bit of u private static uint gen_nd_bits(int nd) // generate nd bits, 0 <= nd <= 32 { uint x; if (nd == 0) return 0; if (av <= 2) // buffer should contain at least 2 bits, other- { // wise there may be too few bits when nd > 30 buf <<= 30; buf |= (uint)rand.Next(1 << 30); av += 30; } if (av < nd) { x = buf; buf = (uint)rand.Next(1 << 30); nd -= av; x <<= nd; x |= buf << 32 - nd >> 32 - nd; buf >>= nd; av = 30 - nd; } else { x = buf << 32 - nd >> 32 - nd; buf >>= nd; av -= nd; } return x; } private static uint gen_uint_fast(uint u) // 0 <= returned value <= u { uint x; if (u == 0) return 0; if (u != g_u) // if u changes, g_nd might change { g_u = u; g_nd = nob(u); // needed: number of bits of u } L0: x = gen_nd_bits(g_nd); if (x <= u) return x; goto L0; } private static bool ones = false; // true if u is (binary) 1, 11, 111, 1111, ... private static uint gen_uint_eff(uint u) // 0 <= returned value <= u { uint x; int nd; if (u == 0) return 0; if (u != g_u) { g_u = u; g_nd = nob(u); ones = (u + 1 & u) == 0; } if (ones) return gen_nd_bits(g_nd); L0: nd = g_nd - 1; if (gen_nd_bits(1) == 0) return gen_nd_bits(nd); x = 1u << nd; nd--; while (nd > 0) { x |= gen_nd_bits(1) << nd; if (x >> nd < u >> nd) return x | gen_nd_bits(nd); if (x > u) goto L0; nd--; } x |= gen_nd_bits(1); if (x <= u) return x; goto L0; } private static int nob(uint u) // number of bits, position of the highest set bit, u > 0 { // afaik fastest way in C# return u < 1u << 16 ? u < 1u << 08 ? u < 1u << 04 ? u < 1u << 02 ? u < 1u << 01 ? 01 : 02 : u < 1u << 03 ? 03 : 04 : u < 1u << 06 ? u < 1u << 05 ? 05 : 06 : u < 1u << 07 ? 07 : 08 : u < 1u << 12 ? u < 1u << 10 ? u < 1u << 09 ? 09 : 10 : u < 1u << 11 ? 11 : 12 : u < 1u << 14 ? u < 1u << 13 ? 13 : 14 : u < 1u << 15 ? 15 : 16 : u < 1u << 24 ? u < 1u << 20 ? u < 1u << 18 ? u < 1u << 17 ? 17 : 18 : u < 1u << 19 ? 19 : 20 : u < 1u << 22 ? u < 1u << 21 ? 21 : 22 : u < 1u << 23 ? 23 : 24 : u < 1u << 28 ? u < 1u << 26 ? u < 1u << 25 ? 25 : 26 : u < 1u << 27 ? 27 : 28 : u < 1u << 30 ? u < 1u << 29 ? 29 : 30 : u < 1u << 31 ? 31 : 32; } static void Main() { test_gen_uint_fast(17); test_gen_uint_eff(54); test_4_buckets_fast(123456789); test_4_buckets_eff(~0u); random_bitmap(); } private static void test_gen_uint_fast(uint u) { int i; int[] c = new int[u + 1]; for (i = 0; i < 1000000; i++) c[gen_uint_fast(u)]++; for (i = 0; i <= u; i++) Console.WriteLine("{0,3}{1,7}", i, c[i]); Console.ReadLine(); Console.Clear(); } private static void test_gen_uint_eff(uint u) { int i; int[] c = new int[u + 1]; for (i = 0; i < 1000000; i++) c[gen_uint_eff(u)]++; for (i = 0; i <= u; i++) Console.WriteLine("{0,3}{1,7}", i, c[i]); Console.ReadLine(); Console.Clear(); } private static void test_4_buckets_fast(uint u) // rounding errors! { uint x; int i, c0 = 0, c1 = 0, c2 = 0, c3 = 0; for (i = 0; i < 1000000; i++) { x = gen_uint_fast(u); if (x < u / 4) c0++; else if (x < u / 2) c1++; else if (x < u / 2 + u / 4) c2++; else c3++; } Console.WriteLine(c0); Console.WriteLine(c1); Console.WriteLine(c2); Console.WriteLine(c3); Console.ReadLine(); Console.Clear(); } private static void test_4_buckets_eff(uint u) { uint x; int i, c0 = 0, c1 = 0, c2 = 0, c3 = 0; for (i = 0; i < 1000000; i++) { x = gen_uint_eff(u); if (x < u / 4) c0++; else if (x < u / 2) c1++; else if (x < u / 2 + u / 4) c2++; else c3++; } Console.WriteLine(c0); Console.WriteLine(c1); Console.WriteLine(c2); Console.WriteLine(c3); Console.ReadLine(); Console.Clear(); } private static void random_bitmap() { int xmax = 900; int ymax = 900; Bitmap bmp = new Bitmap(xmax + 1, ymax + 1); int x, y, z; for (x = 0; x <= xmax; x++) { for (y = 0; y <= ymax; y++) { z = 7 & rand.Next(1 << 30); switch (z) { case 0: bmp.SetPixel(x, y, Color.Yellow); break; case 1: bmp.SetPixel(x, y, Color.Orange); break; case 2: bmp.SetPixel(x, y, Color.Purple); break; case 3: bmp.SetPixel(x, y, Color.White); break; case 4: bmp.SetPixel(x, y, Color.Green); break; case 5: bmp.SetPixel(x, y, Color.Black); break; case 6: bmp.SetPixel(x, y, Color.Blue); break; case 7: bmp.SetPixel(x, y, Color.Red); break; } } } bmp.Save(xmax.ToString() + ".bmp"); } }
2013/09/26
Fast Cube Root ( < 2^32 )
/* ** The time to take a cube root from a 32 bits uint was: < 26 ns (mean: < 22 ns) new time: < 14 ns (mean: < 13 ns) The trick used for small square roots is used again: "unroll the loop". First step: A binary search, the "if" statements, large values ( >= 1u << 24 ) are found relatively faster, small values ( < 64 ) are handled during the search. Second step: A jump into the unrolled loop, the "labels". |--------------------------------| | cro32(x) | |-----|-----||------------|------| | x | ns || x | ns | |-----|-----||------------|------| | 0 | 2,4 || 511 | 4,6 | | 1 | 2,3 || 1023 | 5,3 | | 3 | 2,3 || 2047 | 5,3 | | 7 | 2,3 || 4095 | 5,0 | | 15 | 3,6 || 32768 | 7,2 | | 31 | 3,0 || 1048576 | 8,5 | | 63 | 3,0 || 268435456 | 12,0 | | 64 | 5,0 || 536870912 | 12,3 | heaviest case, | 65 | 5,0 || 1070599167 | 13,7 | <== 1070599167 is the | 100 | 5,0 || 1070599168 | 13,7 | 3th power of 1023, | 127 | 5,0 || 1073741824 | 12,3 | binary: 1111111111 | 128 | 5,0 || 1073774592 | 12,0 | | 255 | 4,6 || 2147483648 | 12,3 | | 256 | 4,6 || 4294967295 | 12,0 | |-----|-----||------------|------| 4294967295 roots in 53884 ms, mean time 12,55 ns, faster than square root! UPDATE 2013/11/04 50686 ms, mean time 11,80 ns, see: part 2 ** Original version: Figure 2: Architecture of the iteration (alternative A) A digit-by-digit algorithm for radix-2 cube root and its implementation (2004) J.-A.Pineiro, J.D.Bruguera, L.Ciminiera, P.Montuschi */ using System; using System.Diagnostics; class cro_32 { private static uint cro32(uint x) { uint y = 4u, z = 16u, b = 0u; if (x < 1u << 24) if (x < 1u << 12) if (x < 1u << 06) if (x < 1u << 03) return x == 0u ? 0u : 1u; else return x < 27u ? 2u : 3u; else if (x < 1u << 09) goto L8; else goto L7; else if (x < 1u << 18) if (x < 1u << 15) goto L6; else goto L5; else if (x < 1u << 21) goto L4; else goto L3; else if (x < 1u << 30) if (x < 1u << 27) goto L2; else goto L1; if (x >= 27u << 27) { x -= 27u << 27; z = 36u; y = 6u; } else { x -= 1u << 30; } goto M0; L1: if (x >= 27u << 24) { x -= 27u << 24; z = 36u; y = 6u; } else { x -= 1u << 27; } goto M1; L2: if (x >= 27u << 21) { x -= 27u << 21; z = 36u; y = 6u; } else { x -= 1u << 24; } goto M2; L3: if (x >= 27u << 18) { x -= 27u << 18; z = 36u; y = 6u; } else { x -= 1u << 21; } goto M3; L4: if (x >= 27u << 15) { x -= 27u << 15; z = 36u; y = 6u; } else { x -= 1u << 18; } goto M4; L5: if (x >= 27u << 12) { x -= 27u << 12; z = 36u; y = 6u; } else { x -= 1u << 15; } goto M5; L6: if (x >= 27u << 09) { x -= 27u << 09; z = 36u; y = 6u; } else { x -= 1u << 12; } goto M6; L7: if (x >= 27u << 06) { x -= 27u << 06; z = 36u; y = 6u; } else { x -= 1u << 09; } goto M7; L8: if (x >= 27u << 03) { x -= 27u << 03; z = 36u; y = 6u; } else { x -= 1u << 06; } goto M8; M0: if (x >= 61u << 24) { x -= 61u << 24; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M1: b = (y + z) * 3 + 1u << 21; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M2: b = (y + z) * 3 + 1u << 18; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M3: b = (y + z) * 3 + 1u << 15; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M4: b = (y + z) * 3 + 1u << 12; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M5: b = (y + z) * 3 + 1u << 09; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M6: b = (y + z) * 3 + 1u << 06; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M7: b = (y + z) * 3 + 1u << 03; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M8: return x <= (y + z) * 3 ? y : y + 1u; } private static Stopwatch sw = new Stopwatch(); static void Main() { cro32(225); cro32(1070599167); check_perfect_cubes32(); time_cro32(); cro32_all(); Console.ReadLine(); } private static void check_perfect_cubes32() { uint n = 0, a = 0, b = 1, c = 6; while (n <= 1625) { if (n != cro32(a)) Console.WriteLine("WRONG"); a += b; b += c; c += 6; n++; } Console.WriteLine("CHECKED PERFECT CUBES"); } private static void time_cro32() { int i, j; uint x; double t; uint[] a = { 0, 1, 3, 7, 15, 31, 63, 64, 65, 100, 127, 128, 255, 256, 511, 1023, 2047, 4095, 1u << 15, 1u << 20, 1u << 28 ,1u << 29 , 1070599167, 1070599168,1u << 30,(1u << 30) + (1u << 15), 1u << 31, ~0u }; Console.WriteLine("|------------|------|"); Console.WriteLine("| x | ns |"); Console.WriteLine("|------------|------|"); for (j = 0; j < a.Length; j++) { x = a[j]; cro32(x); sw.Restart(); for (i = 0; i < 10000000; i++) cro32(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (i = 0; i < 10000000; i++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.WriteLine("| {0,10} | {1:00.0} |", x, t / 10); } Console.WriteLine("|------------|------|"); Console.WriteLine(); } private static void cro32_all() { uint x; double t; cro32(~0u); sw.Restart(); for (x = 0; x < ~0u; x++) cro32(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (x = 0; x < ~0u; x++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.Write(x + " roots in " + t + " ms, "); Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x); // 4294967295 roots in 53884 ms, mean time 12,55 ns } }
2013/09/02
Fast Square Root ( < 2^32 )
/* The time to take a square root from a 32 bits uint was: ~35 ns, new time: <16 ns (mean: <14 ns). It's based on: A square root hardware algorithm from Guy L. Steele,Jr. The slightly optimized C# version below is nearly twice as fast. A few "goto Label" statements are used, they are fairly easy to remove, but I doubt it will be any faster (at all). |--------------------------------| | Times "cro32(x)" | | Athlon X4, XP, 2 GB | |-----|-----||------------|------| | x | ns || x | ns | |-----|-----||------------|------| | 0 | 1,6 || 511 | 4,3 | | 1 | 1,6 || 1023 | 4,6 | | 3 | 1,6 || 2047 | 5,0 | | 7 | 2,3 || 4095 | 5,6 | 4294967295 roots in 56962 ms, | 15 | 2,3 || 32768 | 6,6 | mean time 13,26 ns. | 31 | 3,6 || 1048576 | 10,0 | | 63 | 3,0 || 268435456 | 13,7 | | 64 | 4,3 || 536870912 | 11,4 | | 65 | 4,3 || 1070599167 | 13,0 | | 100 | 5,0 || 1073741824 | 15,1 | | 127 | 4,3 || 1073741824 | 15,1 | | 128 | 4,3 || 1073774592 | 15,1 | | 255 | 4,0 || 2147483648 | 13,1 | | 256 | 4,7 || 4294967295 | 15,7 | |-----|-----||------------|------| */ using System; using System.Diagnostics; class Square_Root { private static uint sro32(uint x) { uint y, z; if (x < 1u << 16) if (x < 1u << 08) if (x < 1u << 04) return x < 1u << 02 ? x + 3u >> 2 : x + 15u >> 3; else if (x < 1u << 06) { y = 1u << 03; x -= 1u << 04; if (x >= 5u << 02) { x -= 5u << 02; y |= 1u << 02; } goto L0; } else { y = 1u << 05; x -= 1u << 06; if (x >= 5u << 04) { x -= 5u << 04; y |= 1u << 04; } goto L1; } else // slower (on my pc): .... y = 3u << 04; } goto L1; } if (x < 1u << 12) if (x < 1u << 10) { y = 1u << 07; x -= 1u << 08; if (x >= 5u << 06) { x -= 5u << 06; y |= 1u << 06; } goto L2; } else { y = 1u << 09; x -= 1u << 10; if (x >= 5u << 08) { x -= 5u << 08; y |= 1u << 08; } goto L3; } else if (x < 1u << 14) { y = 1u << 11; x -= 1u << 12; if (x >= 5u << 10) { x -= 5u << 10; y |= 1u << 10; } goto L4; } else { y = 1u << 13; x -= 1u << 14; if (x >= 5u << 12) { x -= 5u << 12; y |= 1u << 12; } goto L5; } else if (x < 1u << 24) if (x < 1u << 20) if (x < 1u << 18) { y = 1u << 15; x -= 1u << 16; if (x >= 5u << 14) { x -= 5u << 14; y |= 1u << 14; } goto L6; } else { y = 1u << 17; x -= 1u << 18; if (x >= 5u << 16) { x -= 5u << 16; y |= 1u << 16; } goto L7; } else if (x < 1u << 22) { y = 1u << 19; x -= 1u << 20; if (x >= 5u << 18) { x -= 5u << 18; y |= 1u << 18; } goto L8; } else { y = 1u << 21; x -= 1u << 22; if (x >= 5u << 20) { x -= 5u << 20; y |= 1u << 20; } goto L9; } else if (x < 1u << 28) if (x < 1u << 26) { y = 1u << 23; x -= 1u << 24; if (x >= 5u << 22) { x -= 5u << 22; y |= 1u << 22; } goto La; } else { y = 1u << 25; x -= 1u << 26; if (x >= 5u << 24) { x -= 5u << 24; y |= 1u << 24; } goto Lb; } else if (x < 1u << 30) { y = 1u << 27; x -= 1u << 28; if (x >= 5u << 26) { x -= 5u << 26; y |= 1u << 26; } goto Lc; } else { y = 1u << 29; x -= 1u << 30; if (x >= 5u << 28) { x -= 5u << 28; y |= 1u << 28; } } z = y | 1u << 26; y /= 2; if (x >= z) { x -= z; y |= 1u << 26; } Lc: z = y | 1u << 24; y /= 2; if (x >= z) { x -= z; y |= 1u << 24; } Lb: z = y | 1u << 22; y /= 2; if (x >= z) { x -= z; y |= 1u << 22; } La: z = y | 1u << 20; y /= 2; if (x >= z) { x -= z; y |= 1u << 20; } L9: z = y | 1u << 18; y /= 2; if (x >= z) { x -= z; y |= 1u << 18; } L8: z = y | 1u << 16; y /= 2; if (x >= z) { x -= z; y |= 1u << 16; } L7: z = y | 1u << 14; y /= 2; if (x >= z) { x -= z; y |= 1u << 14; } L6: z = y | 1u << 12; y /= 2; if (x >= z) { x -= z; y |= 1u << 12; } L5: z = y | 1u << 10; y /= 2; if (x >= z) { x -= z; y |= 1u << 10; } L4: z = y | 1u << 08; y /= 2; if (x >= z) { x -= z; y |= 1u << 08; } L3: z = y | 1u << 06; y /= 2; if (x >= z) { x -= z; y |= 1u << 06; } L2: z = y | 1u << 04; y /= 2; if (x >= z) { x -= z; y |= 1u << 04; } L1: z = y | 1u << 02; y /= 2; if (x >= z) { x -= z; y |= 1u << 02; } L0: return x > y ? y / 2 | 1u : y / 2; } private static Stopwatch sw = new Stopwatch(); static void Main() { Console.WriteLine(sro32(4)); Console.WriteLine(sro32(40)); Console.WriteLine(sro32(400)); Console.WriteLine(sro32(4000)); Console.WriteLine(sro32(40000)); Console.WriteLine(sro32(400000)); Console.WriteLine(sro32(4000000)); Console.WriteLine(sro32(40000000)); Console.WriteLine(sro32(400000000)); Console.WriteLine(sro32(4000000000)); test_perfect_squares(); time_sro32(); sro32_all(); // ~1 minute Console.ReadLine(); } private static void test_perfect_squares() { uint n = 0, a = 0, b = 1; do { if (sro32(a) != n) Console.WriteLine("WRONG"); a += b; b += 2; } while (n++ < 65535); Console.WriteLine("Perfect squares tested"); } private static void time_sro32() { int i, j; uint x; double t; //uint[] a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; uint[] a = { 0, 1, 3, 7, 15, 31, 63, 64, 65, 100, 127, 128, 255, 256, 511, 1023, 2047, 4095, 1u << 15, 1u << 20, 1u << 28 ,1u << 29 , 1070599167, 1u << 30,1u << 30,(1u << 30) + (1u << 15), 1u << 31, 0xfffe0001u, ~0u }; Console.WriteLine("|------------|------|"); Console.WriteLine("| x | ns |"); Console.WriteLine("|------------|------|"); for (j = 0; j < a.Length; j++) { x = a[j]; sro32(x); sw.Restart(); for (i = 0; i < 10000000; i++) sro32(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (i = 0; i < 10000000; i++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.WriteLine("| {0,10} | {1:00.0} |", x, t / 10); } Console.WriteLine("|------------|------|"); Console.WriteLine(); } private static void sro32_all() { uint x; double t; sro32(~0u); sw.Restart(); for (x = 0; x < ~0u; x++) sro32(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (x = 0; x < ~0u; x++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.Write(x + " roots in " + t + " ms, "); Console.WriteLine("mean time {0:.00} ns", (double)(t) * 1000000 / x); // 4294967295 roots in 56962 ms, mean time 13,26 ns } }
//------------------------------------------------------------ // Another one, slower, the line "t = ..." is done many times. // // private static uint sro32s(uint x) // uint y1 = 0, y2 = 0, t; int k = 15; // for (; k >= 0; k--) // { // t = y2 + (y1 << k + 1) | (1u << k * 2); // if (x >= t) { y2 = t; y1 |= 1u << k; } // } // return y1; //-----------------------------------------------------------
Subscribe to:
Posts (Atom)




