/* Source code of System.Random is available at: CodePlex/Random.cs System.Random is fast, basically a new random number is generated by a subtraction of two numbers. A straightforward copy of the source code is two times slower compared to using rand.Next directly. A way to speed it up: the codeplex code uses a multiplication by a double type. The multiplication can be replaced by a shift and an "if" statement when 30 random bits are needed, very occasionally an addition "+ 1" (and an "& 1") are necessary, that's what "Next30()" does in, voila: Fast Random Range (CodePlex) Other improvements over the previous version: -a 64 bits buffer, instead of a 32 bits one. Operations like "uint + uint" are as fast as "ulong + uint", same for: " - , ^ , | , << , >> " -Random numbers were computed bit by bit, now the bits are taken at once, some bit twiddling recycles unused random bits. Finally: A probably very true scaled (pseudo) random number generator, based on the .Net4 RNGCryptoServiceProvider class: ".. rng = new RNGCryptoServiceProvider". The "rng.Getbytes(..)" method fills a byte array, one byte as fast as 40 bytes, 40 bytes seems to be an optimum. "Buffer.BlockCopy" copies the byte array to a uint array. The uint array is used to fill a 64 random bits buffer, etc. "uint x = g_uint_c(uint u)" gives a random value for x, such that: 0 <= x <= u <= uint.MaxValue Results: 10^7 times rand.Next() takes: 107 ms (~11 ns per random number) 10^7 times rand.Next(int someValue) takes: 236 ms (~24 ns ...) |--------|-------------------------| rand.Next() 11 ns | | ms 10^7 * | rand.Next(i) 24 ns | |-----------|-------------| gen_uint(u) 50 ns, mean ~35 ns | u |gen_uint(u)| g_uint_c(u) | g_uint_c(u) 280 ns, mean ~250 ns |--------|-----|-----|------|------| | 0 | 24 | | 24 | | | 1 | | 73 | | 139 | | 2 | 124 | | 296 | | | 3 | | 78 | | 211 | | 4 | 242 | | 518 | | | 7 | | 83 | | 282 | | 8 | 290 | | 680 | | | 2^7-1 | | 102 | | 562 | | 2^7-0 | 380 | | 1079 | | | 2^15-1 | | 130 | | 1123 | | 2^15-0 | 414 | | 1634 | | | 2^30-1 | | 193 | | 2164 | | 2^30-0 | 494 | | 2672 | | | 2^31-1 | | 210 | | 2251 | | 2^31-0 | 461 | | 2747 | | | 2^32-1 | | 211 | | 2307 | |--------|-----|-----|------|------| http://www.cacert.at/cgi-bin/rngresults find: "P_P" , hover mouse over "Net4 System.Random" for the code. Random ~ gen_uint(u) CSP ~ g_uint_c(u) |--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------| | Gene- | Speed | Upload | Entropy | Birthday | Matrix | 6*8 Ma- | Minimum | Random | Squeeze | Overlap- | | rator | (MiB/s) | Size | (->8) | Spacing | Ranks | trix R | Distance | Spheres | | ping Sums| |--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------| | Random | 370 | 17 MiB | 7.999989 | 0.000000 | 0.392 | 0.136 | 0.063920 | 0.905097 | 0.098440 | 0.005467 | | CSP | 34 | 15 MiB | 7.999987 | 0.032102 | 0.084 | 0.942 | 0.894277 | 0.036221 | 0.706538 | 0.023523 | |--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------| */ using System; using System.Diagnostics; using System.Security.Cryptography; class scaled_random_uint { private static ulong buf = 0; // 64 random bits buffer private static int av = 0; // available bits in buf private static uint g_u = 0; // copy of u private static int g_nd = 0; // nd (needed) bits = highest set bit of u private static int g_s = 0; // shift private static bool ones = false; // true if u = 10, 110, 1110, ... private static uint g_uint_c(uint u) { uint x; int s; if (u == 0) return 0; if (u != g_u) { g_u = u; g_nd = nob(u); g_s = 64 - g_nd; ones = (u + 1 & u + 2) == 0; } L0: if (av < g_nd) { av += 32; buf <<= 32; buf |= next32(); } x = (uint)buf << g_s >> g_s; buf >>= g_nd; av -= g_nd; if (x <= u) return x; if (ones) goto L0; // faster when u = 2, 6, 14, ... s = nob(x ^ u) - 1; buf <<= s; buf ^= x; av += s; goto L0; } private static RNGCryptoServiceProvider rng = new RNGCryptoServiceProvider(); private static byte[] r08 = new byte[40]; // ... [400] ... ~15% faster private static uint[] r32 = new uint[10]; // ... [100] ... ,, ,, private static int i_r32 = 10; // ... = 100 ... ,, ,, private static uint next32() { if (i_r32 != 10) return r32[i_r32++]; // ... != 100... ,, ,, i_r32 = 1; rng.GetBytes(r08); Buffer.BlockCopy(r08, 0, r32, 0, 40); // ... , 400)... ,, ,, return r32[0]; } private static int nob(uint u) // number of bits, position of the highest set bit, u > 0 { return u < 1u << 16 ? u < 1u << 08 ? u < 1u << 04 ? u < 1u << 02 ? u < 1u << 01 ? 01 : 02 : u < 1u << 03 ? 03 : 04 : u < 1u << 06 ? u < 1u << 05 ? 05 : 06 : u < 1u << 07 ? 07 : 08 : u < 1u << 12 ? u < 1u << 10 ? u < 1u << 09 ? 09 : 10 : u < 1u << 11 ? 11 : 12 : u < 1u << 14 ? u < 1u << 13 ? 13 : 14 : u < 1u << 15 ? 15 : 16 : u < 1u << 24 ? u < 1u << 20 ? u < 1u << 18 ? u < 1u << 17 ? 17 : 18 : u < 1u << 19 ? 19 : 20 : u < 1u << 22 ? u < 1u << 21 ? 21 : 22 : u < 1u << 23 ? 23 : 24 : u < 1u << 28 ? u < 1u << 26 ? u < 1u << 25 ? 25 : 26 : u < 1u << 27 ? 27 : 28 : u < 1u << 30 ? u < 1u << 29 ? 29 : 30 : u < 1u << 31 ? 31 : 32; } private static Stopwatch sw = new Stopwatch(); static void Main() { speed((1u << 31) - 0); Console.ReadLine(); } private static void speed(uint u) { int i, j = 10000000; double t; sw.Restart(); for (i = 0; i < j; i++) g_uint_c(u); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (i = 0; i < j; i++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.WriteLine("u: " + u); Console.WriteLine("gen_uint(u)"); Console.WriteLine(" ms: " + t); Console.WriteLine(" bits: " + (double)(nob(u)) * j); Console.WriteLine(" bits/s: {0:.000}", (double)((nob(u)) * j) / t / 1000000); Console.WriteLine(); } }
C# , System.Numerics, Multiplication, Karatsuba, Toom Cook, Division, Burnikel, Ziegler, Factorial, Luschny, Square Root, Zimmermann, Choose, Binomial Coefficient, Permutation, Combination, Eratosthenes, Primes, Fibonacci, Lucas, Pell, Catalan, Fast Random Number Generator, Overton
2013/11/17
Uniform random numbers in a range, part 2
2013/11/04
Fast Cube Root ( < 2^32 ), part 2
/* The mean time to take a cube root from a 32 bits uint was: < 13 ns new mean time: < 12 ns It is a bit faster, especially for large values, because variables are initialized properly. |--------------------------------| | cro32(x) | |-----|-----||------------|------| | x | ns || x | ns | |-----|-----||------------|------| | 0 | 3,0 || 511 | 4,6 | | 1 | 2,3 || 1023 | 5,3 | | 3 | 2,3 || 2047 | 5,6 | ___ | 7 | 2,3 || 4095 | 5,0 | \3 / | 15 | 3,3 || 32768 | 7,6 | \/ x = y | 31 | 2,6 || 1048576 | 9,0 | | 63 | 2,6 || 268435456 | 11,0 | Input: 0 <= x < 2^32 | 64 | 5,0 || 536870912 | 11,3 | Output: y, such that y^3 <= x < (y+1)^3 | 65 | 5,0 || 1070599167 | 13,0 | | 100 | 5,0 || 1070599168 | 13,0 | | 127 | 5,0 || 1073741824 | 11,3 | | 128 | 5,3 || 1073774592 | 11,3 | | 255 | 4,6 || 2147483648 | 11,0 | | 256 | 4,6 || 4294967295 | 11,7 | |-----|-----||------------|------| 4294967295 roots in 50686 ms, mean time 11,80 ns (before 12,55 ns) Another way to find roots is using a look-up table. Below "fill_cubes" fills a "cubes" array, "cro32t" does a binary search for a root. Mean time: 30 ns. I have not tried to optimize it further, I see no reason why it should be much faster. */ using System; using System.Diagnostics; class cube_root { private static uint cro32(uint x) { uint y = 4u, z = 16u, b = 61u << 21; if (x < 1u << 24) if (x < 1u << 12) if (x < 1u << 06) if (x < 1u << 03) return x == 0u ? 0u : 1u; else return x < 27u ? 2u : 3u; else if (x < 1u << 09) goto L8; else goto L7; else if (x < 1u << 18) if (x < 1u << 15) goto L6; else goto L5; else if (x < 1u << 21) goto L4; else goto L3; else if (x < 1u << 30) if (x < 1u << 27) goto L2; else { if (x >= 27u << 24) { x -= 27u << 24; z = 36u; y = 6u; b = 127u << 21; } else { x -= 1u << 27; } } else { if (x >= 27u << 27) { x -= 27u << 27; z = 144u; y = 12u; b = 469u << 21; } else { if (x >= 125u << 24) { x -= 125u << 24; z = 100u; y = 10u; b = 331u << 21; } else { x -= 1u << 30; z = 64u; y = 8u; b = 217u << 21; } } } goto M1; L2: if (x >= 27u << 21) { x -= 27u << 21; z = 36u; y = 6u; } else { x -= 1u << 24; } goto M2; L3: if (x >= 27u << 18) { x -= 27u << 18; z = 36u; y = 6u; } else { x -= 1u << 21; } goto M3; L4: if (x >= 27u << 15) { x -= 27u << 15; z = 36u; y = 6u; } else { x -= 1u << 18; } goto M4; L5: if (x >= 27u << 12) { x -= 27u << 12; z = 36u; y = 6u; } else { x -= 1u << 15; } goto M5; L6: if (x >= 27u << 09) { x -= 27u << 09; z = 36u; y = 6u; } else { x -= 1u << 12; } goto M6; L7: if (x >= 27u << 06) { x -= 27u << 06; z = 36u; y = 6u; } else { x -= 1u << 09; } goto M7; L8: if (x >= 27u << 03) { x -= 27u << 03; z = 36u; y = 6u; } else { x -= 1u << 06; } goto M8; M1: if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M2: b = (y + z) * 3 + 1u << 18; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M3: b = (y + z) * 3 + 1u << 15; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M4: b = (y + z) * 3 + 1u << 12; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M5: b = (y + z) * 3 + 1u << 09; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M6: b = (y + z) * 3 + 1u << 06; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M7: b = (y + z) * 3 + 1u << 03; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4; M8: return x <= (y + z) * 3 ? y : y + 1u; } private static uint[] cubes = new uint[2048]; private static void fill_cubes() { int i = 0; uint a = 0u, b = 1u, c = 6u; do { cubes[i++] = a; a += b; b += c; c += 6u; } while (i < 1626); do cubes[i++] = ~0u; while (i < 2048); } private static uint cro32t(uint x) { uint i = 1u << 10, j = 1u << 9, u = 1u << 30; for (; j > 0; j >>= 1) { if (x < u) i -= j; else if (x > u) i += j; else return i < 1625u ? i : 1625u; u = cubes[i]; } return x < u ? i - 1 : i; } private static Stopwatch sw = new Stopwatch(); static void Main(string[] args) { cro32_all(); // ~1 minute, comment out "cro32t" , uint[] cubes, etc. // Or try seperate executables. fill_cubes(); cro32t_all(); // ~2 minutes Console.ReadLine(); } private static void cro32_all() { uint x; double t; cro32(~0u); sw.Restart(); for (x = 0; x < ~0u; x++) cro32(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (x = 0; x < ~0u; x++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.Write(x + " roots in " + t + " ms, "); Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x); // 4294967295 roots in 50686 ms, mean time 11,80 ns } private static void cro32t_all() { uint x; double t; cro32t(~0u); sw.Restart(); for (x = 0; x < ~0u; x++) cro32t(x); sw.Stop(); t = sw.ElapsedMilliseconds; sw.Restart(); for (x = 0; x < ~0u; x++) ; // nada sw.Stop(); t -= sw.ElapsedMilliseconds; Console.Write(x + " roots in " + t + " ms, "); Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x); // 4294967295 roots in 129254 ms, mean time 30,09 ns } }
Subscribe to:
Posts (Atom)