Pagina's

2013/11/17

Uniform random numbers in a range, part 2

/*

Source code of System.Random is available at: CodePlex/Random.cs

System.Random is fast, basically a new random number is generated by a subtraction
of two numbers. A straightforward copy of the source code is two times slower 
compared to using rand.Next directly.
A way to speed it up: the codeplex code uses a multiplication by a double type. 
The multiplication can be replaced by a shift  and an "if" statement when 30 
random bits are needed, very occasionally an addition  "+ 1" (and an "& 1") are
necessary, that's what "Next30()" does in, voila:

    Fast Random Range (CodePlex)

Other improvements over the previous version:
-a 64 bits buffer, instead of a 32 bits one.
 Operations like "uint + uint" are as fast as "ulong + uint", 
 same for: " -  , ^ , | , << , >> " 
-Random numbers were computed bit by bit, now the bits are taken at once,
 some bit twiddling recycles unused random bits. 

Finally: A probably very true scaled (pseudo) random number generator, 
based on the .Net4 RNGCryptoServiceProvider class:
".. rng = new  RNGCryptoServiceProvider".
The "rng.Getbytes(..)" method fills a byte array, 
one byte as fast as 40 bytes, 40 bytes seems to be an optimum.
"Buffer.BlockCopy" copies the byte array to a uint array.
The uint array is used to fill a 64 random bits buffer, etc.
"uint x = g_uint_c(uint u)" gives a random value for x,
such that: 0 <= x <= u <= uint.MaxValue


Results:
    
    10^7 times rand.Next() takes: 107 ms (~11 ns per random number) 
    10^7 times rand.Next(int someValue) takes: 236 ms (~24 ns ...)
                                                 
    |--------|-------------------------|         rand.Next()    11 ns  
    |        |       ms 10^7 *         |         rand.Next(i)   24 ns
    |        |-----------|-------------|         gen_uint(u)    50 ns, mean  ~35 ns
    |      u |gen_uint(u)| g_uint_c(u) |         g_uint_c(u)   280 ns, mean ~250 ns   
    |--------|-----|-----|------|------|
    |      0 |  24 |     |   24 |      |          
    |      1 |     |  73 |      |  139 |          
    |      2 | 124 |     |  296 |      |           
    |      3 |     |  78 |      |  211 |       
    |      4 | 242 |     |  518 |      |     
    |      7 |     |  83 |      |  282 |               
    |      8 | 290 |     |  680 |      |      
    |  2^7-1 |     | 102 |      |  562 |              
    |  2^7-0 | 380 |     | 1079 |      |          
    | 2^15-1 |     | 130 |      | 1123 |          
    | 2^15-0 | 414 |     | 1634 |      |               
    | 2^30-1 |     | 193 |      | 2164 |
    | 2^30-0 | 494 |     | 2672 |      |
    | 2^31-1 |     | 210 |      | 2251 |
    | 2^31-0 | 461 |     | 2747 |      |
    | 2^32-1 |     | 211 |      | 2307 |
    |--------|-----|-----|------|------|


http://www.cacert.at/cgi-bin/rngresults

find: "P_P" , hover mouse over "Net4 System.Random" for the code.
 Random ~ gen_uint(u) 
    CSP ~ g_uint_c(u)

|--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|
| Gene-  |  Speed  | Upload | Entropy  | Birthday | Matrix | 6*8 Ma- | Minimum  | Random   | Squeeze  | Overlap- |
| rator  | (MiB/s) |  Size  | (->8)    | Spacing  | Ranks  | trix R  | Distance | Spheres  |          | ping Sums|
|--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|
| Random |   370   | 17 MiB | 7.999989 | 0.000000 |  0.392 |  0.136  | 0.063920 | 0.905097 | 0.098440 | 0.005467 |
|    CSP |    34   | 15 MiB | 7.999987 | 0.032102 |  0.084 |  0.942  | 0.894277 | 0.036221 | 0.706538 | 0.023523 |
|--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|
   
*/

using System;
using System.Diagnostics;
using System.Security.Cryptography;

class scaled_random_uint
{
    private static ulong buf = 0;              // 64 random bits buffer
    private static int av = 0;                 // available bits in buf
    private static uint g_u = 0;               // copy of u
    private static int g_nd = 0;               // nd (needed) bits = highest set bit of u
    private static int g_s = 0;                // shift
    private static bool ones = false;          // true if u = 10, 110, 1110, ...

    private static uint g_uint_c(uint u)
    {
        uint x; int s;
        if (u == 0) return 0;
        if (u != g_u)
        {
            g_u = u;
            g_nd = nob(u);
            g_s = 64 - g_nd;
            ones = (u + 1 & u + 2) == 0;
        }
    L0: if (av < g_nd) 
        {
            av += 32;
            buf <<= 32;
            buf |= next32();
        }
        x = (uint)buf << g_s >> g_s;
        buf >>= g_nd;
        av -= g_nd;
        if (x <= u) return x;
        if (ones) goto L0;                     // faster when u = 2, 6, 14, ...
        s = nob(x ^ u) - 1;
        buf <<= s;
        buf ^= x;
        av += s;
        goto L0;
    }

    private static RNGCryptoServiceProvider
        rng = new RNGCryptoServiceProvider();
    private static byte[] r08 = new byte[40];       //  ... [400] ... ~15% faster
    private static uint[] r32 = new uint[10];       //  ... [100] ...  ,,    ,,
    private static int i_r32 = 10;                  //  ... = 100 ...  ,,    ,, 
                                                               
    private static uint next32()                                  
    {                                                             
        if (i_r32 != 10) return r32[i_r32++];       //  ... != 100...  ,,    ,,
        i_r32 = 1;                                                
        rng.GetBytes(r08);                                       
        Buffer.BlockCopy(r08, 0, r32, 0, 40);       //  ... , 400)...  ,,    ,,
        return r32[0];
    }

    private static int nob(uint u) // number of bits, position of the highest set bit, u > 0  
    {
        return
        u < 1u << 16 ? u < 1u << 08 ? u < 1u << 04 ? u < 1u << 02 ? u < 1u << 01 ? 01 : 02 :
                                                                    u < 1u << 03 ? 03 : 04 :
                                                     u < 1u << 06 ? u < 1u << 05 ? 05 : 06 :
                                                                    u < 1u << 07 ? 07 : 08 :
                                      u < 1u << 12 ? u < 1u << 10 ? u < 1u << 09 ? 09 : 10 :
                                                                    u < 1u << 11 ? 11 : 12 :
                                                     u < 1u << 14 ? u < 1u << 13 ? 13 : 14 :
                                                                    u < 1u << 15 ? 15 : 16 :
                       u < 1u << 24 ? u < 1u << 20 ? u < 1u << 18 ? u < 1u << 17 ? 17 : 18 :
                                                                    u < 1u << 19 ? 19 : 20 :
                                                     u < 1u << 22 ? u < 1u << 21 ? 21 : 22 :
                                                                    u < 1u << 23 ? 23 : 24 :
                                      u < 1u << 28 ? u < 1u << 26 ? u < 1u << 25 ? 25 : 26 :
                                                                    u < 1u << 27 ? 27 : 28 :
                                                     u < 1u << 30 ? u < 1u << 29 ? 29 : 30 :
                                                                    u < 1u << 31 ? 31 : 32;
    }

    private static Stopwatch sw = new Stopwatch();
    static void Main()
    {
        speed((1u << 31) - 0); 
        Console.ReadLine();
    }

    private static void speed(uint u)
    {
        int i, j = 10000000; double t;
        sw.Restart();
        for (i = 0; i < j; i++) g_uint_c(u);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (i = 0; i < j; i++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.WriteLine("u: " + u);
        Console.WriteLine("gen_uint(u)");
        Console.WriteLine("     ms: " + t);
        Console.WriteLine("   bits: " + (double)(nob(u)) * j);
        Console.WriteLine(" bits/s: {0:.000}", (double)((nob(u)) * j) / t / 1000000);
        Console.WriteLine();
    }
}

2013/11/04

Fast Cube Root ( < 2^32 ), part 2

/*
The mean time to take a cube root from a 32 bits uint was: < 13 ns
                                            new mean time: < 12 ns

It is a bit faster, especially for large values, because variables are initialized properly.
       
       |--------------------------------|
       |         cro32(x)               |       
       |-----|-----||------------|------|       
       |   x |  ns ||          x |   ns |       
       |-----|-----||------------|------|
       |   0 | 3,0 ||        511 |  4,6 |       
       |   1 | 2,3 ||       1023 |  5,3 |       
       |   3 | 2,3 ||       2047 |  5,6 |               ___
       |   7 | 2,3 ||       4095 |  5,0 |           \3 /                
       |  15 | 3,3 ||      32768 |  7,6 |            \/  x = y          
       |  31 | 2,6 ||    1048576 |  9,0 |          
       |  63 | 2,6 ||  268435456 | 11,0 |           Input: 0 <= x < 2^32
       |  64 | 5,0 ||  536870912 | 11,3 |          Output: y, such that y^3 <= x < (y+1)^3
       |  65 | 5,0 || 1070599167 | 13,0 |       
       | 100 | 5,0 || 1070599168 | 13,0 |
       | 127 | 5,0 || 1073741824 | 11,3 |
       | 128 | 5,3 || 1073774592 | 11,3 |
       | 255 | 4,6 || 2147483648 | 11,0 |
       | 256 | 4,6 || 4294967295 | 11,7 |
       |-----|-----||------------|------|
       
       4294967295 roots in 50686 ms, mean time 11,80 ns (before 12,55 ns)
       
Another way to find roots is using a look-up table. Below "fill_cubes" fills a "cubes" array,
"cro32t" does a binary search for a root. Mean time: 30 ns.
I have not tried to optimize it further, I see no reason why it should be much faster.
*/

using System;
using System.Diagnostics;
class cube_root
{
    private static uint cro32(uint x)
    {
        uint y = 4u, z = 16u, b = 61u << 21;
        if (x < 1u << 24)
            if (x < 1u << 12)
                if (x < 1u << 06)
                    if (x < 1u << 03)
                        return x == 0u ? 0u : 1u;
                    else
                        return x < 27u ? 2u : 3u;
                else
                    if (x < 1u << 09) goto L8; else goto L7;
            else
                if (x < 1u << 18)
                    if (x < 1u << 15) goto L6; else goto L5;
                else
                    if (x < 1u << 21) goto L4; else goto L3;
        else
            if (x < 1u << 30)
                if (x < 1u << 27) goto L2;
                else
                {
                    if (x >= 27u << 24) { x -= 27u << 24; z = 36u; y = 6u; b = 127u << 21; }
                    else { x -= 1u << 27; }
                }
            else
            {
                if (x >= 27u << 27) { x -= 27u << 27; z = 144u; y = 12u; b = 469u << 21; }
                else
                {
                    if (x >= 125u << 24) { x -= 125u << 24; z = 100u; y = 10u; b = 331u << 21; }
                    else { x -= 1u << 30; z = 64u; y = 8u; b = 217u << 21; }
                }
            }
        goto M1;

    L2: if (x >= 27u << 21) { x -= 27u << 21; z = 36u; y = 6u; } else { x -= 1u << 24; } goto M2;
    L3: if (x >= 27u << 18) { x -= 27u << 18; z = 36u; y = 6u; } else { x -= 1u << 21; } goto M3;
    L4: if (x >= 27u << 15) { x -= 27u << 15; z = 36u; y = 6u; } else { x -= 1u << 18; } goto M4;
    L5: if (x >= 27u << 12) { x -= 27u << 12; z = 36u; y = 6u; } else { x -= 1u << 15; } goto M5;
    L6: if (x >= 27u << 09) { x -= 27u << 09; z = 36u; y = 6u; } else { x -= 1u << 12; } goto M6;
    L7: if (x >= 27u << 06) { x -= 27u << 06; z = 36u; y = 6u; } else { x -= 1u << 09; } goto M7;
    L8: if (x >= 27u << 03) { x -= 27u << 03; z = 36u; y = 6u; } else { x -= 1u << 06; } goto M8;

    M1: if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M2: b = (y + z) * 3 + 1u << 18; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M3: b = (y + z) * 3 + 1u << 15; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M4: b = (y + z) * 3 + 1u << 12; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M5: b = (y + z) * 3 + 1u << 09; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M6: b = (y + z) * 3 + 1u << 06; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M7: b = (y + z) * 3 + 1u << 03; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M8: return x <= (y + z) * 3 ? y : y + 1u;
    }

    private static uint[] cubes = new uint[2048];
    private static void fill_cubes()
    {
        int i = 0; uint a = 0u, b = 1u, c = 6u;
        do
        {
            cubes[i++] = a;
            a += b; b += c; c += 6u;
        }
        while (i < 1626);
        do cubes[i++] = ~0u;
        while (i < 2048);
    }
    private static uint cro32t(uint x)
    {
        uint i = 1u << 10, j = 1u << 9, u = 1u << 30;
        for (; j > 0; j >>= 1)
        {
            if (x < u) i -= j;
            else if (x > u) i += j;
            else return i < 1625u ? i : 1625u;
            u = cubes[i];
        }
        return x < u ? i - 1 : i;
    }

    private static Stopwatch sw = new Stopwatch();
    static void Main(string[] args)
    {
        cro32_all();        // ~1 minute, comment out "cro32t" , uint[] cubes, etc.
                            //  Or try seperate executables.

        fill_cubes();
        cro32t_all();       // ~2 minutes
        Console.ReadLine();
    }
    private static void cro32_all()
    {
        uint x; double t;
        cro32(~0u);
        sw.Restart();
        for (x = 0; x < ~0u; x++) cro32(x);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (x = 0; x < ~0u; x++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.Write(x + " roots in " + t + " ms, ");
        Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x);
        // 4294967295 roots in 50686 ms, mean time 11,80 ns
    }
    private static void cro32t_all()
    {
        uint x; double t;
        cro32t(~0u);
        sw.Restart();
        for (x = 0; x < ~0u; x++) cro32t(x);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (x = 0; x < ~0u; x++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.Write(x + " roots in " + t + " ms, ");
        Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x);
        // 4294967295 roots in 129254 ms, mean time 30,09 ns
    }
}

2013/10/19

Uniform pseudo random numbers in a range

/*

Random number generators (RNG) are divided in two categories,
hardware RNG, that provide “true” random numbers, and algorithmic RNG, that generate
pseudo random numbers (PRNG). Both types usually generate random numbers "X" as independent
uniform samples in a range 0, . . . 2^b − 1, with b = 8, 16, 32 or b = 64. In applications,
it is instead sometimes desirable to draw random numbers as independent uniform samples "U"
in a range " 1, . . . M ", where moreover "M" may change between drawings.
Transforming "X" to "U" , or many "X" to many "U" is sometimes known as scaling.

Some text is copied from other sources. You find them in "References".
I found it quite hard to explain some of the underlying ideas.
I am grateful to the original authors. Their words saved me a lot of time.

Limits:

    0 <= M <= 2^32 - 1 = uint.MaxValue  
    0 <= U <= M

UPDATE: part 2 

Example: 
Suppose a PRNG produces one random bit each time it is called, and we have: M = 5 ,
binary: M = 101. Call the generator three times to get three successive bits. 
Result: 0 <= U <= 7, binary: 000 <= U <= 111. If "U > M" , than we take the next three bits
from the generator, and repeat to do so until "U <= M".

Another way to find a valid value for "U".
-Take the first, the most significant, bit from the generator.
If it's zero, take two other bits from the generator,
and return a valid value for "U": 000 or 001 or 010 or 011.
If the first bit isn't zero, so it's one, 
take the second bit from the generator.
If the second bit is zero, take the third bit from the generator,
and return a valid value for "U": 100 or 101.
Finally: the first and the second bit are both set,
no need to take a third bit from the generator,
a random bit is saved, we have to try again. 
-Take the first, the most significant, bit from the generator...

              /  U = 0..  return 000 or 001 or 010 or 011
             /
         ---<    U = 10.  return 100 or 101
        |    \ 
        |     \  U = 11   --try again--
        |                              |
        |                              |
         ----<<--------<<--------<<---- 

The PRNG used: .Net 4.0 System.Random
It might have a bug: The period is not guaranteed to be longer than: 2^55-1
                     (I do have some doubts, but no proof)
 
Some important properties of System.Random:

    Random Constructors:

        Random()          Initializes a new instance of the Random class, using a time-dependent value.
                   
        Random(int seed)  Initializes a new instance of the Random class, using the specified seed.

    To improve performance, create one Random object to generate many random numbers over time,
    instead of repeatedly creating new Random objects to generate one random number, so:

        private static Random rand = new Random()

    Random Methods:

        rand.Next()                    Returns a positive random number, greater than or equal to zero,
                                       less than int.MaxValue

        rand.Next(int max)             Returns a positive random number, greater than or equal to zero,
                                       less than max ( max >= 0 ) , ........ rand.Next(0) returns zero 
 
        rand.Next(int min, int max)    Returns a positive random number, greater than or equal to min,
                                       less than max ( max >= min ) , ...... 

    rand.Next() is about twice as fast as rand.Next(int max) , 
    rand.Next(int max) is almost as fast as rand.Next(int min, int max),
    especially when you use something like: return min + rand.Next(max - min)

    It's impossible to create more than 30 random bits at once.
    To create those 30 bits, use: rand.Next(1 << 30)  

Various sources warn about the lower bits of numbers generated by PRNG's, they shouldn't be random.
One way to examine a PRNG is to create a visualisation of the numbers it produces.
Humans are really good at spotting patterns, and visualisation allows you to use your eyes and 
brain directly for this purpose. While you shouldn't consider this type of approach an exhaustive 
or formal analysis, it is a nice and quick way to get a rough impression of a given generator's
performance. The background of the image at the top looks pretty random, no recognizable patterns.
It was created with the code at the very bottom: "random_bitmap()"

Description of the code:

gen_nd_bits(int nd) returns the needed amount of random bits. It uses a 32 bits buffer.
The buffer is replenished when necessary.

gen_uint_fast(uint u) returns a random value between zero (incl.) and u (incl.)
It calls gen_nd_bits(int nd) . When the value is too large another call is done.

gen_uint_eff(uint u) uses random bits more efficiently.
It works bit by bit, from most to least significant bit. When it becomes clear that the (partial)
formed value is smaller than the upper limit ( uint u ) , remaining needed random bits are
added relatively fast. When the (partial) formed value is too large another call is done.

On my trustworthy PC the average (full range) results are:

    |-----------------------|---------------|
    |     function          | * 10^9 bits/s | 
    |-----------------------|---------------|
    |    rand.Next()        |      2.9      | <-- 1 CD in 2 seconds!
    |    rand.Next(1 << 30) |      1.5      |  
    |   gen_nd_bits(int nd) |      1.2      | 
    | gen_uint_fast(uint u) |      0.9      | 
    |  gen_uint_eff(uint u) |      0.7      |
    |-----------------------|---------------|

The efficiency, random bits used per bit: 

      gen_uint_fast(uint u)   1.386 bits/bit, remarkably close to ln(4)
       gen_uint_eff(uint u)   1.044 bits/bit

References:

    José's Daylight Dices 
    RANDOM.ORG 
    Ask Dr. Math, Probability and Random Numbers
    Bit recycling for scaling random number generators
    Math.NET Numerics
    MSDN
    Bug ? System.Random (january 2011)

*/

using System;
using System.Drawing;                          // add a ref!
class scaled_random
{
    private static Random rand = new Random();
    private static uint buf = 0;               // random bits buffer
    private static int av = 0;                 // available bits in buf
    private static uint g_u = 0;               // copy of u
    private static int g_nd = 0;               // copy of nd (needed) = highest set bit of u

    private static uint gen_nd_bits(int nd)    // generate nd bits, 0 <= nd <= 32
    {
        uint x;
        if (nd == 0) return 0;
        if (av <= 2)                           // buffer should contain at least 2 bits, other-
        {                                      // wise there may be too few bits when nd > 30 
            buf <<= 30;
            buf |= (uint)rand.Next(1 << 30);
            av += 30;
        }
        if (av < nd)
        {
            x = buf;
            buf = (uint)rand.Next(1 << 30);
            nd -= av;
            x <<= nd;
            x |= buf << 32 - nd >> 32 - nd;
            buf >>= nd;
            av = 30 - nd;
        }
        else
        {
            x = buf << 32 - nd >> 32 - nd;
            buf >>= nd;
            av -= nd;
        }
        return x;
    }

    private static uint gen_uint_fast(uint u)  // 0 <= returned value <= u  
    {
        uint x;
        if (u == 0) return 0;
        if (u != g_u)                          // if u changes, g_nd might change 
        {
            g_u = u;
            g_nd = nob(u);                     // needed: number of bits of u   
        }
    L0: x = gen_nd_bits(g_nd);
        if (x <= u) return x;
        goto L0;
    }

    private static bool ones = false;          // true if u is (binary) 1, 11, 111, 1111, ...
    private static uint gen_uint_eff(uint u)   // 0 <= returned value <= u  
    {
        uint x; int nd;
        if (u == 0) return 0;
        if (u != g_u)
        {
            g_u = u;
            g_nd = nob(u);
            ones = (u + 1 & u) == 0;
        }
        if (ones) return gen_nd_bits(g_nd);
    L0: nd = g_nd - 1;
        if (gen_nd_bits(1) == 0) return gen_nd_bits(nd);
        x = 1u << nd;
        nd--;
        while (nd > 0)
        {
            x |= gen_nd_bits(1) << nd;
            if (x >> nd < u >> nd) return x | gen_nd_bits(nd);
            if (x > u) goto L0;
            nd--;
        }
        x |= gen_nd_bits(1);
        if (x <= u) return x;
        goto L0;
    }

    private static int nob(uint u) // number of bits, position of the highest set bit, u > 0  
    {                              // afaik fastest way in C#
        return
        u < 1u << 16 ? u < 1u << 08 ? u < 1u << 04 ? u < 1u << 02 ? u < 1u << 01 ? 01 : 02 :
                                                                    u < 1u << 03 ? 03 : 04 :
                                                     u < 1u << 06 ? u < 1u << 05 ? 05 : 06 :
                                                                    u < 1u << 07 ? 07 : 08 :
                                      u < 1u << 12 ? u < 1u << 10 ? u < 1u << 09 ? 09 : 10 :
                                                                    u < 1u << 11 ? 11 : 12 :
                                                     u < 1u << 14 ? u < 1u << 13 ? 13 : 14 :
                                                                    u < 1u << 15 ? 15 : 16 :
                       u < 1u << 24 ? u < 1u << 20 ? u < 1u << 18 ? u < 1u << 17 ? 17 : 18 :
                                                                    u < 1u << 19 ? 19 : 20 :
                                                     u < 1u << 22 ? u < 1u << 21 ? 21 : 22 :
                                                                    u < 1u << 23 ? 23 : 24 :
                                      u < 1u << 28 ? u < 1u << 26 ? u < 1u << 25 ? 25 : 26 :
                                                                    u < 1u << 27 ? 27 : 28 :
                                                     u < 1u << 30 ? u < 1u << 29 ? 29 : 30 :
                                                                    u < 1u << 31 ? 31 : 32;
    }

    static void Main()
    {
        test_gen_uint_fast(17);
        test_gen_uint_eff(54);
        test_4_buckets_fast(123456789);
        test_4_buckets_eff(~0u);
        random_bitmap();
    }

    private static void test_gen_uint_fast(uint u)
    {
        int i; int[] c = new int[u + 1];
        for (i = 0; i < 1000000; i++) c[gen_uint_fast(u)]++;
        for (i = 0; i <= u; i++) Console.WriteLine("{0,3}{1,7}", i, c[i]);
        Console.ReadLine(); Console.Clear();
    }
    private static void test_gen_uint_eff(uint u)
    {
        int i; int[] c = new int[u + 1];
        for (i = 0; i < 1000000; i++) c[gen_uint_eff(u)]++;
        for (i = 0; i <= u; i++) Console.WriteLine("{0,3}{1,7}", i, c[i]);
        Console.ReadLine(); Console.Clear();
    }
    private static void test_4_buckets_fast(uint u) // rounding errors!
    {
        uint x; int i, c0 = 0, c1 = 0, c2 = 0, c3 = 0;
        for (i = 0; i < 1000000; i++)
        {
            x = gen_uint_fast(u);
            if (x < u / 4) c0++;
            else if (x < u / 2) c1++;
            else if (x < u / 2 + u / 4) c2++; else c3++;
        }
        Console.WriteLine(c0); Console.WriteLine(c1);
        Console.WriteLine(c2); Console.WriteLine(c3);
        Console.ReadLine(); Console.Clear();
    }
    private static void test_4_buckets_eff(uint u)
    {
        uint x; int i, c0 = 0, c1 = 0, c2 = 0, c3 = 0;
        for (i = 0; i < 1000000; i++)
        {
            x = gen_uint_eff(u);
            if (x < u / 4) c0++;
            else if (x < u / 2) c1++;
            else if (x < u / 2 + u / 4) c2++; else c3++;
        }
        Console.WriteLine(c0); Console.WriteLine(c1);
        Console.WriteLine(c2); Console.WriteLine(c3);
        Console.ReadLine(); Console.Clear();
    }

    private static void random_bitmap()
    {
        int xmax = 900;
        int ymax = 900;
        Bitmap bmp = new Bitmap(xmax + 1, ymax + 1);
        int x, y, z;
        for (x = 0; x <= xmax; x++)
        {
            for (y = 0; y <= ymax; y++)
            {
                z = 7 & rand.Next(1 << 30);
                switch (z)
                {
                    case 0: bmp.SetPixel(x, y, Color.Yellow); break;
                    case 1: bmp.SetPixel(x, y, Color.Orange); break;
                    case 2: bmp.SetPixel(x, y, Color.Purple); break;
                    case 3: bmp.SetPixel(x, y, Color.White); break;
                    case 4: bmp.SetPixel(x, y, Color.Green); break;
                    case 5: bmp.SetPixel(x, y, Color.Black); break;
                    case 6: bmp.SetPixel(x, y, Color.Blue); break;
                    case 7: bmp.SetPixel(x, y, Color.Red); break;
                }
            }
        }
        bmp.Save(xmax.ToString() + ".bmp");
    }
}

2013/09/26

Fast Cube Root ( < 2^32 )

/*                                                                                                            **
 The time to take a cube root from a 32 bits uint was: < 26 ns (mean: < 22 ns)
                                             new time: < 14 ns (mean: < 13 ns)
 
 The trick used for small square roots is used again: "unroll the loop".
 First step: A binary search, the "if" statements, large values ( >= 1u << 24 )
 are found relatively faster, small values ( < 64 ) are handled during the search.
 Second step: A jump into the unrolled loop, the "labels".
 
      |--------------------------------| 
      |         cro32(x)               |                         
      |-----|-----||------------|------|
      |   x |  ns ||          x |   ns |
      |-----|-----||------------|------|
      |   0 | 2,4 ||        511 |  4,6 |
      |   1 | 2,3 ||       1023 |  5,3 |
      |   3 | 2,3 ||       2047 |  5,3 |
      |   7 | 2,3 ||       4095 |  5,0 |
      |  15 | 3,6 ||      32768 |  7,2 |
      |  31 | 3,0 ||    1048576 |  8,5 |
      |  63 | 3,0 ||  268435456 | 12,0 |
      |  64 | 5,0 ||  536870912 | 12,3 |     heaviest case,
      |  65 | 5,0 || 1070599167 | 13,7 | <== 1070599167 is the
      | 100 | 5,0 || 1070599168 | 13,7 |     3th power of 1023, 
      | 127 | 5,0 || 1073741824 | 12,3 |     binary: 1111111111
      | 128 | 5,0 || 1073774592 | 12,0 |
      | 255 | 4,6 || 2147483648 | 12,3 |
      | 256 | 4,6 || 4294967295 | 12,0 |
      |-----|-----||------------|------|

       4294967295 roots in 53884 ms, mean time 12,55 ns, faster than square root!
         UPDATE 2013/11/04 50686 ms, mean time 11,80 ns, see: part 2
 
 ** Original version: Figure 2: Architecture of the iteration (alternative A)
    A digit-by-digit algorithm for radix-2 cube root and its implementation (2004)
    J.-A.Pineiro, J.D.Bruguera, L.Ciminiera, P.Montuschi
  
*/
using System;
using System.Diagnostics;

class cro_32
{
    private static uint cro32(uint x)
    {
        uint y = 4u, z = 16u, b = 0u;
        if (x < 1u << 24)
            if (x < 1u << 12)
                if (x < 1u << 06)
                    if (x < 1u << 03)
                        return x == 0u ? 0u : 1u;
                    else
                        return x < 27u ? 2u : 3u;
                else
                    if (x < 1u << 09) goto L8; else goto L7;
            else
                if (x < 1u << 18)
                    if (x < 1u << 15) goto L6; else goto L5;
                else
                    if (x < 1u << 21) goto L4; else goto L3;
        else
            if (x < 1u << 30)
                if (x < 1u << 27) goto L2; else goto L1;

        if (x >= 27u << 27) { x -= 27u << 27; z = 36u; y = 6u; } else { x -= 1u << 30; } goto M0;
    L1: if (x >= 27u << 24) { x -= 27u << 24; z = 36u; y = 6u; } else { x -= 1u << 27; } goto M1;
    L2: if (x >= 27u << 21) { x -= 27u << 21; z = 36u; y = 6u; } else { x -= 1u << 24; } goto M2;
    L3: if (x >= 27u << 18) { x -= 27u << 18; z = 36u; y = 6u; } else { x -= 1u << 21; } goto M3;
    L4: if (x >= 27u << 15) { x -= 27u << 15; z = 36u; y = 6u; } else { x -= 1u << 18; } goto M4;
    L5: if (x >= 27u << 12) { x -= 27u << 12; z = 36u; y = 6u; } else { x -= 1u << 15; } goto M5;
    L6: if (x >= 27u << 09) { x -= 27u << 09; z = 36u; y = 6u; } else { x -= 1u << 12; } goto M6;
    L7: if (x >= 27u << 06) { x -= 27u << 06; z = 36u; y = 6u; } else { x -= 1u << 09; } goto M7;
    L8: if (x >= 27u << 03) { x -= 27u << 03; z = 36u; y = 6u; } else { x -= 1u << 06; } goto M8;

    M0: if (x >= 61u << 24) { x -= 61u << 24; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M1: b = (y + z) * 3 + 1u << 21; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M2: b = (y + z) * 3 + 1u << 18; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M3: b = (y + z) * 3 + 1u << 15; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M4: b = (y + z) * 3 + 1u << 12; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M5: b = (y + z) * 3 + 1u << 09; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M6: b = (y + z) * 3 + 1u << 06; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M7: b = (y + z) * 3 + 1u << 03; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M8: return x <= (y + z) * 3 ? y : y + 1u;
    }

    private static Stopwatch sw = new Stopwatch();
    static void Main()
    {
        cro32(225);
        cro32(1070599167);
        check_perfect_cubes32();
        time_cro32();
        cro32_all();
        Console.ReadLine();
    }

    private static void check_perfect_cubes32()
    {
        uint n = 0, a = 0, b = 1, c = 6;
        while (n <= 1625)
        {
            if (n != cro32(a)) Console.WriteLine("WRONG");
            a += b; b += c; c += 6; n++;
        }
        Console.WriteLine("CHECKED PERFECT CUBES");
    }

    private static void time_cro32()
    {
        int i, j; uint x; double t;
        uint[] a = { 0, 1, 3, 7, 15, 31, 63, 64, 65,
                     100, 127, 128, 255, 256, 511, 1023, 2047, 4095,
                     1u << 15, 1u << 20, 1u << 28 ,1u << 29 , 1070599167,
                     1070599168,1u << 30,(1u << 30) + (1u << 15), 1u << 31, ~0u };
        Console.WriteLine("|------------|------|");
        Console.WriteLine("|          x |  ns  |");
        Console.WriteLine("|------------|------|");
        for (j = 0; j < a.Length; j++)
        {
            x = a[j];
            cro32(x);
            sw.Restart();
            for (i = 0; i < 10000000; i++) cro32(x);
            sw.Stop();
            t = sw.ElapsedMilliseconds;
            sw.Restart();
            for (i = 0; i < 10000000; i++) ; // nada
            sw.Stop();
            t -= sw.ElapsedMilliseconds;
            Console.WriteLine("| {0,10} | {1:00.0} |", x, t / 10);
        }
        Console.WriteLine("|------------|------|");
        Console.WriteLine();
    }

    private static void cro32_all()
    {
        uint x; double t;
        cro32(~0u);
        sw.Restart();
        for (x = 0; x < ~0u; x++) cro32(x);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (x = 0; x < ~0u; x++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.Write(x + " roots in " + t + " ms, ");
        Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x);
        // 4294967295 roots in 53884 ms, mean time 12,55 ns
    }
}

2013/09/02

Fast Square Root ( < 2^32 )


/*
The time to take a square root from a 32 bits uint was: ~35 ns, new time: <16 ns (mean: <14 ns).
 
It's based on: A square root hardware algorithm from Guy L. Steele,Jr. 

The slightly optimized C# version below is nearly twice as fast. A few "goto Label" statements
are used, they are fairly easy to remove, but I doubt it will be any faster (at all).
  
             |--------------------------------|
             |        Times "cro32(x)"        |
             |      Athlon X4, XP, 2 GB       |        
             |-----|-----||------------|------|
             |   x |  ns ||          x |   ns |
             |-----|-----||------------|------|
             |   0 | 1,6 ||        511 |  4,3 |
             |   1 | 1,6 ||       1023 |  4,6 |
             |   3 | 1,6 ||       2047 |  5,0 |
             |   7 | 2,3 ||       4095 |  5,6 |           4294967295 roots in 56962 ms, 
             |  15 | 2,3 ||      32768 |  6,6 |                     mean time 13,26 ns.
             |  31 | 3,6 ||    1048576 | 10,0 |
             |  63 | 3,0 ||  268435456 | 13,7 |
             |  64 | 4,3 ||  536870912 | 11,4 |
             |  65 | 4,3 || 1070599167 | 13,0 |
             | 100 | 5,0 || 1073741824 | 15,1 |
             | 127 | 4,3 || 1073741824 | 15,1 |
             | 128 | 4,3 || 1073774592 | 15,1 |
             | 255 | 4,0 || 2147483648 | 13,1 |
             | 256 | 4,7 || 4294967295 | 15,7 |
             |-----|-----||------------|------|

*/
using System;                                                     
using System.Diagnostics;                                         
                                                                  
class Square_Root                                                 
{                                                                 
    private static uint sro32(uint x)
    {
        uint y, z;
        if (x < 1u << 16)
            if (x < 1u << 08)
                if (x < 1u << 04) return x < 1u << 02 ? x + 3u >> 2 : x + 15u >> 3;
                else
                    if (x < 1u << 06)
                    { y = 1u << 03; x -= 1u << 04; if (x >= 5u << 02) { x -= 5u << 02; y |= 1u << 02; } goto L0; }
                    else
                    { y = 1u << 05; x -= 1u << 06; if (x >= 5u << 04) { x -= 5u << 04; y |= 1u << 04; } goto L1; }
            else                                             // slower (on my pc): .... y = 3u << 04; } goto L1; }
                if (x < 1u << 12)
                    if (x < 1u << 10)
                    { y = 1u << 07; x -= 1u << 08; if (x >= 5u << 06) { x -= 5u << 06; y |= 1u << 06; } goto L2; }
                    else
                    { y = 1u << 09; x -= 1u << 10; if (x >= 5u << 08) { x -= 5u << 08; y |= 1u << 08; } goto L3; }
                else
                    if (x < 1u << 14)
                    { y = 1u << 11; x -= 1u << 12; if (x >= 5u << 10) { x -= 5u << 10; y |= 1u << 10; } goto L4; }
                    else
                    { y = 1u << 13; x -= 1u << 14; if (x >= 5u << 12) { x -= 5u << 12; y |= 1u << 12; } goto L5; }
        else
            if (x < 1u << 24)
                if (x < 1u << 20)
                    if (x < 1u << 18)
                    { y = 1u << 15; x -= 1u << 16; if (x >= 5u << 14) { x -= 5u << 14; y |= 1u << 14; } goto L6; }
                    else
                    { y = 1u << 17; x -= 1u << 18; if (x >= 5u << 16) { x -= 5u << 16; y |= 1u << 16; } goto L7; }
                else
                    if (x < 1u << 22)
                    { y = 1u << 19; x -= 1u << 20; if (x >= 5u << 18) { x -= 5u << 18; y |= 1u << 18; } goto L8; }
                    else
                    { y = 1u << 21; x -= 1u << 22; if (x >= 5u << 20) { x -= 5u << 20; y |= 1u << 20; } goto L9; }
            else
                if (x < 1u << 28)
                    if (x < 1u << 26)
                    { y = 1u << 23; x -= 1u << 24; if (x >= 5u << 22) { x -= 5u << 22; y |= 1u << 22; } goto La; }
                    else
                    { y = 1u << 25; x -= 1u << 26; if (x >= 5u << 24) { x -= 5u << 24; y |= 1u << 24; } goto Lb; }
                else
                    if (x < 1u << 30)
                    { y = 1u << 27; x -= 1u << 28; if (x >= 5u << 26) { x -= 5u << 26; y |= 1u << 26; } goto Lc; }
                    else
                    { y = 1u << 29; x -= 1u << 30; if (x >= 5u << 28) { x -= 5u << 28; y |= 1u << 28; } }

        z = y | 1u << 26; y /= 2; if (x >= z) { x -= z; y |= 1u << 26; }
    Lc: z = y | 1u << 24; y /= 2; if (x >= z) { x -= z; y |= 1u << 24; }
    Lb: z = y | 1u << 22; y /= 2; if (x >= z) { x -= z; y |= 1u << 22; }
    La: z = y | 1u << 20; y /= 2; if (x >= z) { x -= z; y |= 1u << 20; }
    L9: z = y | 1u << 18; y /= 2; if (x >= z) { x -= z; y |= 1u << 18; }
    L8: z = y | 1u << 16; y /= 2; if (x >= z) { x -= z; y |= 1u << 16; }
    L7: z = y | 1u << 14; y /= 2; if (x >= z) { x -= z; y |= 1u << 14; }
    L6: z = y | 1u << 12; y /= 2; if (x >= z) { x -= z; y |= 1u << 12; }
    L5: z = y | 1u << 10; y /= 2; if (x >= z) { x -= z; y |= 1u << 10; }
    L4: z = y | 1u << 08; y /= 2; if (x >= z) { x -= z; y |= 1u << 08; }
    L3: z = y | 1u << 06; y /= 2; if (x >= z) { x -= z; y |= 1u << 06; }
    L2: z = y | 1u << 04; y /= 2; if (x >= z) { x -= z; y |= 1u << 04; }
    L1: z = y | 1u << 02; y /= 2; if (x >= z) { x -= z; y |= 1u << 02; }
    L0: return x > y ? y / 2 | 1u : y / 2;
    }

    private static Stopwatch sw = new Stopwatch();
    static void Main()
    {
        Console.WriteLine(sro32(4));
        Console.WriteLine(sro32(40));
        Console.WriteLine(sro32(400));
        Console.WriteLine(sro32(4000));
        Console.WriteLine(sro32(40000));
        Console.WriteLine(sro32(400000));
        Console.WriteLine(sro32(4000000));
        Console.WriteLine(sro32(40000000));
        Console.WriteLine(sro32(400000000));
        Console.WriteLine(sro32(4000000000));
        test_perfect_squares();
        time_sro32();
        sro32_all(); // ~1 minute                                 
        Console.ReadLine();
    }
    private static void test_perfect_squares()
    {
        uint n = 0, a = 0, b = 1;
        do
        {
            if (sro32(a) != n) Console.WriteLine("WRONG");
            a += b;
            b += 2;
        }
        while (n++ < 65535);
        Console.WriteLine("Perfect squares tested");
    }
    private static void time_sro32()
    {
        int i, j; uint x; double t;
        //uint[] a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
        uint[] a = { 0, 1, 3, 7, 15, 31, 63, 64, 65,
                     100, 127, 128, 255, 256, 511, 1023, 2047, 4095,
                     1u << 15, 1u << 20, 1u << 28 ,1u << 29 , 1070599167,
                     1u << 30,1u << 30,(1u << 30) + (1u << 15), 1u << 31, 0xfffe0001u, ~0u };
        Console.WriteLine("|------------|------|");
        Console.WriteLine("|          x |  ns  |");
        Console.WriteLine("|------------|------|");
        for (j = 0; j < a.Length; j++)
        {
            x = a[j];
            sro32(x);
            sw.Restart();
            for (i = 0; i < 10000000; i++) sro32(x);
            sw.Stop();
            t = sw.ElapsedMilliseconds;
            sw.Restart();
            for (i = 0; i < 10000000; i++) ; // nada
            sw.Stop();
            t -= sw.ElapsedMilliseconds;
            Console.WriteLine("| {0,10} | {1:00.0} |", x, t / 10);
        }
        Console.WriteLine("|------------|------|");
        Console.WriteLine();
    }
    private static void sro32_all()
    {
        uint x; double t;
        sro32(~0u);
        sw.Restart();
        for (x = 0; x < ~0u; x++) sro32(x);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (x = 0; x < ~0u; x++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.Write(x + " roots in " + t + " ms, ");
        Console.WriteLine("mean time {0:.00} ns",
                          (double)(t) * 1000000 / x);
        // 4294967295 roots in 56962 ms, mean time 13,26 ns
    }
}
//------------------------------------------------------------
// Another one, slower, the line "t = ..." is done many times.
//
// private static uint sro32s(uint x)
//     uint y1 = 0, y2 = 0, t; int k = 15;
//     for (; k >= 0; k--)
//     {
//         t = y2 + (y1 << k + 1) | (1u << k * 2);
//         if (x >= t) { y2 = t; y1 |= 1u << k; }
//     }
//     return y1;
//-----------------------------------------------------------