Pagina's

2014/07/28

Sum of all divisors of all positive integers <= n.

/*
Example:  N  D        SD  SSD  A024916
          1  1         1    1
          2  1,2       3    4
          3  1,3       4    8
          4  1,2,4     7   15
          5  1,5       6   21
          6  1,2,3,6  12   33
  

Sum of all divisors of all positive integers <= 10^n.

                      A072692 as a simple table
 
     n                       a(n)                                  time
                                                          SSD               SSD3
        
     0                                           1  00:00:00.0000061
     1                                          87  00:00:00.0000058
     2                                        8299  00:00:00.0000092
     3                                      823081  00:00:00.0000215
     4                                    82256014  00:00:00.0000555
     5                                  8224740835  00:00:00.0002824
     6                                822468118437  00:00:00.0007623
     7                              82246711794796  00:00:00.0022483
     8                            8224670422194237  00:00:00.0075755
     9                          822467034112360628  00:00:00.0247891
    10                        82246703352400266400  00:00:00.1029980  00:00:00.0536221
    11                      8224670334323560419029  00:00:00.3310292
    12                    822467033425357340138978  00:00:01.0792704  00:00:00.5139608
    13                  82246703342420509396897774  00:00:03.3665267
    14                8224670334241228180927002517  00:00:10.7148327  00:00:05.1645348  
    15              822467033424114009326065894639  00:00:34.2567848
    16            82246703342411333689227187822414  00:01:49.7741595  00:00:53.1228885
    17          8224670334241132270081671519064067  00:05:55.3252935
    18        822467033424113219363487627735401433  00:19:50.2083007
    19      82246703342411321831834750379375676781  01:12:36.6691244
    20    8224670334241132182459113152714495956789  04:38:51.4847697
    21  822467033424113218236966661186847524013409  15:19:44.3197506


 AMD Athlon II X4 640, 3 GHz, 2 GB, XP  
  
    N=10^16     time(s)   CPU Usage(%)
      SSD(N)     110        25 
     SSD3(N)      53        60
     FSSD(N)      92        25
    FSSD3(N)      41        60
    FSSD4(N)      33        80
*/

using System;
using System.Diagnostics;
using System.Threading.Tasks;
using Xint = System.Numerics.BigInteger;
class A024916
{
    static Xint SSD(Xint N)
    {
        Xint D = 1, Q = N, S = 0;
        while (D < Q)
        {
            S += D++ * (Q - D);
            S += Q++ * Q / 2;
            Q = N / D;
        }
        S += Q++ * Q / 2;
        S -= D-- * D-- * D / 6;
        return S;
    }

    static void Main()
    {
        var sw = new Stopwatch();
        int p = 0;
        Xint N = 1, S;
    L0: sw.Restart();
        S = SSD(N);
        sw.Stop();
        Console.WriteLine(p + "  " + S + "  " + sw.Elapsed);
        p++;
        N *= 10;
        goto L0;
    }

    private static Xint[] SA, DA, QA;

    static Xint SSD3(Xint N)
    {
        SA = new Xint[3];
        DA = new Xint[3];
        QA = new Xint[3];
        Parallel.For(0, 3, (int d) => SSDP3(N, d));
        Xint S = SA[0] + SA[1] + SA[2],
             D = DA[0],
             Q = QA[0];
        if (DA[1] < D)
        {
            D = DA[1];
            Q = QA[1];
        }
        if (DA[2] < D)
        {
            D = DA[2];
            Q = QA[2];
        }
        S += Q++ * Q / 2;
        S -= D-- * D-- * D / 6;
        return S;
    }

    static void SSDP3(Xint N, int d)
    {
        Xint D = d + 1, Q = N / D, S = 0;
        while (D < Q)
        {
            S += D * (Q - D - 1);
            S += Q * (Q + 1) / 2;
            D += 3;
            Q = N / D;
        }
        SA[d] = S;
        DA[d] = D;
        QA[d] = Q;
    }

    static Xint FSSD(Xint N)
    {
        Xint D = 1, Q = N, S = 0;
        while (D < Q)
        {
            S += Q * (Q + 1 + 2 * D) / 2;
            D += 1;
            Q = N / D;
        }
        S += Q * (Q + 1) / 2;
        S -= D * D * (D - 1) / 2;
        return S;
    }

    static Xint FSSD3(Xint N)
    {
        SA = new Xint[3];
        DA = new Xint[3];
        QA = new Xint[3];
        Parallel.For(0, 3, (int d) => FSSDP3(N, d));
        Xint S = SA[0] + SA[1] + SA[2],
             D = DA[0],
             Q = QA[0];
        if (DA[1] < D)
        {
            D = DA[1];
            Q = QA[1];
        }
        if (DA[2] < D)
        {
            D = DA[2];
            Q = QA[2];
        }
        S += Q * (Q + 1) / 2;
        S -= D * D * (D - 1) / 2;
        return S;
    }

    static void FSSDP3(Xint N, int d)
    {
        Xint D = d + 1, Q = N / D, S = 0;
        while (D < Q)
        {
            S += Q * (Q + 1 + 2 * D) / 2;
            D += 3;
            Q = N / D;
        }
        SA[d] = S;
        DA[d] = D;
        QA[d] = Q;
    }

    static Xint FSSD4(Xint N)
    {
        SA = new Xint[4];
        DA = new Xint[4];
        QA = new Xint[4];
        Parallel.For(0, 4, (int d) => FSSDP4(N, d));
        Xint S = SA[0] + SA[1] + SA[2] + SA[3],
             D = DA[0],
             Q = QA[0];
        if (DA[1] < D)
        {
            D = DA[1];
            Q = QA[1];
        }
        if (DA[2] < D)
        {
            D = DA[2];
            Q = QA[2];
        }
        if (DA[3] < D)
        {
            D = DA[3];
            Q = QA[3];
        }
        S += Q * (Q + 1) / 2;
        S -= D * D * (D - 1) / 2;
        return S;
    }

    static void FSSDP4(Xint N, int d)
    {
        Xint D = d + 1, Q = N / D, S = 0;
        while (D < Q)
        {
            S += Q * (Q + 1 + 2 * D) / 2;
            D += 4;
            Q = N / D;
        }
        SA[d] = S;
        DA[d] = D;
        QA[d] = Q;
    }
}

2014/01/23

The kth Permutation

/*

It is based on:

    "Using Permutations in .Net for Improved System Security"

Factoradics are used to find the kth permutation of order n.

*/

class Permutation
{
    private static uint[] kth_perm(uint k, uint n)  //  k=0,n=3 ==> {0,1,2}
    {                                               //  k=1,n=3 ==> {0,2,1}
        if (n == 0) return new uint[] { };
        uint[] perm = new uint[n++];
        uint[] fdic = new uint[n];                  //  in factoradic form, no factorials!
        uint i = 1, d, m = n - 1;
        do
        {
            d = k / i;
            fdic[m--] = k - d * i++;
            k = d;
        }
        while (m != ~0u);
        for (k = n-- - 2; k != ~0u; k--)
        {
            m = perm[k++] = fdic[k];
            for (i = k--; i < n; i++) if (perm[i] >= m) perm[i]++;
        }
        return perm;
    }

    private static uint[] next_perm(uint[] perm)    //  {0,1,2} ==> {0,2,1}
    {
        int len = perm.Length - 1;
        if (len <= 0) return perm;
        int i = len;
        while (i > 0 && perm[i--] <= perm[i]) ;
        uint swap = perm[i];
        int j = len;
        while (j > 0 && perm[j] <= swap) j--;
        if (j == 0)
            do
            {
                swap = perm[len];
                perm[len--] = perm[j];
                perm[j++] = swap;
            }
            while (j < len);
        else
        {
            perm[i++] = perm[j];
            perm[j] = swap;
            while (len > i)
            {
                swap = perm[len];
                perm[len--] = perm[i];
                perm[i++] = swap;
            }
        }
        return perm;
    }

    static void Main()
    {
        uint[] perm = kth_perm(999999, 10);  //            {2,7,8,3,9,1,5,4,6,0}
        perm = next_perm(perm);              //            {2,7,8,3,9,1,5,6,0,4}
        perm = new uint[] { 3, 1, 3, 1 };    //  {3,1,3,1}
        perm = next_perm(perm);              //  {3,3,1,1}
        perm = next_perm(perm);              //  {1,1,3,3}
        perm = next_perm(perm);              //  {1,3,1,3}
        perm = next_perm(perm);              //  {1,3,3,1}
        perm = next_perm(perm);              //  {3,1,1,3}
        perm = next_perm(perm);              //  {3,1,3,1}
    }
}

2014/01/20

The mth Combination

/*
In "www.MWC58 Fast RNG" appeared an off topic item: 
The mth 2-element subset of the set { 0, 1 , 2 , 3 , ... , n } , or

    "Generating the mth Lexicographical Element of a Mathematical Combination"

The author, Dr. James McCaffrey, remarks: "Huh?" Also by McCaffrey:

    "Using Permutations in .Net for Improved Systems Security"

Factorials (factoradics) are used to find the mth permutation of order n.
It might be a good idea to read that first. A few years ago I published code 
based on it, sorry, the site disappeared, but: The kth Permutation
 
Binomial Coefficients (combinadics) are used to find the mth combination of order n.
McCaffrey observed: "when you map from an index value to a vector value, alternate number
representations (factoradics, combinadics, ...) are often the key to an elegant solution".
 
A few examples:

A set contains 1 item (n=1): "0" (it will become clear that it is easier to work zero
based, like indices in an array), so we can choose 1 subset (k=1) , the set itself: "0"
With n=2 , the set is: {0,1} and k=1 , the first (m=0) subset is: "0" , the second one
(m=1) : "1" . With k=2 , 1 subset, the set itself.
    
     n=1                n=2          n=2          n=3           n=3          n=3 
     k=1                k=1          k=2          k=1           k=2          k=3
    s=set={0}          s={0,1}      s={0,1}      s={0,1,2}     s={0,1,2}    s={0,1,2}
    m  ss(=subset)     m  ss        m  ss        m  ss         m  ss        m  ss
    0  0               0  0         0  0,1       0  0          0  0,1       0  0,1,2
    1  0               1  1         1  0,1       1  1          1  0,2       1  0,1,2
    ....               2  0         ......       2  2          2  1,2       ........
                       ....                      3  0          3  0,1       
                                                 ....          ......        

The number of possible combinations: " n! / k! / (n-k)! " = Choose(n,k) = bnm(n,k)

         
Another combination algorithm:

    Gosper's hack (item 175) 

A description: A Novel Application (snoob, page 14)    

A remark from the author, Henry S. Warren: ... the problem of finding the next higher number
after a given number that has the same number of 1-bits. You are forgiven if you are asking,
"Why on earth would anyone want to compute that?" The "\\\\////" image above is made with it,
turn it 90 degrees, spot the pattern.
*/

class Combination
{
    private static byte snoob(byte b)
    {
        byte smallest, ripple, ones;              //   x = 00001010  {1,3}
        smallest = (byte)(b & (~b + 1));          //       00000010
        ripple = (byte)(b + smallest);            //       00001100
        ones = (byte)(b ^ ripple);                //       00000110
        ones = (byte)((ones >> 2) / smallest);    //       00000000
        return (byte)(ripple | ones);             //       00001100  {2,3}
    }

    private static uint[] comb = new uint[] { };
    private static uint g_n = 0, g_k = 0;

    private static void next_comb(uint n, uint k)
    {
        if (k == 0 || k > n) comb = new uint[] { };
        else
        {
            if (n != g_n || k != g_k)
            {
                g_n = n; g_k = k; comb = new uint[k];
                for (n = 1; n < k; ) comb[n] = n++;
            }
            else
            {
                if (comb[0] == n - k) for (n = 0; n < k; ) comb[n] = n++;
                else
                {
                    k--; n--; while (comb[k--] == n--) ;
                    k++; n = ++comb[k]; while (++k < g_k) comb[k] = ++n;
                    n = g_n;
                }
            }
        }
    }

    private static void next_comb() { next_comb(g_n, g_k); }

    private static uint[] mth_comb(uint m, uint n, uint k)
    {
        if (k == 0 || k > n) return new uint[] { };
        uint i = 0, nk = n - k;
        g_n = n - 1; g_k = k - 1;
        comb = new uint[k];
        uint c = bnm(n, k);
        m = c - m % c - 1;
        c = c * nk-- / n--;
        while (i < g_k)
        {
            while (c > m) c = c * nk-- / n--;  //  while (c > m) c = c * nk-- / n--;
            m -= c;                            //  McCaffrey: A binary search approach for
            comb[i++] = g_n - n;               //  large values of n could improve performance.
            c = c * k-- / n--;                 //      www.bitlength factorial  
        }                                      //  ==> approximation bitlength binomial,
        while (c > m) c = c * nk-- / n--;      //  ==> a way to narrow the search ?
        comb[i] = g_n - n;
        g_n++; g_k++;
        return comb;
    }

    private static uint bnm(uint n, uint k)
    {
        if (k > n) return 0;
        if (k > n / 2) k = n - k;
        uint nk = n - k, c = k = 1;
        while (n > nk) c = c * n-- / k++;
        return c;
    }

    static void Main()
    {
        mth_comb(5, 5, 3);  //  { 0, 3 , 4 }
        next_comb();        //  { 1, 2 , 3 }
        next_comb(4, 2);    //  { 0, 1 }
        next_comb();        //  { 0, 2 }
    }

    private static ulong bnm64(uint n, uint k)  //  Binomial Coefficient needs an update
    {
        checked
        {
            if (k > n) return 0;
            if (k > n / 2) k = n - k;
            uint nk = n - k;
            uint c = k = 1;
            ulong cc;
            try
            {
                while (n > nk) c = c * n-- / k++;
                return c;
            }
            catch
            {
                try
                {
                    n++;
                    cc = c;
                    while (n > nk) cc = cc * n-- / k++;
                    return cc;
                }
                catch
                {
                    unchecked
                    {
                        return 0;
                        // using System.Numerics.BigInteger;
                        // n++; 
                        // BigInteger C = cc; 
                        // while (n > nk) C = C * n-- / k++;
                        // return C;
                    }
                }
            }
        }
    }
}

2014/01/06

CMR63 Fast Scaled Random Number Generator part 5

/*
"CMR" stands for "Constant Multiply Rotate", 
"63" stands for the period: > 2^63 (almost 2^64).
It's  based on a publication by Mark A. Overton(2011):

    http://www. Fast, High-Quality, Parallel Random Number Generators
 
CMR63:

    private static uint cmr()
    {
        z0 *= m0;
        z1 *= m1;
        z0 = z0 << s0 | z0 >> r0;
        z1 = z1 << s1 | z1 >> r1;
        return z0 ^ z1;
    }

The constants can be found in: http://www.OvertonTables.zip (cmr-32.txt) 

A small part of the cmr-32 table:

    cmr-32:  x = Constant*x; x = rotl(x,RL); 
    Seed value can be 1, or SeedStart..(SeedStart+SeedCnt-1). 
    SB = number of bits in seed (2^SB <= SeedCnt).
    SeedCnt  SeedStart  SB   Period    Constant  RL Prime factors of period 
    -------- ---------- -- ---------- ---------- -- ----------------------- 
    62973467 1377002680 25 4294966876 3563976171 16 2^2 1073741719 
    33525602 3202323436 24 4294965919 1422968075 16 307 757 18481 
    32022541 1180658772 24 4294966152 1977089609 19 2^3 3 41 1051 4153 
    31280659  554048116 24 4294966449  433149435 17 3 1259 1137137
     2749407 2613580375 21 4294950337  272690735 19 4294950337 (prime) 
     1095230 3119024045 20 4294928147   64333559 18 4294928147 (prime) 
      998895 3255944955 19 4294915769 3152644205 13 4294915769 (prime) 
      664885 3993266363 19 4294881427 4031235431 15 4294881427 (prime) 

The full table contains 5586 data rows, 272 periods are prime. 
The smallest period is 4284340603. Thus the smallest period of two
combined generators (with periods having no common factors) is at least:
    4284340603^2  
    4284340603^2 > 2^63.99  ==>  2^63.99 < period < 2^64
I use seed values to get unique pairs of constants (multipliers),
assuming random sequences generated by those unique pairs are not
correlated. Constants belonging to the 272 prime periods can be combined
with other constants (belonging to 272 of the 5586 - 272 non prime periods).
A more refined scheme might get the number of combinations closer to 5586/2. 
On my pc (Athlon II X4 640, 3GHz, 2GB, XP) 10^9 iterations of CMR63 take
6.27 seconds, so the full period takes about 3600 years! 
    2^63.99 / 10^9 * 6.27 / (60 * 60 * 24 * 365) ~ 3600
 
The scaled version:

"cmr_uint(u)" returns a value x ( 0 <= x <= u ).
The time to generate one scaled random number is 21 ns, mean 14 ns,
more accurate, mean maximum time: 21 ns, mean mean is mean: 14 ns.
Lots of random bits are spoiled, it does not matter: 3600 years! 
 
CMR16:  
 
Uses rotation (shift) value 16. Periods of the sub generators
( z2 = m2 * ... , z3 = m3 * ... ) are prime.
Result: 22 Divergent High Quality Random Number Sequences.
*/

class cmr63
{
    private static int s0 = 16, r0 = 16, s1 = 15, r1 = 17; // init_cmr(0);
    private static uint m0 = 3563976171, z0 = 4125873261,
                        m1 = 4031235431, z1 = 3803445283;

    private static void init_cmr(uint seed)
    {
        byte[] s = { 16, 16, 19, 17, 19, 18, 13, 15 };
        uint[] m = { 3563976171, 1422968075, 1977089609,  433149435,
                      272690735,   64333559, 3152644205, 4031235431};
        seed &= 3; m0 = m[seed]; s0 = s[seed]; r0 = 32 - s0; z0 = 1;
        seed ^= 7; m1 = m[seed]; s1 = s[seed]; r1 = 32 - s1; z1 = 1;
        cmr();
    }

    private static uint cmr()
    {
        z0 *= m0;
        z1 *= m1;
        z0 = z0 << s0 | z0 >> r0;
        z1 = z1 << s1 | z1 >> r1;
        return z0 ^ z1;
    }

    private static uint g_u = 0;
    private static int g_s = 0;

    private static uint cmr_uint(uint u)
    {
        uint x;
        if (u == 0) return 0;
        if (u != g_u)
        {
            g_u = u;
            g_s = 32 - nob(u);
        }
        do
        {
            z0 *= m0;
            z1 *= m1;
            z0 = z0 << s0 | z0 >> r0;
            z1 = z1 << s1 | z1 >> r1;
            x = (z0 ^ z1) >> g_s;
        }
        while (x > u);
        return x;
    }

    private static int nob(uint u) // number of bits, position of the highest set bit, u > 0  
    {
        return
        u < 1u << 16 ? u < 1u << 08 ? u < 1u << 04 ? u < 1u << 02 ? u < 1u << 01 ? 01 : 02 :
                                                                    u < 1u << 03 ? 03 : 04 :
                                                     u < 1u << 06 ? u < 1u << 05 ? 05 : 06 :
                                                                    u < 1u << 07 ? 07 : 08 :
                                      u < 1u << 12 ? u < 1u << 10 ? u < 1u << 09 ? 09 : 10 :
                                                                    u < 1u << 11 ? 11 : 12 :
                                                     u < 1u << 14 ? u < 1u << 13 ? 13 : 14 :
                                                                    u < 1u << 15 ? 15 : 16 :
                       u < 1u << 24 ? u < 1u << 20 ? u < 1u << 18 ? u < 1u << 17 ? 17 : 18 :
                                                                    u < 1u << 19 ? 19 : 20 :
                                                     u < 1u << 22 ? u < 1u << 21 ? 21 : 22 :
                                                                    u < 1u << 23 ? 23 : 24 :
                                      u < 1u << 28 ? u < 1u << 26 ? u < 1u << 25 ? 25 : 26 :
                                                                    u < 1u << 27 ? 27 : 28 :
                                                     u < 1u << 30 ? u < 1u << 29 ? 29 : 30 :
                                                                    u < 1u << 31 ? 31 : 32;
    }

    static void Main()
    {
        // http://www.TestU01.BigCrush ?
    }

    private static uint m2 = 3745979853, z2 = 500031303,
                        m3 = 0623716905, z3 = 707339565;

    private static void init_cmr16(uint seed)
    {
        uint[] m = { 3745979853, 4055716687, 3693386591, 3542220329,
                     1775851103, 1866916287, 4188393139, 4141129223,
                     1173908643, 3198474053,   11119693, 1282266473,
                     4076777453, 3908725387, 3293562383, 2492630213,
                     1818407027,  608828557,  872259061, 2075607481,
                     1573125557, 2615661665, 1402711077, 3212405133,
                      680154359, 2023590663, 3458456891, 4184846215,
                     2408125305, 2558924297, 3008413683,  466035855,
                     1647905439, 2930730743,  733571709, 3997625831,
                     1919196763, 3392242035,  100431167,  579587817,
                     3074845609, 1931914705, 3131462569,  623716905};
        seed %= 22;
        m2 = m[seed];
        m3 = m[43 - seed];
        z2 = z3 = 1;
        cmr16();
    }

    private static uint cmr16()  //  2^63.99 < period < 2^64
    {
        z2 *= m2;
        z3 *= m3;
        z2 = z2 << 16 | z2 >> 16;
        z3 = z3 << 16 | z3 >> 16;
        return z2 ^ z3;
    }
}

2013/12/30

MWC58 Fast RNG small period part 4

/*
"MWC" stands for "Multiply With Carry", "58" stands for the period: > 2^58 
Actually the period is larger than 2^60, the first version had a smaller period.
A big drawback: The usable period is a lot smaller. Despite this drawback,
MWC58 might be usefull:
-128 non correlated sequences of  random numbers,
 each sequence contains at least 590 million numbers,
 sequences are repeatable, initialisation is fast.
-seed another RNG

It's  based on: Marsaglia's message(1998)
    
MWC58:
  
    private static uint mwc58()
    {
        z0 = m0 * (z0 & 65535) + (z0 >> 16);   //  both (m0<<15)-1 and (m0<<16)-1 are prime
        z1 = m1 * (z1 & 65535) + (z1 >> 16);   //  both (m1<<15)-1 and (m1<<16)-1 are prime
        return z0 + (z1 << 16);
    }

Why is this fast? From a publication by Mark A. Overton(2011):
 
    Fast, High-Quality, Parallel Random Number Generators
    
Two independent generators are used: z0 = m0 * ... , z1 = m1 * ...   
The compiler places instructions in an order that causes the processor to 
pipeline the computations of the generators. As a result of this parallelizing, 
the generators execute even faster than expected. The second multiply, z1 = m1 * ...
only adds one more clock cycle.  

The period "p0" of the first generator is: p0 = m0 * 2^15 ,
with a seed value "s0": 0 < s0 <= 2 * p0 - 2 .
The period of MWC58: m0 * m1 * 2^30 .
The usable period is a lot smaller. The 16 lsb's (least significant bits) are
the 16 lsb's of the first generator: z0 , with period: p0 = m0 * 2^15.
The most significant bits depend much on the second generator: z1 ,
where the highest bits have the largest dependencies.
For the smallest value of m0: 18030, MWC58 generates 18030 * 2^15 (~5.9 * 10^8)
random numbers in ~2.25 seconds. In other words: When 5.9 * 10^8 random numbers
are needed, MWC58 generates them fast.

With 18030 <= m <= 65184 , there are 256 valid multipliers:
    ushort[] m = { 18030 ... 65184 };
With fixed multipliers it is possible to use a 32 bits seed value,
from which valid values for s0 and s1 can be dereived.
When we want different sequences, using different seed values,
to be more divergent (less correlated), it is better to use the seed for
indices in the "ushort[] m" array.

Off topic: The k-th (ordered) combination of { 0, 1, 2, 3, ... , n } , or
the k-th 2-element subset of the set { 0, 1, 2, 3, ... , n }
Triangular numbers and their inverse are involved.


 
    private static uint[] tri_com(uint k, uint n) // n > 0                //     n = 3   
    {                                                                     //   k | i | j 
        uint t, i, j;                                                     //  ---|---|---
        t = n * (n + 1) / 2;                                              //   0 | 0 | 1 
        k %= t;                                                           //   1 | 0 | 2 
        i = n - 1 - (uint)((Math.Sqrt(8 * (t - k - 1) + 1) - 1) / 2);     //   2 | 0 | 3 
        j = (i * (i + 1) / 2 + k) % n + 1;                                //   3 | 1 | 2 
        uint[] ij = { i, j };                                             //   4 | 1 | 3 
        return ij;                                                        //   5 | 2 | 3 
    }                                                                     //   6 | 0 | 1 
                                                                       
When combined indices are used like: 0,3 1,2 (2 combinations)
random sequences generated by those combinations are less (not?) correlated.
With 256 multipliers there are 128 combinations.
 
The scaled version:

"mwc_uint(u)" returns a value x ( 0 <= x <= u ).
Before I used a trick to save random bits, which is not necessary any longer.
An example: mwc_uint(1) returns 0 or 1.
Under the hood 32 random bits are/were generated, they were put in a buffer,
from which the bits were taken out, one by one.
That required some code: buffer empty? refill, etc.
Some code takes some time.
Now one bit is returned, 31 remaining bits are not used,
it is faster to generate 32 new bits, and to use just one of them.
The rather small period is a bottleneck, a solution: CMR63

Results:
    
    10^7 times rand.Next() takes: 107 ms (~11 ns per random number)
    10^7 times rand.Next(int someValue) takes: 236 ms (~24 ns ...)
    
    |--------|--------------------------------------------------------|      
    |        |                        ms 10^7 *                       |      rand.Next()    11 ns      
    |        |-------------|--------------|-------------|-------------|          mwc58()     4 ns
    |      u | mwc_uint(u) | well_uint(u) | gen_uint(u) | g_uint_c(u) |     rand.Next(i)    24 ns                 
    |--------|------|------|-------|------|-------|-----|------|------|      mwc_uint(u)    18 ns, mean  ~12 ns
    |      0 |   20 |      |    24 |      |    24 |     |   24 |      |     well_uint(u)    44 ns, mean  ~32 ns  
    |      1 |      |  51  |       |   74 |       |  73 |      |  139 |      gen_uint(u)    50 ns, mean  ~35 ns   
    |      2 |   96 |      |   121 |      |   124 |     |  296 |      |      g_uint_c(u)   280 ns, mean ~250 ns
    |      3 |      |  51  |       |   80 |       |  78 |      |  211 |       
    |      4 |  133 |      |   222 |      |   242 |     |  518 |      |                 
    |      7 |      |  51  |       |   82 |       |  83 |      |  282 |               
    |      8 |  153 |      |   272 |      |   290 |     |  680 |      |     
    |  2^7-1 |      |  51  |       |   98 |       | 102 |      |  562 |     
    |  2^7-0 |  172 |      |   344 |      |   380 |     | 1079 |      |           
    | 2^15-1 |      |  51  |       |  131 |       | 130 |      | 1123 |               
    | 2^15-0 |  174 |      |   379 |      |   414 |     | 1634 |      |               
    | 2^30-1 |      |  51  |       |  193 |       | 193 |      | 2164 |  
    | 2^30-0 |  175 |      |   431 |      |   494 |     | 2672 |      |  
    | 2^31-1 |      |  51  |       |  195 |       | 210 |      | 2251 |  
    | 2^31-0 |  175 |      |   427 |      |   461 |     | 2747 |      |  
    | 2^32-1 |      |  51  |       |  196 |       | 211 |      | 2307 |  
    |--------|------|------|-------|------|-------|-----|------|------| 

http://www.cacert.at/cgi-bin/rngresults  (code: find and hover mouse over "MWC58")
 
|---------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|
|  Gene-  |  Speed  | Upload | Entropy  | Birthday | Matrix | 6*8 Ma- | Minimum  | Random   | Squeeze  | Overlap- |
|  rator  | (MiB/s) |  Size  | (->8)    | Spacing  | Ranks  | trix R  | Distance | Spheres  |          | ping Sums|
|-------- |---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|
|   mwc58 |  1000   | 19 MiB | 7.999990 | 0.161056 |  0.504 |  0.920  | 0.877503 | 0.432728 | 0.677887 | 0.458630 |
| well512 |   400   | 19 MiB | 7.999992 | 0.446363 |  0.636 |  0.879  | 0.967495 | 0.690789 | 0.942252 | 0.005987 |
|  Random |   370   | 17 MiB | 7.999989 | 0.000000 |  0.392 |  0.136  | 0.063920 | 0.905097 | 0.098440 | 0.005467 |
|     CSP |    34   | 15 MiB | 7.999987 | 0.032102 |  0.084 |  0.942  | 0.894277 | 0.036221 | 0.706538 | 0.023523 |
|-------- |---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|

*/

using System;
class mwc
{
    private static uint m0 = 18030, z0 = 325080900, m1 = 65184, z1 = 4248953856;

    private static uint mwc58()
    {
        z0 = m0 * (z0 & 65535) + (z0 >> 16);
        z1 = m1 * (z1 & 65535) + (z1 >> 16);
        return z0 + (z1 << 16);
    }

    private static uint g_u = 0;
    private static int g_s = 0;

    private static uint mwc_uint(uint u)
    {
        uint x;
        if (u == 0) return 0;
        if (u != g_u)
        {
            g_u = u;
            g_s = 32 - nob(u);
        }
        do
        {
            z0 = m0 * (z0 & 65535) + (z0 >> 16);
            z1 = m1 * (z1 & 65535) + (z1 >> 16);
            x = (z0 + (z1 << 16)) >> g_s;
        }
        while (x > u);
        return x;
    }

    private static void init(uint seed)
    {
        ushort[] m = {
            18030, 18273, 18513, 18879, 19074, 19098, 19164, 19215, 19584, 19599,
            19950, 20088, 20508, 20544, 20664, 20814, 20970, 21153, 21243, 21423,
            21723, 21954, 22125, 22188, 22293, 22860, 22938, 22965, 22974, 23109,
            23124, 23163, 23208, 23508, 23520, 23553, 23658, 23865, 24114, 24219,
            24660, 24699, 24864, 24948, 25023, 25308, 25443, 26004, 26088, 26154,
            26550, 26679, 26838, 27183, 27258, 27753, 27795, 27810, 27834, 27960,
            28320, 28380, 28689, 28710, 28794, 28854, 28959, 28980, 29013, 29379,
            29889, 30135, 30345, 30459, 30714, 30903, 30963, 31059, 31083, 31215,
            31353, 31488, 31743, 32430, 32718, 33105, 33189, 33249, 33375, 33378,
            33663, 33768, 33858, 33894, 34158, 34323, 34383, 34590, 34653, 34890,
            35355, 35523, 35643, 36309, 36594, 36804, 36969, 37698, 37935, 37959,
            38079, 38223, 38283, 38484, 38568, 38610, 38649, 38733, 38850, 39444,
            39618, 39690, 39948, 40833, 40995, 41019, 41064, 41289, 41628, 41793,
            41874, 42153, 42444, 42513, 42594, 42633, 42699, 42819, 42903, 42975,
            43038, 43155, 43473, 43563, 43995, 44019, 44568, 44574, 44994, 45723,
            45729, 45780, 45789, 45915, 45939, 46515, 47088, 47529, 48015, 48033,
            48195, 48204, 48393, 49209, 49248, 49299, 49458, 50034, 50223, 50580,
            50589, 50694, 50853, 50988, 51198, 51558, 51618, 51729, 51744, 51813,
            51873, 51933, 52023, 52215, 52275, 52509, 52743, 52950, 53130, 53199,
            53529, 53709, 53898, 53934, 53958, 54144, 54168, 54399, 54474, 54564,
            54885, 55044, 55074, 55179, 55254, 55680, 55809, 55848, 55869, 56205,
            56538, 56604, 56790, 56859, 57039, 57204, 57225, 57525, 57603, 57774,
            57780, 57918, 58149, 58368, 58443, 58758, 59253, 59325, 59775, 60009,
            60060, 60489, 60735, 60990, 61140, 61578, 61914, 62505, 62634, 62778,
            62790, 62865, 62874, 62904, 63129, 63273, 63444, 63663, 63765, 63885,
            64185, 64314, 64455, 64545, 64860, 65184 };

        seed &= 127; m0 = m[seed]; z0 = m0 * m0;
        seed ^= 255; m1 = m[seed]; z1 = m1 * m1;
    }

    private static int nob(uint u) // number of bits, position of the highest set bit, u > 0  
    {
        return
        u < 1u << 16 ? u < 1u << 08 ? u < 1u << 04 ? u < 1u << 02 ? u < 1u << 01 ? 01 : 02 :
                                                                    u < 1u << 03 ? 03 : 04 :
                                                     u < 1u << 06 ? u < 1u << 05 ? 05 : 06 :
                                                                    u < 1u << 07 ? 07 : 08 :
                                      u < 1u << 12 ? u < 1u << 10 ? u < 1u << 09 ? 09 : 10 :
                                                                    u < 1u << 11 ? 11 : 12 :
                                                     u < 1u << 14 ? u < 1u << 13 ? 13 : 14 :
                                                                    u < 1u << 15 ? 15 : 16 :
                       u < 1u << 24 ? u < 1u << 20 ? u < 1u << 18 ? u < 1u << 17 ? 17 : 18 :
                                                                    u < 1u << 19 ? 19 : 20 :
                                                     u < 1u << 22 ? u < 1u << 21 ? 21 : 22 :
                                                                    u < 1u << 23 ? 23 : 24 :
                                      u < 1u << 28 ? u < 1u << 26 ? u < 1u << 25 ? 25 : 26 :
                                                                    u < 1u << 27 ? 27 : 28 :
                                                     u < 1u << 30 ? u < 1u << 29 ? 29 : 30 :
                                                                    u < 1u << 31 ? 31 : 32;
    }

    static void Main()
    {
        // test ( part 1 ? )
    }    
}

2013/12/09

WELL512 scaled random numbers part 3


/*
"WELL" stands for "Well Equidistributed Long-period Linear",
"512" stands for the period: 2^512
Chris Lomont's version is used, visit:

    http://www.lomont.org/Math/Papers/2008/Lomont_PRNG_2008.pdf
  
and read about the advantages of the "WELL512" PRNG.
A "state" array of 16 uints is used, it's seeded by another PRNG: "System.Random".
"System.Random" uses a seed, the absolute value of an int.
Sequences of random numbers generated by System.Random, using different seed values,
are correlated, the left side of the image above clearly shows patterns.
The right side looks pretty random, sequences appear to be less correlated.
In other words: It is possible to regenerate a high quality random sequence,
using the same seed value. Different sequences, using different seed value's, 
seem to be less correlated.


    Results:
    
    10^7 times rand.Next() takes: 107 ms (~11 ns per random number)
    10^7 times rand.Next(int someValue) takes: 236 ms (~24 ns ...)
    
    |--------|------------------------------------------|       rand.Next()    11 ns      
    |        |                 ms 10^7 *                |      rand.Next(i)    24 ns              
    |        |--------------|-------------|-------------|      well_uint(u)    44 ns, mean  ~32 ns
    |      u | well_uint(u) | gen_uint(u) | g_uint_c(u) |       gen_uint(u)    50 ns, mean  ~35 ns    
    |--------|-------|------|-------|-----|------|------|       g_uint_c(u)   280 ns, mean ~250 ns
    |      0 |    24 |      |    24 |     |   24 |      |         
    |      1 |       |   74 |       |  73 |      |  139 |          
    |      2 |   121 |      |   124 |     |  296 |      |            
    |      3 |       |   80 |       |  78 |      |  211 |       
    |      4 |   222 |      |   242 |     |  518 |      |                 
    |      7 |       |   82 |       |  83 |      |  282 |               
    |      8 |   272 |      |   290 |     |  680 |      |     
    |  2^7-1 |       |   98 |       | 102 |      |  562 |     
    |  2^7-0 |   344 |      |   380 |     | 1079 |      |           
    | 2^15-1 |       |  131 |       | 130 |      | 1123 |               
    | 2^15-0 |   379 |      |   414 |     | 1634 |      |               
    | 2^30-1 |       |  193 |       | 193 |      | 2164 |  
    | 2^30-0 |   431 |      |   494 |     | 2672 |      |  
    | 2^31-1 |       |  195 |       | 210 |      | 2251 |  
    | 2^31-0 |   427 |      |   461 |     | 2747 |      |  
    | 2^32-1 |       |  196 |       | 211 |      | 2307 |  
    |--------|-------|------|-------|-----|------|------|


http://www.cacert.at/cgi-bin/rngresults  find and hover mouse over "well512" for the code.

|---------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|
|  Gene-  |  Speed  | Upload | Entropy  | Birthday | Matrix | 6*8 Ma- | Minimum  | Random   | Squeeze  | Overlap- |
|  rator  | (MiB/s) |  Size  | (->8)    | Spacing  | Ranks  | trix R  | Distance | Spheres  |          | ping Sums|
|-------- |---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|
| WELL512 |   400   | 19 MiB | 7.999992 | 0.446363 |  0.636 |  0.879  | 0.967495 | 0.690789 | 0.942252 | 0.005987 |
|  Random |   370   | 17 MiB | 7.999989 | 0.000000 |  0.392 |  0.136  | 0.063920 | 0.905097 | 0.098440 | 0.005467 |
|     CSP |    34   | 15 MiB | 7.999987 | 0.032102 |  0.084 |  0.942  | 0.894277 | 0.036221 | 0.706538 | 0.023523 |
|-------- |---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|  
  
*/

using System;
using System.Drawing; // add a ref

class scaled_well512
{
    private static uint[] state = new uint[16];
    private static int index = -1;

    private static void init_well(int seed)
    {
        seed = Math.Abs(seed);
        index = seed & 15;
        var rand = new Random(seed);
        uint u = (uint)rand.Next(1 << 30);
        for (int i = 0; i < 15; i++)
        {
            state[i] = (uint)rand.Next(1 << 30) << 2 | u & 3u;
            state[i] ^= (uint)rand.Next(1 << 30);
            state[i] ^= (uint)rand.Next(1 << 30);
            u >>= 2;
        }
        state[15] = (uint)rand.Next(1 << 30) << 2 | (uint)rand.Next(4);
        state[15] ^= (uint)rand.Next(1 << 30);
        state[15] ^= (uint)rand.Next(1 << 30);
    }

    private static uint well512()
    {
        uint a, b, c, d;
        a = state[index];
        c = state[index + 13 & 15];
        b = a ^ c ^ (a << 16) ^ (c << 15);
        c = state[index + 9 & 15];
        c ^= c >> 11;
        a = state[index] = b ^ c;
        d = a ^ ((a << 5) & 0xda442d24u);
        index = index + 15 & 15;
        a = state[index];
        state[index] = a ^ b ^ d ^ (a << 2) ^ (b << 18) ^ (c << 28);
        return state[index];
    }

    private static ulong buf = 0;              // 64 random bits buffer
    private static int av = 0;                 // available bits in buf
    private static uint g_u = 0;               // copy of u
    private static int g_nd = 0;               // nd (needed) bits = highest set bit of u
    private static int g_s = 0;                // shift
    private static bool ones = false;          // true if u = 10, 110, 1110, ...

    private static uint well_uint(uint u)
    {
        uint x; int s;
        if (u == 0) return 0;
        if (u != g_u)
        {
            g_u = u;
            g_nd = nob(u);
            g_s = 64 - g_nd;
            ones = (u + 1 & u + 2) == 0;
        }
    L0: if (av < g_nd)
        {
            av += 32;
            buf <<= 32;
            buf |= well512();
        }
        x = (uint)buf << g_s >> g_s;
        buf >>= g_nd;
        av -= g_nd;
        if (x <= u) return x;
        if (ones) goto L0;                     // faster when u = 2, 6, 14, ...
        s = nob(x ^ u) - 1;
        av += s;
        buf <<= s;
        buf ^= x;
        goto L0;
    }

    private static int nob(uint u) // number of bits, position of the highest set bit, u > 0  
    {
        return
        u < 1u << 16 ? u < 1u << 08 ? u < 1u << 04 ? u < 1u << 02 ? u < 1u << 01 ? 01 : 02 :
                                                                    u < 1u << 03 ? 03 : 04 :
                                                     u < 1u << 06 ? u < 1u << 05 ? 05 : 06 :
                                                                    u < 1u << 07 ? 07 : 08 :
                                      u < 1u << 12 ? u < 1u << 10 ? u < 1u << 09 ? 09 : 10 :
                                                                    u < 1u << 11 ? 11 : 12 :
                                                     u < 1u << 14 ? u < 1u << 13 ? 13 : 14 :
                                                                    u < 1u << 15 ? 15 : 16 :
                       u < 1u << 24 ? u < 1u << 20 ? u < 1u << 18 ? u < 1u << 17 ? 17 : 18 :
                                                                    u < 1u << 19 ? 19 : 20 :
                                                     u < 1u << 22 ? u < 1u << 21 ? 21 : 22 :
                                                                    u < 1u << 23 ? 23 : 24 :
                                      u < 1u << 28 ? u < 1u << 26 ? u < 1u << 25 ? 25 : 26 :
                                                                    u < 1u << 27 ? 27 : 28 :
                                                     u < 1u << 30 ? u < 1u << 29 ? 29 : 30 :
                                                                    u < 1u << 31 ? 31 : 32;
    }

    static void Main()
    {
        random_bitmap();
        well_bitmap();
    }

    private static void random_bitmap()
    {
        int xmax = 450;
        int ymax = 450;
        var bmp = new Bitmap(xmax + 1, ymax + 1);
        int x, y, i = 0; uint z;
        for (x = 0; x <= xmax; x++)
        {
            for (y = 0; y < ymax; y++)
            {
                var rand = new Random(i++);
                z = (uint)rand.Next(1 << 30);
                if ((z & 1) == 0) bmp.SetPixel(x, y, Color.White);
                else bmp.SetPixel(x, y, Color.Black);
            }
        }
        bmp.Save(xmax.ToString() + "R.bmp");
    }

    private static void well_bitmap()
    {
        int xmax = 450;
        int ymax = 450;
        var bmp = new Bitmap(xmax + 1, ymax + 1);
        int x, y, i = 0; uint z;
        for (x = 0; x <= xmax; x++)
        {
            for (y = 0; y < ymax; y++)
            {
                init_well(i);
                i += 1;
                //z = well512();
                z = well_uint(1 << 30);
                z >>= 0;
                if ((z & 1) == 0) bmp.SetPixel(x, y, Color.White);
                else bmp.SetPixel(x, y, Color.Black);
            }
        }
        bmp.Save(xmax.ToString() + "W.bmp");
    }
}