Pagina's

2016/08/14

Odd prime sieve


/* 105,000,000 Odd primes in 1 second on an i7-4790. It's almost a sieve of Eratosthenes.
Composites are marked by 24 threads running in cache friendly ranges, after they're counted,
a prime array of exactly the right size is filled with ... primes. Bit twiddling hacks used:
"count bits set, in parallel" and "count trailing zero bits with a de Bruijn sequence".
...Code colorized with: C# Syntax Highlighter...                  125,000,000 ? scroll down
 
            Old times      New times    (in seconds).  
      n      x      p       x      p    p(rimes) <= n < 2^32
    1e9   0.50   0.66    0.31   0.42    x: odd composites, marked in 
    2e9   1.02   1.31    0.63   0.84       essentially a bit array.
    4e9   2.13   2.65    1.29   1.70    best of 5
    ~0u   2.28   2.87    1.39   1.82    best of 5
*/
using System;
using System.Threading.Tasks;
using sw = System.Diagnostics.Stopwatch;
class oddPrimes
{
    static sw sw = new sw();
    static void Main()
    {
        uint m = (uint)1e9;
        sw.Start(); uint[] p = getPrimes(m);
        Console.Write(p.Length + " odd primes <= " + m);
        Console.Read();
    }

    static uint[] getPrimes(uint m) { return countPrimes(buildX(m)); }

    static uint[] buildX(uint m)  // odd composites
    {
        uint i, j, t, r; uint[] x, r0, r1;
        t = (uint)Environment.ProcessorCount * 3; r = 32768 * 6;
        m -= m / ~0u; m += m & 1; m /= 2;
        x = new uint[(m >> 5) + 1];
        x[m >> 5] = ~0u << (int)m;
        if (m > 32) x[0] = 0x9b4b3491;
        for (i = j = 0; i < m; i += j += 4) x[i >> 5] |= 1u << (int)i;
        r0 = new uint[t + 1]; r1 = new uint[t + 1];
        for (i = 0; i <= t; i++) { r0[i] = i * r; r1[i] = (i + 1) * r; } r *= t;
        while (r0[0] < m)
        {
            Parallel.For(0, t, k =>
            {
                uint b, c, d, e, f, g, m0, m1;
                if ((m0 = r0[k]) >= m) return;
                if ((m1 = r1[k]) > m) m1 = m;
                for (d = 3, b = e = 4; ; b += e += 4, d += 2)
                {
                    if ((c = b + d) >= m1) break;
                    if ((x[d >> 6] & (1 << (int)(d >> 1))) == 0)
                    {
                        if (c < m0) if ((c += (m0 - c) / d * d) < m0) c += d;
                        for (g = d << 1, f = c + d; f < m1; c += g, f += g)
                        { x[c >> 5] |= 1u << (int)c; x[f >> 5] |= 1u << (int)f; }
                        for (; c < m1; c += g) x[c >> 5] |= 1u << (int)c;
                    }
                }
            });
            for (j = 0; j <= t; j++) { r0[j] += r; r1[j] += r; }
        }
        time("x");
        return x;
    }

    static uint[] countPrimes(uint[] x)
    {
        int t = Environment.ProcessorCount; uint[] c = new uint[t];
        byte[] b = new byte[x.Length]; uint n;
        Parallel.For(0, t, k =>
        {
            uint xi, ck = 0;
            for (int i = x.Length - 1 - k; i >= 0; i -= t)
            {
                xi = x[i]; xi -= (xi >> 1) & 0x55555555;
                xi = (xi & 0x33333333) + (xi >> 2 & 0x33333333);
                ck += b[i] = (byte)(32 - ((xi + (xi >> 4) & 0xf0f0f0f) * 0x1010101 >> 24));
            }
            c[k] = ck;
        });
        for (n = c[0], t--; t > 0; t--) n += c[t];
        time("c");
        return buildPrimes(b, x, n);
    }

    static uint[] buildPrimes(byte[] b, uint[] x, uint n)
    {
        int j = b.Length - 1; uint[] c = new uint[8]; uint[] p = new uint[n];
        sbyte[] bruijn = {01,02,29,03,30,15,25,04,31,23,21,16,26,18,05,09,
                          32,28,14,24,22,20,17,08,27,13,19,07,12,06,11,10};
        for (uint s = 0, k = 1; k < 8 && k <= j; k++) c[k] += s += b[k - 1];
        Parallel.For(0, 8, k =>
        {
            int s, i = k; uint ck = c[k], ci, u, v; long xi;
            if (k > 0 && ck == 0) return;
            for (u = (uint)(k << 6) - 1; ; u += 512)
            {
                v = u; ci = ck; xi = ~x[i];
                while (xi > 0)
                {
                    s = bruijn[((uint)((xi & -xi) * 0x077cb531)) >> 27];
                    p[ci++] = v += (uint)s << 1;
                    xi >>= s;
                }
                i += 8; if (i > j) break;
                ck += (uint)(b[i - 1] + b[i - 2] + b[i - 3] + b[i - 4] +
                             b[i - 5] + b[i - 6] + b[i - 7] + b[i - 8]);
            }
        });
        time("p");
        return p;
    }

    static void time(string s) { Console.WriteLine(sw.ElapsedMilliseconds + " ms " + s); }
}
/* Nothing new under the sun, pre-sieving 3, 5 & 7.
First I did it the old way (scroll down), but a hard
coded table works faster. Actually the table can be
made "on the fly", with a few more small primes?

            Old times      New times (in seconds).
      n      x      p       x      p
    1e9   0.31   0.42    0.24   0.34
    2e9   0.63   0.84    0.48   0.68
    4e9   1.29   1.70    0.99   1.50 (average of 10 runs)
    ~0u   1.39   1.82    1.07   1.60 (average of 10000 ;)
*/
using System;
using System.Threading.Tasks;
using sw = System.Diagnostics.Stopwatch;
class oddPrimes
{
    static sw sw = new sw();
    static void Main()
    {
        uint m = (uint)1e9;
        sw.Start(); uint[] p = getPrimes(m);
        Console.Write(p.Length + " odd primes <= " + m);
        Console.Read();
    }

    static uint[] getPrimes(uint m) { return countPrimes(buildX(m)); }

    static uint[] buildX(uint m)
    {
        uint i, j, t, r; uint[] x, r0, r1;
        t = (uint)Environment.ProcessorCount * 3; r = 32768 * 6;
        m -= m / ~0u; m += m & 1; m >>= 1;
        x = new uint[(m >> 5) + 1]; x[0] = 0x9b4b3491;
        preMark((int)m >> 5, x);
        for (i = j = 0; i < m; i += j += 4) x[i >> 5] |= 1u << (int)i;
        r0 = new uint[t]; r1 = new uint[t];
        for (i = 0; i < t; i++) { r0[i] = i * r; r1[i] = (i + 1) * r; } r *= t;
        while (r0[0] < m)
        {
            Parallel.For(0, t, k =>
            {
                uint b, c, d, e, f, g, m0, m1;
                if ((m0 = r0[k]) >= m) return;
                if ((m1 = r1[k]) > m) m1 = m;
                for (d = 11, b = 60, e = 20; ; b += e += 4, d += 2)
                {
                    if ((c = b + d) >= m1) break;
                    if ((x[d >> 6] & (1 << (int)(d >> 1))) == 0)
                    {
                        if (c < m0) if ((c += (m0 - c) / d * d) < m0) c += d;
                        for (g = d << 1, f = c + d; f < m1; c += g, f += g)
                        { x[c >> 5] |= 1u << (int)c; x[f >> 5] |= 1u << (int)f; }
                        for (; c < m1; c += g) x[c >> 5] |= 1u << (int)c;
                    }
                }
            });
            for (j = 0; j < t; j++) { r0[j] += r; r1[j] += r; }
        }
        x[m >> 5] |= ~0u << (int)m;
        time("x");
        return x;
    }

    static void preMark(int m, uint[] x)
    {
        uint[] a =  // 3*5*7 uints = 420 bytes
        {
            0x6e92ed65,0x59a5b34d,0x96693cf2,0x25dacb36,0x4b669add,0xd279e4b3,0xb5966d2c,  // No
            0xcd35ba4b,0xf3c96696,0x2cda59a4,0x6b74976b,0x92cd2d9a,0xb4b349e7,0xe92ed659,  // co
            0x9a5b34d6,0x6693cf25,0x5dacb369,0xb669add2,0x279e4b34,0x5966d2cd,0xd35ba4bb,  // de
            0x3c96696c,0xcda59a4f,0xb74976b2,0x2cd2d9a6,0x4b349e79,0x92ed659b,0xa5b34d6e,  // an
            0x693cf259,0xdacb3696,0x669add25,0x79e4b34b,0x966d2cd2,0x35ba4bb5,0xc96696cd,  // aly
            0xda59a4f3,0x74976b2c,0xcd2d9a6b,0xb349e792,0x2ed659b4,0x5b34d6e9,0x93cf259a,  // sis
            0xacb36966,0x69add25d,0x9e4b34b6,0x66d2cd27,0x5ba4bb59,0x96696cd3,0xa59a4f3c,  // iss
            0x4976b2cd,0xd2d9a6b7,0x349e792c,0xed659b4b,0xb34d6e92,0x3cf259a5,0xcb369669,  // ue
            0x9add25da,0xe4b34b66,0x6d2cd279,0xba4bb596,0x6696cd35,0x59a4f3c9,0x976b2cda,  // s
            0x2d9a6b74,0x49e792cd,0xd659b4b3,0x34d6e92e,0xcf259a5b,0xb3696693,0xadd25dac,  // we
            0x4b34b669,0xd2cd279e,0xa4bb5966,0x696cd35b,0x9a4f3c96,0x76b2cda5,0xd9a6b749,  // re
            0x9e792cd2,0x659b4b34,0x4d6e92ed,0xf259a5b3,0x3696693c,0xdd25dacb,0xb34b669a,  // de
            0x2cd279e4,0x4bb5966d,0x96cd35ba,0xa4f3c966,0x6b2cda59,0x9a6b7497,0xe792cd2d,  // tec
            0x59b4b349,0xd6e92ed6,0x259a5b34,0x696693cf,0xd25dacb3,0x34b669ad,0xcd279e4b,  // te
            0xbb5966d2,0x6cd35ba4,0x4f3c9669,0xb2cda59a,0xa6b74976,0x792cd2d9,0x9b4b349e,  // d.
        };
        Parallel.For(0, 8, k =>
        {
            for (int i, j, m0 = 1 + k * 105, m1 = m0 + 104; m0 <= m; m0 += 840, m1 += 840)
            { if (m1 > m) m1 = m; j = 0; i = m0; while (i <= m1) x[i++] = a[j++]; }
        });
    }

    static uint[] countPrimes(uint[] x)
    {
        uint[] c = new uint[8]; byte[] b = new byte[x.Length];
        Parallel.For(0, 8, k =>
        {
            uint xi, ck = 0;
            for (int i = x.Length - 1 - k; i >= 0; i -= 8)
            {
                xi = x[i]; xi -= xi >> 1 & 0x55555555;
                xi = (xi & 0x33333333) + (xi >> 2 & 0x33333333);
                ck += b[i] = (byte)(32 - ((xi + (xi >> 4) & 0xf0f0f0f) * 0x1010101 >> 24));
            }
            c[k] = ck;
        });
        time("c");
        return buildPrimes(b, x, c[0] + c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7]);
    }

    static uint[] buildPrimes(byte[] b, uint[] x, uint n)
    {
        int j = b.Length - 1; byte[] c = new byte[8]; uint[] p = new uint[n];
        sbyte[] bruijn = {01,02,29,03,30,15,25,04,31,23,21,16,26,18,05,09,
                          32,28,14,24,22,20,17,08,27,13,19,07,12,06,11,10};
        for (byte s = 0, k = 1; k < 8 && k <= j; k++) c[k] += s += b[k - 1];
        Parallel.For(0, 8, k =>
        {
            int s, i = k; uint ck = c[k], ci, u, v; long xi;
            if (k > 0 && ck == 0) return;
            for (u = (uint)(k << 6) - 1; ; u += 512)
            {
                v = u; ci = ck; xi = ~x[i];
                while (xi > 0)
                {
                    s = bruijn[((uint)((xi & -xi) * 0x077cb531)) >> 27];
                    p[ci++] = v += (uint)s << 1;
                    xi >>= s;
                }
                i += 8; if (i > j) break;
                ck += (uint)(b[i - 1] + b[i - 2] + b[i - 3] + b[i - 4] +
                             b[i - 5] + b[i - 6] + b[i - 7] + b[i - 8]);
            }
        });
        time("p");
        return p;
    }

    static void time(string s) { Console.WriteLine(sw.ElapsedMilliseconds + " ms " + s); }
}

/*
    static void preMark(int m, uint[] x)  // the old way
    {
        Parallel.For(0, 3, k =>
        {
            if (k == 0) for (int i = 1; i <= m; i += 3) x[i] = 0x24924924;
            if (k == 1) for (int i = 2; i <= m; i += 3) x[i] = 0x49249249;
            if (k == 2) for (int i = 3; i <= m; i += 3) x[i] = 0x92492492;
        });
        Parallel.For(0, 5, k =>
        {
            if (k == 0) for (int i = 1; i <= m; i += 5) x[i] |= 0x42108421;
            if (k == 1) for (int i = 2; i <= m; i += 5) x[i] |= 0x10842108;
            if (k == 2) for (int i = 3; i <= m; i += 5) x[i] |= 0x84210842;
            if (k == 3) for (int i = 4; i <= m; i += 5) x[i] |= 0x21084210;
            if (k == 4) for (int i = 5; i <= m; i += 5) x[i] |= 0x08421084;
        });                                                                
        Parallel.For(0, 7, k =>                                            
        {                                                                  
            if (k == 0) for (int i = 1; i <= m; i += 7) x[i] |= 0x08102040;
            if (k == 1) for (int i = 2; i <= m; i += 7) x[i] |= 0x40810204;
            if (k == 2) for (int i = 3; i <= m; i += 7) x[i] |= 0x04081020;
            if (k == 3) for (int i = 4; i <= m; i += 7) x[i] |= 0x20408102;
            if (k == 4) for (int i = 5; i <= m; i += 7) x[i] |= 0x02040810;
            if (k == 5) for (int i = 6; i <= m; i += 7) x[i] |= 0x10204081;
            if (k == 6) for (int i = 7; i <= m; i += 7) x[i] |= 0x81020408;
        });
    }
*/

1 comment:

  1. I got here much interesting stuff. The post is great! Thanks for sharing it! Table E Strainer

    ReplyDelete