Pagina's

2013/08/23

Small Cube Roots ( < 2^64 )


/*
The time to take a cube root from a 64 bits ulong was: ~330 ns, new time: < 75 ns
The time to take a cube root from a 32 bits  uint was:  ~25 ns, new time: < 22 ns (mean)
* UPDATE (2013-09-22) new time for 64 bits: < 70 ns, casting float to int is faster than 
                                                     casting float to uint.

The 64 bits version: 
 
    www.hackersdelight.org/hdcodetxt/acbrt.c.txt
 
    Why? 0x2a5137a0

A union data type is used, not available in C#, first work around: "BitConverter", at the
bottom: "cro64old(x)". It takes ~110 ns, "BitConverter" takes ~10 ns, it's used four times,
so 40 ns are gone. A better solution:
 
    Unions (or an equivalent) in C# - Sairama's Tip of the Day

A 63 bits version might be slightly faster, no overflow checks, less correction loops?
 
 
The 32 bits version:
 
A while loop is replaced by a "do while", leading (triple) zero bits are counted to avoid
useless iterations, all (4294967295) roots in 92,7 s, mean time 21,6 ns.
** UPDATE (2013-09-26) new time for 32 bits: < 15 ns, see: Fast Cube Root 
 
  
    |-------------||--------------------------||-------------------------------------------| 
    |  cro12(x)   ||       cr032(x)           ||               cro64(random x)             |
    |------|------||------------|-------------||------|----------------------|-------------|
    |    x |  ns  ||          x |     ns   ** || bits |                    x |     ns   *  |  
    |------|------||------------|------|------||------|----------------------|------|------|  
    |    0 |  3,3 ||          0 |  5,8 |  2,4 ||   64 | 13397031575470575331 | 74,7 | 69,8 |  
    |    1 |  3,7 ||          1 |  6,5 |  2,3 ||   63 |  6512002244030281881 | 70,4 | 66,8 |  
    |    3 |  5,2 ||          3 |  6,5 |  2,3 ||   62 |  4508122487018397160 | 73,6 | 68,5 |  
    |    7 |  5,3 ||          7 |  6,5 |  2,3 ||   61 |  1458795770683441400 | 74,0 | 66,8 |  
    |   15 |  6,3 ||         15 |  7,7 |  3,6 ||   60 |   994257496981222664 | 74,4 | 66,8 |  
    |   31 |  7,3 ||         31 |  7,5 |  3,0 ||   59 |   470915095588726850 | 74,0 | 66,8 |  
    |   63 |  7,3 ||         63 |  7,5 |  3,0 ||   58 |   201745106158251712 | 74,0 | 66,8 |  
    |   64 |  7,3 ||         64 |  9,4 |  4,8 ||   57 |    78804375734046440 | 74,0 | 66,8 |  
    |   65 |  8,3 ||         65 |  9,4 |  4,6 ||   56 |    57442872892622787 | 74,0 | 66,8 |  
    |  100 |  8,3 ||        100 |  9,4 |  4,6 ||   55 |    19109171792964730 | 74,0 | 66,8 |  
    |  127 |  9,3 ||        127 |  9,4 |  4,6 ||   54 |     9099755032911121 | 74,0 | 66,8 |  
    |  128 |  9,3 ||        128 |  9,4 |  4,6 ||   53 |     8170442697918990 | 74,0 | 66,8 |  
    |  255 | 10,3 ||        255 |  9,4 |  5,0 ||   52 |     2913033827815339 | 74,0 | 66,8 |  
    |  256 | 10,3 ||        256 |  9,4 |  5,0 ||   51 |     1395743698554671 | 74,0 | 66,8 |  
    |  511 | 11,3 ||        511 |  9,6 |  5,0 ||   50 |      929340477894354 | 74,0 | 66,8 |  
    | 1023 | 14,3 ||       1023 | 10,7 |  5,0 ||   49 |      351768287627885 | 74,1 | 66,8 |  
    | 2047 | 16,3 ||       2047 | 10,5 |  5,7 ||   48 |      165110734465934 | 70,9 | 66,5 |  
    | 4095 | 23,3 ||       4095 | 11,5 |  5,3 ||   47 |      116364019662847 | 73,6 | 66,8 |  
    |      |      ||      32768 | 13,7 |  6,4 ||   46 |       45894645838798 | 70,9 | 66,5 |  
    |      |      ||    1048576 | 15,7 |  8,7 ||   45 |       25652614759428 | 70,9 | 65,5 |  
    |      |      ||  268435456 | 19,7 | 11,0 ||   44 |       12123210057895 | 71,7 | 65,5 |  
    |      |      ||  536870912 | 20,1 | 12,0 ||   43 |        5902555891371 | 71,7 | 65,5 |  
    |      |      || 1073741824 | 25,8 | 11,3 ||   42 |        3490450430912 | 71,7 | 65,5 |  
    |      |      || 1073741824 | 25,8 | 11,3 ||   41 |        1149630654634 | 71,7 | 65,5 |  
    |      |      || 1073774592 | 25,8 | 11,3 ||   40 |         902265632303 | 71,7 | 65,5 |  
    |      |      || 2147483648 | 21,1 | 12,3 ||   39 |         343393482192 | 71,7 | 65,5 |  
    |      |      || 4294967295 | 21,8 | 12,3 ||   38 |         198281228228 | 71,7 | 65,5 |  
    |      |      ||            |      |      ||   37 |          93252157892 | 71,7 | 65,5 |  
    |      |      ||            |      |      ||   36 |          41910484903 | 71,7 | 65,5 |
    |      |      ||            |      |      ||   35 |          23282355985 | 71,8 | 65,5 |
    |      |      ||            |      |      ||   34 |           8677323163 | 71,7 | 65,5 |
    |      |      ||            |      |      ||   33 |           6714386675 | 71,7 | 65,5 |
    |------|------||------------|------|------||------|----------------------|------|------|
 
*/
using System;
using System.Diagnostics;
using System.Runtime.InteropServices;

[StructLayout(LayoutKind.Explicit)]

public struct fu_32   // float <==> uint
{
    [FieldOffset(0)]
    public float f;
    [FieldOffset(0)]
    public uint u;
}
class cro_64_32
{
    private static uint cro64(ulong x)
    {
        if (x >= 18446724184312856125) return 2642245;
        float fx = (float)x;
        fu_32 fu32 = new fu_32();
        fu32.f = fx;
        uint uy = fu32.u / 4;
        uy += uy / 4;
        uy += uy / 16;
        uy += uy / 256;
        uy += 0x2a5137a0;
        fu32.u = uy;
        float fy = fu32.f;
        fy = 0.33333333f * (fx / (fy * fy) + 2.0f * fy);

        // uint y1 = (uint)
        //    (0.33333333f * (fx / (fy * fy) + 2.0f * fy));

        int y0 = (int)                                      // 2013-09-22
            (0.33333333f * (fx / (fy * fy) + 2.0f * fy));   // 5 ns 
        uint y1 = (uint)y0;                                 // faster

        ulong y2, y3;
        if (y1 >= 2642245)
        {
            y1 = 2642245;
            y2 = 6981458640025;
            y3 = 18446724184312856125;
        }
        else
        {
            y2 = (ulong)y1 * y1;
            y3 = y2 * y1;
        }
        if (y3 > x)
        {
            y1 -= 1;
            y2 -= 2 * y1 + 1;
            y3 -= 3 * y2 + 3 * y1 + 1;
            while (y3 > x)
            {
                y1 -= 1;
                y2 -= 2 * y1 + 1;
                y3 -= 3 * y2 + 3 * y1 + 1;
            }
            return y1;
        }
        do
        {
            y3 += 3 * y2 + 3 * y1 + 1;
            y2 += 2 * y1 + 1;
            y1 += 1;
        }
        while (y3 <= x);
        return y1 - 1;
    }

    private static uint cro32(uint x)
    {
        uint y = 0, z = 0, b = 0;
        int s = x < 1u << 24 ? x < 1u << 12 ? x < 1u << 06 ? x < 1u << 03 ? 00 : 03 :
                                                             x < 1u << 09 ? 06 : 09 :
                                              x < 1u << 18 ? x < 1u << 15 ? 12 : 15 :
                                                             x < 1u << 21 ? 18 : 21 :
                               x >= 1u << 30 ? 30 : x < 1u << 27 ? 24 : 27;
        do
        {
            y *= 2;
            z *= 4;
            b = 3 * y + 3 * z + 1 << s;
            if (x >= b)
            {
                x -= b;
                z += 2 * y + 1;
                y += 1;
            }
            s -= 3;
        }
        while (s >= 0);
        return y;
    }

    private static uint cro12(uint x)
    {
        uint y = 0, a = 0, b = 1, c = 0;
        while (a < x)
        {
            y++;
            b += c;
            a += b;
            c += 6;
        }
        if (a != x) y--;
        return y;
    }

    private static Stopwatch sw = new Stopwatch();
    static void Main()
    {
        cro64(long.MaxValue);
        check_perfect_cubes64();
        check_perfect_cubes32();
        time_cro64();
        time_cro32();
        time_cro12();
        cro32_all();              // ~90 s, sorry, no cro64_all() ;) it would take ~44 millennia.  
        Console.ReadLine();
    }
    private static void check_perfect_cubes64()
    {
        ulong n = 0, a = 0, b = 1, c = 6;
        while (n <= 2642245)
        {
            if (n != cro64(a)) Console.WriteLine("WRONG");
            a += b; b += c; c += 6; n++;
        }
        Console.WriteLine("CHECKED64");
    }
    private static void check_perfect_cubes32()
    {
        uint n = 0, a = 0, b = 1, c = 6;
        while (n <= 1625)
        {
            if (n != cro32(a)) Console.WriteLine("WRONG");
            a += b; b += c; c += 6; n++;
        }
        Console.WriteLine("CHECKED32");
    }

    private static Random random = new Random(0);       // Random(0)/Random(), same/random sequence 
    private static uint rnd32()                         // random uint
    {
        return (uint)(random.Next(1 << 16)) << 16 |
               (uint)(random.Next(1 << 16));
    }
    private static uint rnd32(uint n)                   // random uint of n bits
    {
        if (n < 2) return n;
        if (n > 32) n = 32;
        return (((uint)(random.Next(1 << 16)) << 16 |
                 (uint)(random.Next(1 << 16)))
                >> 32 - (int)n) | (1u << (int)n - 1);
    }
    private static ulong rnd64(uint n)                 // random ulong of n bits
    {
        if (n < 2) return n;
        if (n > 64) n = 64;
        ulong u = rnd32();
        u = u << 32 | rnd32();
        u >>= 64 - (int)n;
        u |= 1uL << (int)n - 1;
        return u;
    }

    private static void time_cro64()
    {
        int i; uint j; ulong x; double t;
        Console.WriteLine("|------|----------------------|------|");
        Console.WriteLine("| bits |                    x |  ns  |");
        Console.WriteLine("|------|----------------------|------|");
        Console.WriteLine("|             PRESS A KEY            |");
        Console.ReadLine();
        for (j = 64; j > 32; j -= 1)
        {
            x = rnd64(j);
            cro64(x);
            sw.Restart();
            for (i = 0; i < 10000000; i++) cro64(x);
            sw.Stop();
            t = sw.ElapsedMilliseconds;
            sw.Restart();
            for (i = 0; i < 10000000; i++) ; // nada
            sw.Stop();
            t -= sw.ElapsedMilliseconds;
            Console.WriteLine("|   {0,2} | {1,20} | {2:00.0} |", j, x, t / 10);
        }
        Console.WriteLine("|------|----------------------|------|");
        Console.WriteLine();
    }
    private static void time_cro32()
    {
        int i, j; uint x; double t;
        uint[] a = { 0, 1, 3, 7, 15, 31, 63, 64, 65,
                     100, 127, 128, 255, 256, 511, 1023, 2047, 4095,
                     1u << 15, 1u << 20, 1u << 28 ,1u << 29 , 1070599167,
                     1u << 30,1u << 30,(1u << 30) + (1u << 15), 1u << 31, ~0u };
        Console.WriteLine("|------------|------|");
        Console.WriteLine("|          x |  ns  |");
        Console.WriteLine("|------------|------|");
        for (j = 0; j < a.Length; j++)
        {
            x = a[j];
            cro32(x);
            sw.Restart();
            for (i = 0; i < 10000000; i++) cro32(x);
            sw.Stop();
            t = sw.ElapsedMilliseconds;
            sw.Restart();
            for (i = 0; i < 10000000; i++) ; // nada
            sw.Stop();
            t -= sw.ElapsedMilliseconds;
            Console.WriteLine("| {0,10} | {1:00.0} |", x, t / 10);
        }
        Console.WriteLine("|------------|------|");
        Console.WriteLine();
    }
    private static void time_cro12()
    {
        int i, j; uint x; double t;
        uint[] a = { 0, 1, 3, 7, 15, 31, 63, 64, 65,
                     100, 127, 128, 255, 256, 511, 1023, 2047, 4095 };
        Console.WriteLine("|------------|------|");
        Console.WriteLine("|          x |  ns  |");
        Console.WriteLine("|------------|------|");
        for (j = 0; j < a.Length; j++)
        {
            x = a[j];
            cro12(x);
            sw.Restart();
            for (i = 0; i < 10000000; i++) cro12(x);
            sw.Stop();
            t = sw.ElapsedMilliseconds;
            sw.Restart();
            for (i = 0; i < 10000000; i++) ; // nada
            sw.Stop();
            t -= sw.ElapsedMilliseconds;
            Console.WriteLine("| {0,10} | {1:00.0} |", x, t / 10);
        }
        Console.WriteLine("|------------|------|");
        Console.WriteLine();
    }
    private static void cro32_all()
    {
        uint x; double t;
        cro32(~0u);
        sw.Restart();
        for (x = 0; x < ~0u; x++) cro32(x);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (x = 0; x < ~0u; x++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.Write(x + " roots in " + t + " ms, ");
        Console.WriteLine("mean time {0:.00} ns",
                          (double)(t) * 1000000 / x);
        // 4294967295 roots in 92732 ms, mean time 21,59 ns
    }
    private static uint cro64old(ulong x)
    {
        if (x >= 18446724184312856125) return 2642245;                       // avoid overflow
        float fx = (float)x;           //------------------------------------------------------\
        uint uy = BitConverter.ToUInt32(BitConverter.GetBytes(fx), 0) / 4;   // ~20 ns          \
        uy += uy / 4;                                                        //                  \
        uy += uy / 16;                                                       //                   \
        uy += uy / 256;                                                      //                    \
        uy += 0x2a5137a0;                                                    //                     \
        float fy = BitConverter.ToSingle(BitConverter.GetBytes(uy), 0);      // ~20 ns              32 bits
        fy = 0.33333333f * (fx / (fy * fy) + 2.0f * fy);                     //                     /
        uint y1 = (uint)(0.33333333f * (fx / (fy * fy) + 2.0f * fy));        //                    /
        ulong y2, y3;                                                        //                   /
        if (y1 >= 2642245)                                                   // avoid overflow   /
        {                                                                    //                 /
            y1 = 2642245;              //------------------------------------------------------/                                                  
            y2 = 6981458640025;
            y3 = 18446724184312856125;
        }
        else
        {
            y2 = (ulong)y1 * y1;
            y3 = y2 * y1;
        }
        if (y3 > x)
        {
            y1 -= 1;
            y2 -= 2 * y1 + 1;
            y3 -= 3 * y2 + 3 * y1 + 1;
            while (y3 > x)
            {
                y1 -= 1;
                y2 -= 2 * y1 + 1;
                y3 -= 3 * y2 + 3 * y1 + 1;
            }
            return y1;
        }
        do
        {
            y3 += 3 * y2 + 3 * y1 + 1;
            y2 += 2 * y1 + 1;
            y1 += 1;
        }
        while (y3 <= x);
        return y1 - 1;
    }
}

No comments:

Post a Comment