Pagina's

2013/09/26

Fast Cube Root ( < 2^32 )

/*                                                                                                            **
 The time to take a cube root from a 32 bits uint was: < 26 ns (mean: < 22 ns)
                                             new time: < 14 ns (mean: < 13 ns)
 
 The trick used for small square roots is used again: "unroll the loop".
 First step: A binary search, the "if" statements, large values ( >= 1u << 24 )
 are found relatively faster, small values ( < 64 ) are handled during the search.
 Second step: A jump into the unrolled loop, the "labels".
 
      |--------------------------------| 
      |         cro32(x)               |                         
      |-----|-----||------------|------|
      |   x |  ns ||          x |   ns |
      |-----|-----||------------|------|
      |   0 | 2,4 ||        511 |  4,6 |
      |   1 | 2,3 ||       1023 |  5,3 |
      |   3 | 2,3 ||       2047 |  5,3 |
      |   7 | 2,3 ||       4095 |  5,0 |
      |  15 | 3,6 ||      32768 |  7,2 |
      |  31 | 3,0 ||    1048576 |  8,5 |
      |  63 | 3,0 ||  268435456 | 12,0 |
      |  64 | 5,0 ||  536870912 | 12,3 |     heaviest case,
      |  65 | 5,0 || 1070599167 | 13,7 | <== 1070599167 is the
      | 100 | 5,0 || 1070599168 | 13,7 |     3th power of 1023, 
      | 127 | 5,0 || 1073741824 | 12,3 |     binary: 1111111111
      | 128 | 5,0 || 1073774592 | 12,0 |
      | 255 | 4,6 || 2147483648 | 12,3 |
      | 256 | 4,6 || 4294967295 | 12,0 |
      |-----|-----||------------|------|

       4294967295 roots in 53884 ms, mean time 12,55 ns, faster than square root!
         UPDATE 2013/11/04 50686 ms, mean time 11,80 ns, see: part 2
 
 ** Original version: Figure 2: Architecture of the iteration (alternative A)
    A digit-by-digit algorithm for radix-2 cube root and its implementation (2004)
    J.-A.Pineiro, J.D.Bruguera, L.Ciminiera, P.Montuschi
  
*/
using System;
using System.Diagnostics;

class cro_32
{
    private static uint cro32(uint x)
    {
        uint y = 4u, z = 16u, b = 0u;
        if (x < 1u << 24)
            if (x < 1u << 12)
                if (x < 1u << 06)
                    if (x < 1u << 03)
                        return x == 0u ? 0u : 1u;
                    else
                        return x < 27u ? 2u : 3u;
                else
                    if (x < 1u << 09) goto L8; else goto L7;
            else
                if (x < 1u << 18)
                    if (x < 1u << 15) goto L6; else goto L5;
                else
                    if (x < 1u << 21) goto L4; else goto L3;
        else
            if (x < 1u << 30)
                if (x < 1u << 27) goto L2; else goto L1;

        if (x >= 27u << 27) { x -= 27u << 27; z = 36u; y = 6u; } else { x -= 1u << 30; } goto M0;
    L1: if (x >= 27u << 24) { x -= 27u << 24; z = 36u; y = 6u; } else { x -= 1u << 27; } goto M1;
    L2: if (x >= 27u << 21) { x -= 27u << 21; z = 36u; y = 6u; } else { x -= 1u << 24; } goto M2;
    L3: if (x >= 27u << 18) { x -= 27u << 18; z = 36u; y = 6u; } else { x -= 1u << 21; } goto M3;
    L4: if (x >= 27u << 15) { x -= 27u << 15; z = 36u; y = 6u; } else { x -= 1u << 18; } goto M4;
    L5: if (x >= 27u << 12) { x -= 27u << 12; z = 36u; y = 6u; } else { x -= 1u << 15; } goto M5;
    L6: if (x >= 27u << 09) { x -= 27u << 09; z = 36u; y = 6u; } else { x -= 1u << 12; } goto M6;
    L7: if (x >= 27u << 06) { x -= 27u << 06; z = 36u; y = 6u; } else { x -= 1u << 09; } goto M7;
    L8: if (x >= 27u << 03) { x -= 27u << 03; z = 36u; y = 6u; } else { x -= 1u << 06; } goto M8;

    M0: if (x >= 61u << 24) { x -= 61u << 24; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M1: b = (y + z) * 3 + 1u << 21; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M2: b = (y + z) * 3 + 1u << 18; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M3: b = (y + z) * 3 + 1u << 15; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M4: b = (y + z) * 3 + 1u << 12; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M5: b = (y + z) * 3 + 1u << 09; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M6: b = (y + z) * 3 + 1u << 06; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M7: b = (y + z) * 3 + 1u << 03; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M8: return x <= (y + z) * 3 ? y : y + 1u;
    }

    private static Stopwatch sw = new Stopwatch();
    static void Main()
    {
        cro32(225);
        cro32(1070599167);
        check_perfect_cubes32();
        time_cro32();
        cro32_all();
        Console.ReadLine();
    }

    private static void check_perfect_cubes32()
    {
        uint n = 0, a = 0, b = 1, c = 6;
        while (n <= 1625)
        {
            if (n != cro32(a)) Console.WriteLine("WRONG");
            a += b; b += c; c += 6; n++;
        }
        Console.WriteLine("CHECKED PERFECT CUBES");
    }

    private static void time_cro32()
    {
        int i, j; uint x; double t;
        uint[] a = { 0, 1, 3, 7, 15, 31, 63, 64, 65,
                     100, 127, 128, 255, 256, 511, 1023, 2047, 4095,
                     1u << 15, 1u << 20, 1u << 28 ,1u << 29 , 1070599167,
                     1070599168,1u << 30,(1u << 30) + (1u << 15), 1u << 31, ~0u };
        Console.WriteLine("|------------|------|");
        Console.WriteLine("|          x |  ns  |");
        Console.WriteLine("|------------|------|");
        for (j = 0; j < a.Length; j++)
        {
            x = a[j];
            cro32(x);
            sw.Restart();
            for (i = 0; i < 10000000; i++) cro32(x);
            sw.Stop();
            t = sw.ElapsedMilliseconds;
            sw.Restart();
            for (i = 0; i < 10000000; i++) ; // nada
            sw.Stop();
            t -= sw.ElapsedMilliseconds;
            Console.WriteLine("| {0,10} | {1:00.0} |", x, t / 10);
        }
        Console.WriteLine("|------------|------|");
        Console.WriteLine();
    }

    private static void cro32_all()
    {
        uint x; double t;
        cro32(~0u);
        sw.Restart();
        for (x = 0; x < ~0u; x++) cro32(x);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (x = 0; x < ~0u; x++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.Write(x + " roots in " + t + " ms, ");
        Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x);
        // 4294967295 roots in 53884 ms, mean time 12,55 ns
    }
}

2013/09/02

Fast Square Root ( < 2^32 )


/*
The time to take a square root from a 32 bits uint was: ~35 ns, new time: <16 ns (mean: <14 ns).
 
It's based on: A square root hardware algorithm from Guy L. Steele,Jr. 

The slightly optimized C# version below is nearly twice as fast. A few "goto Label" statements
are used, they are fairly easy to remove, but I doubt it will be any faster (at all).
  
             |--------------------------------|
             |        Times "cro32(x)"        |
             |      Athlon X4, XP, 2 GB       |        
             |-----|-----||------------|------|
             |   x |  ns ||          x |   ns |
             |-----|-----||------------|------|
             |   0 | 1,6 ||        511 |  4,3 |
             |   1 | 1,6 ||       1023 |  4,6 |
             |   3 | 1,6 ||       2047 |  5,0 |
             |   7 | 2,3 ||       4095 |  5,6 |           4294967295 roots in 56962 ms, 
             |  15 | 2,3 ||      32768 |  6,6 |                     mean time 13,26 ns.
             |  31 | 3,6 ||    1048576 | 10,0 |
             |  63 | 3,0 ||  268435456 | 13,7 |
             |  64 | 4,3 ||  536870912 | 11,4 |
             |  65 | 4,3 || 1070599167 | 13,0 |
             | 100 | 5,0 || 1073741824 | 15,1 |
             | 127 | 4,3 || 1073741824 | 15,1 |
             | 128 | 4,3 || 1073774592 | 15,1 |
             | 255 | 4,0 || 2147483648 | 13,1 |
             | 256 | 4,7 || 4294967295 | 15,7 |
             |-----|-----||------------|------|

*/
using System;                                                     
using System.Diagnostics;                                         
                                                                  
class Square_Root                                                 
{                                                                 
    private static uint sro32(uint x)
    {
        uint y, z;
        if (x < 1u << 16)
            if (x < 1u << 08)
                if (x < 1u << 04) return x < 1u << 02 ? x + 3u >> 2 : x + 15u >> 3;
                else
                    if (x < 1u << 06)
                    { y = 1u << 03; x -= 1u << 04; if (x >= 5u << 02) { x -= 5u << 02; y |= 1u << 02; } goto L0; }
                    else
                    { y = 1u << 05; x -= 1u << 06; if (x >= 5u << 04) { x -= 5u << 04; y |= 1u << 04; } goto L1; }
            else                                             // slower (on my pc): .... y = 3u << 04; } goto L1; }
                if (x < 1u << 12)
                    if (x < 1u << 10)
                    { y = 1u << 07; x -= 1u << 08; if (x >= 5u << 06) { x -= 5u << 06; y |= 1u << 06; } goto L2; }
                    else
                    { y = 1u << 09; x -= 1u << 10; if (x >= 5u << 08) { x -= 5u << 08; y |= 1u << 08; } goto L3; }
                else
                    if (x < 1u << 14)
                    { y = 1u << 11; x -= 1u << 12; if (x >= 5u << 10) { x -= 5u << 10; y |= 1u << 10; } goto L4; }
                    else
                    { y = 1u << 13; x -= 1u << 14; if (x >= 5u << 12) { x -= 5u << 12; y |= 1u << 12; } goto L5; }
        else
            if (x < 1u << 24)
                if (x < 1u << 20)
                    if (x < 1u << 18)
                    { y = 1u << 15; x -= 1u << 16; if (x >= 5u << 14) { x -= 5u << 14; y |= 1u << 14; } goto L6; }
                    else
                    { y = 1u << 17; x -= 1u << 18; if (x >= 5u << 16) { x -= 5u << 16; y |= 1u << 16; } goto L7; }
                else
                    if (x < 1u << 22)
                    { y = 1u << 19; x -= 1u << 20; if (x >= 5u << 18) { x -= 5u << 18; y |= 1u << 18; } goto L8; }
                    else
                    { y = 1u << 21; x -= 1u << 22; if (x >= 5u << 20) { x -= 5u << 20; y |= 1u << 20; } goto L9; }
            else
                if (x < 1u << 28)
                    if (x < 1u << 26)
                    { y = 1u << 23; x -= 1u << 24; if (x >= 5u << 22) { x -= 5u << 22; y |= 1u << 22; } goto La; }
                    else
                    { y = 1u << 25; x -= 1u << 26; if (x >= 5u << 24) { x -= 5u << 24; y |= 1u << 24; } goto Lb; }
                else
                    if (x < 1u << 30)
                    { y = 1u << 27; x -= 1u << 28; if (x >= 5u << 26) { x -= 5u << 26; y |= 1u << 26; } goto Lc; }
                    else
                    { y = 1u << 29; x -= 1u << 30; if (x >= 5u << 28) { x -= 5u << 28; y |= 1u << 28; } }

        z = y | 1u << 26; y /= 2; if (x >= z) { x -= z; y |= 1u << 26; }
    Lc: z = y | 1u << 24; y /= 2; if (x >= z) { x -= z; y |= 1u << 24; }
    Lb: z = y | 1u << 22; y /= 2; if (x >= z) { x -= z; y |= 1u << 22; }
    La: z = y | 1u << 20; y /= 2; if (x >= z) { x -= z; y |= 1u << 20; }
    L9: z = y | 1u << 18; y /= 2; if (x >= z) { x -= z; y |= 1u << 18; }
    L8: z = y | 1u << 16; y /= 2; if (x >= z) { x -= z; y |= 1u << 16; }
    L7: z = y | 1u << 14; y /= 2; if (x >= z) { x -= z; y |= 1u << 14; }
    L6: z = y | 1u << 12; y /= 2; if (x >= z) { x -= z; y |= 1u << 12; }
    L5: z = y | 1u << 10; y /= 2; if (x >= z) { x -= z; y |= 1u << 10; }
    L4: z = y | 1u << 08; y /= 2; if (x >= z) { x -= z; y |= 1u << 08; }
    L3: z = y | 1u << 06; y /= 2; if (x >= z) { x -= z; y |= 1u << 06; }
    L2: z = y | 1u << 04; y /= 2; if (x >= z) { x -= z; y |= 1u << 04; }
    L1: z = y | 1u << 02; y /= 2; if (x >= z) { x -= z; y |= 1u << 02; }
    L0: return x > y ? y / 2 | 1u : y / 2;
    }

    private static Stopwatch sw = new Stopwatch();
    static void Main()
    {
        Console.WriteLine(sro32(4));
        Console.WriteLine(sro32(40));
        Console.WriteLine(sro32(400));
        Console.WriteLine(sro32(4000));
        Console.WriteLine(sro32(40000));
        Console.WriteLine(sro32(400000));
        Console.WriteLine(sro32(4000000));
        Console.WriteLine(sro32(40000000));
        Console.WriteLine(sro32(400000000));
        Console.WriteLine(sro32(4000000000));
        test_perfect_squares();
        time_sro32();
        sro32_all(); // ~1 minute                                 
        Console.ReadLine();
    }
    private static void test_perfect_squares()
    {
        uint n = 0, a = 0, b = 1;
        do
        {
            if (sro32(a) != n) Console.WriteLine("WRONG");
            a += b;
            b += 2;
        }
        while (n++ < 65535);
        Console.WriteLine("Perfect squares tested");
    }
    private static void time_sro32()
    {
        int i, j; uint x; double t;
        //uint[] a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
        uint[] a = { 0, 1, 3, 7, 15, 31, 63, 64, 65,
                     100, 127, 128, 255, 256, 511, 1023, 2047, 4095,
                     1u << 15, 1u << 20, 1u << 28 ,1u << 29 , 1070599167,
                     1u << 30,1u << 30,(1u << 30) + (1u << 15), 1u << 31, 0xfffe0001u, ~0u };
        Console.WriteLine("|------------|------|");
        Console.WriteLine("|          x |  ns  |");
        Console.WriteLine("|------------|------|");
        for (j = 0; j < a.Length; j++)
        {
            x = a[j];
            sro32(x);
            sw.Restart();
            for (i = 0; i < 10000000; i++) sro32(x);
            sw.Stop();
            t = sw.ElapsedMilliseconds;
            sw.Restart();
            for (i = 0; i < 10000000; i++) ; // nada
            sw.Stop();
            t -= sw.ElapsedMilliseconds;
            Console.WriteLine("| {0,10} | {1:00.0} |", x, t / 10);
        }
        Console.WriteLine("|------------|------|");
        Console.WriteLine();
    }
    private static void sro32_all()
    {
        uint x; double t;
        sro32(~0u);
        sw.Restart();
        for (x = 0; x < ~0u; x++) sro32(x);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (x = 0; x < ~0u; x++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.Write(x + " roots in " + t + " ms, ");
        Console.WriteLine("mean time {0:.00} ns",
                          (double)(t) * 1000000 / x);
        // 4294967295 roots in 56962 ms, mean time 13,26 ns
    }
}
//------------------------------------------------------------
// Another one, slower, the line "t = ..." is done many times.
//
// private static uint sro32s(uint x)
//     uint y1 = 0, y2 = 0, t; int k = 15;
//     for (; k >= 0; k--)
//     {
//         t = y2 + (y1 << k + 1) | (1u << k * 2);
//         if (x >= t) { y2 = t; y1 |= 1u << k; }
//     }
//     return y1;
//-----------------------------------------------------------