/*
 * 5799-WZQ (C) COPYRIGHT IBM CORPORATION 1987
 * LICENSED MATERIALS - PROPERTY OF IBM
 * REFER TO COPYRIGHT INSTRUCTIONS FORM NUMBER G120-2083
 */
/* $Header:mulu32.c 12.0$ */
/* $ACIS:mulu32.c 12.0$ */
/* $Source: /ibm/acis/usr/src/lib/libc/ca/gen/RCS/mulu32.c,v $ */

/* RCS identification string for this file; compiled out when `lint` is
   defined. */
#ifndef lint
static char *rcsid = "$Header:mulu32.c 12.0$";
#endif

/* MULU32 -- GET 3 HIGH ORDER WORDS OF 3-WORD by 2-WORD PRODUCT

The arguments are u and v, where u is a 3-digit number, and v is 
a 2-digit number. The digits are in a numbering system with base 2 
to the power 32, most significant digit first. The result is w, the 
first 3 digits of the product of these two numbers, in the same 
base; it is stored back into u, replacing the original contents.

The algorithm used is based on one given in D.E. Knuth, The Art 
of Computer Programming, volume 2, Seminumerical Algorithms, 
section 4.3.1: Algorithm M, as modified by exercise 15.

Instead of using the longword arguments and result u, v and w in 
the function, we use instead the bit-identical shortword 
equivalents, a, b, and c.  We are writing in C, which appears to 
have no way of dealing with the maximum product of two 32-bit 
integers, which needs 64 bits to represent it. This means we have 
to get our result by multiplying short operands to give long 
results. If we were able to use long multiplies to get double 
long results we could write a function which needed fewer 
multiplies.  

Developing the full shortword product would result in 10 terms, 
but the accuracy we seek requires only the first five. Thus we 
only multiply those factors which we know will contribute to the 
desired accuracy. For example, in computing triple-precision 
powers of 10, it has been determined empirically that results 
accurate to 64 bits can be obtained by only using those factors 
from the terms whose sum is 4 or less. This permits only 13 
multiplies to be used, as opposed to 23 which would be needed if 
all the partial products were obtained. The table below 
illustrates the savings in this case: 

         v index:   0  1  2  3

                0   0  1  2  3
         u      1   1  2  3  4
       index:   2   2  3  4  x
                3   3  4  x  x
                4   4  x  x  x
                5   x  x  x  x

an x in the table represents a factor which isn't needed. We 
actually provide a sixth term, since our caller is assumed to be 
expecting a 3-longword array result, but only the first five are 
guaranteed accurate. This suffices to give 64-bit accuracy. */

/*
 * _mulu32 -- multiply the 3-word number u by the 2-word number v and
 * store the three high-order words of the product back into u.
 *
 * Parameters:
 *   u  in/out: three 32-bit words, u[0] most significant.  Only the low
 *      32 bits of each element are read; on return each element holds
 *      one 32-bit word of the result.
 *   v  in: two 32-bit words, v[0] most significant.  Not modified.
 *
 * The operands are split into base-2^16 digits so that every partial
 * product fits in 32 bits.  Only partial products a[i]*b[j] with
 * i + j <= 4 are accumulated, so the lowest digit of u (the low half of
 * u[2]) never contributes and the low-order part of the result is
 * truncated rather than exact.
 *
 * The digits are extracted with shifts and masks rather than by
 * overlaying longs and shorts in a union, so the result does not depend
 * on the host's byte order or on sizeof(long).
 */
void _mulu32(unsigned long u[3], unsigned long v[2])
{
    enum {
        U_DIGITS = 6,       /* 16-bit digits in u */
        V_DIGITS = 4,       /* 16-bit digits in v */
        MAX_INDEX_SUM = 4   /* keep a[i]*b[j] only when i + j <= this */
    };
    unsigned long a[U_DIGITS];  /* digits of u, a[0] most significant */
    unsigned long b[V_DIGITS];  /* digits of v, b[0] most significant */
    unsigned long c[U_DIGITS];  /* digits of the result */
    unsigned long t;            /* digit product plus carries; fits in 32 bits */
    unsigned long k;            /* carry into the next higher digit */
    int i, j;

    /* Split the 32-bit words into 16-bit digits, high half first. */
    for (i = 0; i < U_DIGITS / 2; i++) {
        a[2 * i]     = (u[i] >> 16) & 0xFFFFUL;
        a[2 * i + 1] = u[i] & 0xFFFFUL;
    }
    for (j = 0; j < V_DIGITS / 2; j++) {
        b[2 * j]     = (v[j] >> 16) & 0xFFFFUL;
        b[2 * j + 1] = v[j] & 0xFFFFUL;
    }
    for (i = 0; i < U_DIGITS; i++) {
        c[i] = 0;
    }

    /* Schoolbook multiplication, least significant digit of v first. */
    for (j = V_DIGITS - 1; j >= 0; j--) {
        /* A zero multiplier digit contributes nothing; skip the row. */
        if (b[j] == 0) {
            continue;
        }

        k = 0;
        /* Starting at i = MAX_INDEX_SUM - j skips exactly the partial
           products with i + j > MAX_INDEX_SUM. */
        for (i = MAX_INDEX_SUM - j; i >= 0; i--) {
            t = k + c[i + j + 1];
            if (a[i] != 0) {
                /* Operands are unsigned long, so the product cannot
                   overflow a signed int via integer promotion. */
                t += a[i] * b[j];
            }
            /* Always deposit the digit and recompute the carry, even
               when a[i] is zero: a pending carry still belongs to this
               position and must not leak into a higher one. */
            c[i + j + 1] = t & 0xFFFFUL;
            k = t >> 16;
        }
        /* No earlier row has written c[j], so the final carry is simply
           stored there. */
        c[j] = k;
    }

    /* Reassemble the 16-bit digits into three 32-bit words. */
    for (i = 0; i < U_DIGITS / 2; i++) {
        u[i] = (c[2 * i] << 16) | c[2 * i + 1];
    }
}


