Commit 158142c2c2df728cfa3b5320c65534921a764f26

Authored by bellard
1 parent 4f716dc6

soft float support


git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@1332 c046a42c-6fe2-441c-8c8c-71466251a162
Makefile.target
... ... @@ -229,7 +229,7 @@ ifeq ($(TARGET_ARCH), i386)
229 229 OBJS+= vm86.o
230 230 endif
231 231 ifeq ($(TARGET_ARCH), arm)
232   -OBJS+=nwfpe/softfloat.o nwfpe/fpa11.o nwfpe/fpa11_cpdo.o \
  232 +OBJS+=nwfpe/fpa11.o nwfpe/fpa11_cpdo.o \
233 233 nwfpe/fpa11_cpdt.o nwfpe/fpa11_cprt.o nwfpe/fpopcode.o nwfpe/single_cpdo.o \
234 234 nwfpe/double_cpdo.o nwfpe/extended_cpdo.o
235 235 endif
... ... @@ -237,8 +237,14 @@ SRCS:= $(OBJS:.o=.c)
237 237 OBJS+= libqemu.a
238 238  
239 239 # cpu emulator library
240   -LIBOBJS=exec.o kqemu.o translate-all.o cpu-exec.o\
  240 +LIBOBJS=exec.o kqemu.o translate-op.o translate-all.o cpu-exec.o\
241 241 translate.o op.o
  242 +ifdef CONFIG_SOFTFLOAT
  243 +LIBOBJS+=fpu/softfloat.o
  244 +else
  245 +LIBOBJS+=fpu/softfloat-native.o
  246 +endif
  247 +DEFINES+=-I$(SRC_PATH)/fpu
242 248  
243 249 ifeq ($(TARGET_ARCH), i386)
244 250 LIBOBJS+=helper.o helper2.o
... ... @@ -399,7 +405,9 @@ libqemu.a: $(LIBOBJS)
399 405  
400 406 translate.o: translate.c gen-op.h opc.h cpu.h
401 407  
402   -translate-all.o: translate-all.c op.h opc.h cpu.h
  408 +translate-all.o: translate-all.c opc.h cpu.h
  409 +
  410 +translate-op.o: translate-all.c op.h opc.h cpu.h
403 411  
404 412 op.h: op.o $(DYNGEN)
405 413 $(DYNGEN) -o $@ $<
... ...
configure
... ... @@ -593,6 +593,7 @@ fi
593 593 #echo "Creating $config_mak, $config_h and $target_dir/Makefile"
594 594  
595 595 mkdir -p $target_dir
  596 +mkdir -p $target_dir/fpu
596 597 if test "$target" = "arm-user" -o "$target" = "armeb-user" ; then
597 598 mkdir -p $target_dir/nwfpe
598 599 fi
... ... @@ -658,6 +659,10 @@ if test &quot;$target_user_only&quot; = &quot;yes&quot; ; then
658 659 echo "#define CONFIG_USER_ONLY 1" >> $config_h
659 660 fi
660 661  
  662 +if test "$target_cpu" = "arm" -o "$target_cpu" = "armeb" ; then
  663 + echo "CONFIG_SOFTFLOAT=yes" >> $config_mak
  664 + echo "#define CONFIG_SOFTFLOAT 1" >> $config_h
  665 +fi
661 666 # sdl defines
662 667  
663 668 if test "$target_user_only" = "no"; then
... ...
fpu/softfloat-macros.h 0 → 100644
  1 +
  2 +/*============================================================================
  3 +
  4 +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
  5 +Arithmetic Package, Release 2b.
  6 +
  7 +Written by John R. Hauser. This work was made possible in part by the
  8 +International Computer Science Institute, located at Suite 600, 1947 Center
  9 +Street, Berkeley, California 94704. Funding was partially provided by the
  10 +National Science Foundation under grant MIP-9311980. The original version
  11 +of this code was written as part of a project to build a fixed-point vector
  12 +processor in collaboration with the University of California at Berkeley,
  13 +overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  14 +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 +arithmetic/SoftFloat.html'.
  16 +
  17 +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  18 +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 +INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR
  24 +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25 +
  26 +Derivative works are acceptable, even for commercial purposes, so long as
  27 +(1) the source code for the derivative work includes prominent notice that
  28 +the work is derivative, and (2) the source code includes prominent notice with
  29 +these four paragraphs for those parts of this code that are retained.
  30 +
  31 +=============================================================================*/
  32 +
  33 +/*----------------------------------------------------------------------------
  34 +| Shifts `a' right by the number of bits given in `count'. If any nonzero
  35 +| bits are shifted off, they are ``jammed'' into the least significant bit of
  36 +| the result by setting the least significant bit to 1. The value of `count'
  37 +| can be arbitrarily large; in particular, if `count' is greater than 32, the
  38 +| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  39 +| The result is stored in the location pointed to by `zPtr'.
  40 +*----------------------------------------------------------------------------*/
  41 +
  42 +INLINE void shift32RightJamming( bits32 a, int16 count, bits32 *zPtr )
  43 +{
  44 + bits32 z;
  45 +
  46 + if ( count == 0 ) {
  47 + z = a;
  48 + }
  49 + else if ( count < 32 ) {
  50 + z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
  51 + }
  52 + else {
  53 + z = ( a != 0 );
  54 + }
  55 + *zPtr = z;
  56 +
  57 +}
  58 +
  59 +/*----------------------------------------------------------------------------
  60 +| Shifts `a' right by the number of bits given in `count'. If any nonzero
  61 +| bits are shifted off, they are ``jammed'' into the least significant bit of
  62 +| the result by setting the least significant bit to 1. The value of `count'
  63 +| can be arbitrarily large; in particular, if `count' is greater than 64, the
  64 +| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  65 +| The result is stored in the location pointed to by `zPtr'.
  66 +*----------------------------------------------------------------------------*/
  67 +
  68 +INLINE void shift64RightJamming( bits64 a, int16 count, bits64 *zPtr )
  69 +{
  70 + bits64 z;
  71 +
  72 + if ( count == 0 ) {
  73 + z = a;
  74 + }
  75 + else if ( count < 64 ) {
  76 + z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
  77 + }
  78 + else {
  79 + z = ( a != 0 );
  80 + }
  81 + *zPtr = z;
  82 +
  83 +}
  84 +
  85 +/*----------------------------------------------------------------------------
  86 +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  87 +| _plus_ the number of bits given in `count'. The shifted result is at most
  88 +| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  89 +| bits shifted off form a second 64-bit result as follows: The _last_ bit
  90 +| shifted off is the most-significant bit of the extra result, and the other
  91 +| 63 bits of the extra result are all zero if and only if _all_but_the_last_
  92 +| bits shifted off were all zero. This extra result is stored in the location
  93 +| pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  94 +| (This routine makes more sense if `a0' and `a1' are considered to form
  95 +| a fixed-point value with binary point between `a0' and `a1'. This fixed-
  96 +| point value is shifted right by the number of bits given in `count', and
  97 +| the integer part of the result is returned at the location pointed to by
  98 +| `z0Ptr'. The fractional part of the result may be slightly corrupted as
  99 +| described above, and is returned at the location pointed to by `z1Ptr'.)
  100 +*----------------------------------------------------------------------------*/
  101 +
  102 +INLINE void
  103 + shift64ExtraRightJamming(
  104 + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
  105 +{
  106 + bits64 z0, z1;
  107 + int8 negCount = ( - count ) & 63;
  108 +
  109 + if ( count == 0 ) {
  110 + z1 = a1;
  111 + z0 = a0;
  112 + }
  113 + else if ( count < 64 ) {
  114 + z1 = ( a0<<negCount ) | ( a1 != 0 );
  115 + z0 = a0>>count;
  116 + }
  117 + else {
  118 + if ( count == 64 ) {
  119 + z1 = a0 | ( a1 != 0 );
  120 + }
  121 + else {
  122 + z1 = ( ( a0 | a1 ) != 0 );
  123 + }
  124 + z0 = 0;
  125 + }
  126 + *z1Ptr = z1;
  127 + *z0Ptr = z0;
  128 +
  129 +}
  130 +
  131 +/*----------------------------------------------------------------------------
  132 +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  133 +| number of bits given in `count'. Any bits shifted off are lost. The value
  134 +| of `count' can be arbitrarily large; in particular, if `count' is greater
  135 +| than 128, the result will be 0. The result is broken into two 64-bit pieces
  136 +| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  137 +*----------------------------------------------------------------------------*/
  138 +
  139 +INLINE void
  140 + shift128Right(
  141 + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
  142 +{
  143 + bits64 z0, z1;
  144 + int8 negCount = ( - count ) & 63;
  145 +
  146 + if ( count == 0 ) {
  147 + z1 = a1;
  148 + z0 = a0;
  149 + }
  150 + else if ( count < 64 ) {
  151 + z1 = ( a0<<negCount ) | ( a1>>count );
  152 + z0 = a0>>count;
  153 + }
  154 + else {
  155 + z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0;
  156 + z0 = 0;
  157 + }
  158 + *z1Ptr = z1;
  159 + *z0Ptr = z0;
  160 +
  161 +}
  162 +
  163 +/*----------------------------------------------------------------------------
  164 +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  165 +| number of bits given in `count'. If any nonzero bits are shifted off, they
  166 +| are ``jammed'' into the least significant bit of the result by setting the
  167 +| least significant bit to 1. The value of `count' can be arbitrarily large;
  168 +| in particular, if `count' is greater than 128, the result will be either
  169 +| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  170 +| nonzero. The result is broken into two 64-bit pieces which are stored at
  171 +| the locations pointed to by `z0Ptr' and `z1Ptr'.
  172 +*----------------------------------------------------------------------------*/
  173 +
  174 +INLINE void
  175 + shift128RightJamming(
  176 + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
  177 +{
  178 + bits64 z0, z1;
  179 + int8 negCount = ( - count ) & 63;
  180 +
  181 + if ( count == 0 ) {
  182 + z1 = a1;
  183 + z0 = a0;
  184 + }
  185 + else if ( count < 64 ) {
  186 + z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
  187 + z0 = a0>>count;
  188 + }
  189 + else {
  190 + if ( count == 64 ) {
  191 + z1 = a0 | ( a1 != 0 );
  192 + }
  193 + else if ( count < 128 ) {
  194 + z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
  195 + }
  196 + else {
  197 + z1 = ( ( a0 | a1 ) != 0 );
  198 + }
  199 + z0 = 0;
  200 + }
  201 + *z1Ptr = z1;
  202 + *z0Ptr = z0;
  203 +
  204 +}
  205 +
  206 +/*----------------------------------------------------------------------------
  207 +| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  208 +| by 64 _plus_ the number of bits given in `count'. The shifted result is
  209 +| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  210 +| stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  211 +| off form a third 64-bit result as follows: The _last_ bit shifted off is
  212 +| the most-significant bit of the extra result, and the other 63 bits of the
  213 +| extra result are all zero if and only if _all_but_the_last_ bits shifted off
  214 +| were all zero. This extra result is stored in the location pointed to by
  215 +| `z2Ptr'. The value of `count' can be arbitrarily large.
  216 +| (This routine makes more sense if `a0', `a1', and `a2' are considered
  217 +| to form a fixed-point value with binary point between `a1' and `a2'. This
  218 +| fixed-point value is shifted right by the number of bits given in `count',
  219 +| and the integer part of the result is returned at the locations pointed to
  220 +| by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  221 +| corrupted as described above, and is returned at the location pointed to by
  222 +| `z2Ptr'.)
  223 +*----------------------------------------------------------------------------*/
  224 +
  225 +INLINE void
  226 + shift128ExtraRightJamming(
  227 + bits64 a0,
  228 + bits64 a1,
  229 + bits64 a2,
  230 + int16 count,
  231 + bits64 *z0Ptr,
  232 + bits64 *z1Ptr,
  233 + bits64 *z2Ptr
  234 + )
  235 +{
  236 + bits64 z0, z1, z2;
  237 + int8 negCount = ( - count ) & 63;
  238 +
  239 + if ( count == 0 ) {
  240 + z2 = a2;
  241 + z1 = a1;
  242 + z0 = a0;
  243 + }
  244 + else {
  245 + if ( count < 64 ) {
  246 + z2 = a1<<negCount;
  247 + z1 = ( a0<<negCount ) | ( a1>>count );
  248 + z0 = a0>>count;
  249 + }
  250 + else {
  251 + if ( count == 64 ) {
  252 + z2 = a1;
  253 + z1 = a0;
  254 + }
  255 + else {
  256 + a2 |= a1;
  257 + if ( count < 128 ) {
  258 + z2 = a0<<negCount;
  259 + z1 = a0>>( count & 63 );
  260 + }
  261 + else {
  262 + z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
  263 + z1 = 0;
  264 + }
  265 + }
  266 + z0 = 0;
  267 + }
  268 + z2 |= ( a2 != 0 );
  269 + }
  270 + *z2Ptr = z2;
  271 + *z1Ptr = z1;
  272 + *z0Ptr = z0;
  273 +
  274 +}
  275 +
  276 +/*----------------------------------------------------------------------------
  277 +| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  278 +| number of bits given in `count'. Any bits shifted off are lost. The value
  279 +| of `count' must be less than 64. The result is broken into two 64-bit
  280 +| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  281 +*----------------------------------------------------------------------------*/
  282 +
  283 +INLINE void
  284 + shortShift128Left(
  285 + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
  286 +{
  287 +
  288 + *z1Ptr = a1<<count;
  289 + *z0Ptr =
  290 + ( count == 0 ) ? a0 : ( a0<<count ) | ( a1>>( ( - count ) & 63 ) );
  291 +
  292 +}
  293 +
  294 +/*----------------------------------------------------------------------------
  295 +| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  296 +| by the number of bits given in `count'. Any bits shifted off are lost.
  297 +| The value of `count' must be less than 64. The result is broken into three
  298 +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  299 +| `z1Ptr', and `z2Ptr'.
  300 +*----------------------------------------------------------------------------*/
  301 +
  302 +INLINE void
  303 + shortShift192Left(
  304 + bits64 a0,
  305 + bits64 a1,
  306 + bits64 a2,
  307 + int16 count,
  308 + bits64 *z0Ptr,
  309 + bits64 *z1Ptr,
  310 + bits64 *z2Ptr
  311 + )
  312 +{
  313 + bits64 z0, z1, z2;
  314 + int8 negCount;
  315 +
  316 + z2 = a2<<count;
  317 + z1 = a1<<count;
  318 + z0 = a0<<count;
  319 + if ( 0 < count ) {
  320 + negCount = ( ( - count ) & 63 );
  321 + z1 |= a2>>negCount;
  322 + z0 |= a1>>negCount;
  323 + }
  324 + *z2Ptr = z2;
  325 + *z1Ptr = z1;
  326 + *z0Ptr = z0;
  327 +
  328 +}
  329 +
  330 +/*----------------------------------------------------------------------------
  331 +| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  332 +| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  333 +| any carry out is lost. The result is broken into two 64-bit pieces which
  334 +| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  335 +*----------------------------------------------------------------------------*/
  336 +
  337 +INLINE void
  338 + add128(
  339 + bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr )
  340 +{
  341 + bits64 z1;
  342 +
  343 + z1 = a1 + b1;
  344 + *z1Ptr = z1;
  345 + *z0Ptr = a0 + b0 + ( z1 < a1 );
  346 +
  347 +}
  348 +
  349 +/*----------------------------------------------------------------------------
  350 +| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  351 +| 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  352 +| modulo 2^192, so any carry out is lost. The result is broken into three
  353 +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  354 +| `z1Ptr', and `z2Ptr'.
  355 +*----------------------------------------------------------------------------*/
  356 +
  357 +INLINE void
  358 + add192(
  359 + bits64 a0,
  360 + bits64 a1,
  361 + bits64 a2,
  362 + bits64 b0,
  363 + bits64 b1,
  364 + bits64 b2,
  365 + bits64 *z0Ptr,
  366 + bits64 *z1Ptr,
  367 + bits64 *z2Ptr
  368 + )
  369 +{
  370 + bits64 z0, z1, z2;
  371 + int8 carry0, carry1;
  372 +
  373 + z2 = a2 + b2;
  374 + carry1 = ( z2 < a2 );
  375 + z1 = a1 + b1;
  376 + carry0 = ( z1 < a1 );
  377 + z0 = a0 + b0;
  378 + z1 += carry1;
  379 + z0 += ( z1 < carry1 );
  380 + z0 += carry0;
  381 + *z2Ptr = z2;
  382 + *z1Ptr = z1;
  383 + *z0Ptr = z0;
  384 +
  385 +}
  386 +
  387 +/*----------------------------------------------------------------------------
  388 +| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  389 +| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  390 +| 2^128, so any borrow out (carry out) is lost. The result is broken into two
  391 +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  392 +| `z1Ptr'.
  393 +*----------------------------------------------------------------------------*/
  394 +
  395 +INLINE void
  396 + sub128(
  397 + bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr )
  398 +{
  399 +
  400 + *z1Ptr = a1 - b1;
  401 + *z0Ptr = a0 - b0 - ( a1 < b1 );
  402 +
  403 +}
  404 +
  405 +/*----------------------------------------------------------------------------
  406 +| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  407 +| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  408 +| Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  409 +| result is broken into three 64-bit pieces which are stored at the locations
  410 +| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  411 +*----------------------------------------------------------------------------*/
  412 +
  413 +INLINE void
  414 + sub192(
  415 + bits64 a0,
  416 + bits64 a1,
  417 + bits64 a2,
  418 + bits64 b0,
  419 + bits64 b1,
  420 + bits64 b2,
  421 + bits64 *z0Ptr,
  422 + bits64 *z1Ptr,
  423 + bits64 *z2Ptr
  424 + )
  425 +{
  426 + bits64 z0, z1, z2;
  427 + int8 borrow0, borrow1;
  428 +
  429 + z2 = a2 - b2;
  430 + borrow1 = ( a2 < b2 );
  431 + z1 = a1 - b1;
  432 + borrow0 = ( a1 < b1 );
  433 + z0 = a0 - b0;
  434 + z0 -= ( z1 < borrow1 );
  435 + z1 -= borrow1;
  436 + z0 -= borrow0;
  437 + *z2Ptr = z2;
  438 + *z1Ptr = z1;
  439 + *z0Ptr = z0;
  440 +
  441 +}
  442 +
  443 +/*----------------------------------------------------------------------------
  444 +| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  445 +| into two 64-bit pieces which are stored at the locations pointed to by
  446 +| `z0Ptr' and `z1Ptr'.
  447 +*----------------------------------------------------------------------------*/
  448 +
  449 +INLINE void mul64To128( bits64 a, bits64 b, bits64 *z0Ptr, bits64 *z1Ptr )
  450 +{
  451 + bits32 aHigh, aLow, bHigh, bLow;
  452 + bits64 z0, zMiddleA, zMiddleB, z1;
  453 +
  454 + aLow = a;
  455 + aHigh = a>>32;
  456 + bLow = b;
  457 + bHigh = b>>32;
  458 + z1 = ( (bits64) aLow ) * bLow;
  459 + zMiddleA = ( (bits64) aLow ) * bHigh;
  460 + zMiddleB = ( (bits64) aHigh ) * bLow;
  461 + z0 = ( (bits64) aHigh ) * bHigh;
  462 + zMiddleA += zMiddleB;
  463 + z0 += ( ( (bits64) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
  464 + zMiddleA <<= 32;
  465 + z1 += zMiddleA;
  466 + z0 += ( z1 < zMiddleA );
  467 + *z1Ptr = z1;
  468 + *z0Ptr = z0;
  469 +
  470 +}
  471 +
  472 +/*----------------------------------------------------------------------------
  473 +| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  474 +| `b' to obtain a 192-bit product. The product is broken into three 64-bit
  475 +| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  476 +| `z2Ptr'.
  477 +*----------------------------------------------------------------------------*/
  478 +
  479 +INLINE void
  480 + mul128By64To192(
  481 + bits64 a0,
  482 + bits64 a1,
  483 + bits64 b,
  484 + bits64 *z0Ptr,
  485 + bits64 *z1Ptr,
  486 + bits64 *z2Ptr
  487 + )
  488 +{
  489 + bits64 z0, z1, z2, more1;
  490 +
  491 + mul64To128( a1, b, &z1, &z2 );
  492 + mul64To128( a0, b, &z0, &more1 );
  493 + add128( z0, more1, 0, z1, &z0, &z1 );
  494 + *z2Ptr = z2;
  495 + *z1Ptr = z1;
  496 + *z0Ptr = z0;
  497 +
  498 +}
  499 +
  500 +/*----------------------------------------------------------------------------
  501 +| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  502 +| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  503 +| product. The product is broken into four 64-bit pieces which are stored at
  504 +| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  505 +*----------------------------------------------------------------------------*/
  506 +
  507 +INLINE void
  508 + mul128To256(
  509 + bits64 a0,
  510 + bits64 a1,
  511 + bits64 b0,
  512 + bits64 b1,
  513 + bits64 *z0Ptr,
  514 + bits64 *z1Ptr,
  515 + bits64 *z2Ptr,
  516 + bits64 *z3Ptr
  517 + )
  518 +{
  519 + bits64 z0, z1, z2, z3;
  520 + bits64 more1, more2;
  521 +
  522 + mul64To128( a1, b1, &z2, &z3 );
  523 + mul64To128( a1, b0, &z1, &more2 );
  524 + add128( z1, more2, 0, z2, &z1, &z2 );
  525 + mul64To128( a0, b0, &z0, &more1 );
  526 + add128( z0, more1, 0, z1, &z0, &z1 );
  527 + mul64To128( a0, b1, &more1, &more2 );
  528 + add128( more1, more2, 0, z2, &more1, &z2 );
  529 + add128( z0, z1, 0, more1, &z0, &z1 );
  530 + *z3Ptr = z3;
  531 + *z2Ptr = z2;
  532 + *z1Ptr = z1;
  533 + *z0Ptr = z0;
  534 +
  535 +}
  536 +
  537 +/*----------------------------------------------------------------------------
  538 +| Returns an approximation to the 64-bit integer quotient obtained by dividing
  539 +| `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  540 +| divisor `b' must be at least 2^63. If q is the exact quotient truncated
  541 +| toward zero, the approximation returned lies between q and q + 2 inclusive.
  542 +| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  543 +| unsigned integer is returned.
  544 +*----------------------------------------------------------------------------*/
  545 +
  546 +static bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b )
  547 +{
  548 + bits64 b0, b1;
  549 + bits64 rem0, rem1, term0, term1;
  550 + bits64 z;
  551 +
  552 + if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
  553 + b0 = b>>32;
  554 + z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32;
  555 + mul64To128( b, z, &term0, &term1 );
  556 + sub128( a0, a1, term0, term1, &rem0, &rem1 );
  557 + while ( ( (sbits64) rem0 ) < 0 ) {
  558 + z -= LIT64( 0x100000000 );
  559 + b1 = b<<32;
  560 + add128( rem0, rem1, b0, b1, &rem0, &rem1 );
  561 + }
  562 + rem0 = ( rem0<<32 ) | ( rem1>>32 );
  563 + z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
  564 + return z;
  565 +
  566 +}
  567 +
  568 +/*----------------------------------------------------------------------------
  569 +| Returns an approximation to the square root of the 32-bit significand given
  570 +| by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  571 +| `aExp' (the least significant bit) is 1, the integer returned approximates
  572 +| 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  573 +| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  574 +| case, the approximation returned lies strictly within +/-2 of the exact
  575 +| value.
  576 +*----------------------------------------------------------------------------*/
  577 +
  578 +static bits32 estimateSqrt32( int16 aExp, bits32 a )
  579 +{
  580 + static const bits16 sqrtOddAdjustments[] = {
  581 + 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
  582 + 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
  583 + };
  584 + static const bits16 sqrtEvenAdjustments[] = {
  585 + 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
  586 + 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
  587 + };
  588 + int8 index;
  589 + bits32 z;
  590 +
  591 + index = ( a>>27 ) & 15;
  592 + if ( aExp & 1 ) {
  593 + z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ index ];
  594 + z = ( ( a / z )<<14 ) + ( z<<15 );
  595 + a >>= 1;
  596 + }
  597 + else {
  598 + z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ index ];
  599 + z = a / z + z;
  600 + z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
  601 + if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 );
  602 + }
  603 + return ( (bits32) ( ( ( (bits64) a )<<31 ) / z ) ) + ( z>>1 );
  604 +
  605 +}
  606 +
  607 +/*----------------------------------------------------------------------------
  608 +| Returns the number of leading 0 bits before the most-significant 1 bit of
  609 +| `a'. If `a' is zero, 32 is returned.
  610 +*----------------------------------------------------------------------------*/
  611 +
  612 +static int8 countLeadingZeros32( bits32 a )
  613 +{
  614 + static const int8 countLeadingZerosHigh[] = {
  615 + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  616 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  617 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  618 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  619 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  620 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  621 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  622 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  623 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  624 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  625 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  626 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  627 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  628 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  629 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  630 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  631 + };
  632 + int8 shiftCount;
  633 +
  634 + shiftCount = 0;
  635 + if ( a < 0x10000 ) {
  636 + shiftCount += 16;
  637 + a <<= 16;
  638 + }
  639 + if ( a < 0x1000000 ) {
  640 + shiftCount += 8;
  641 + a <<= 8;
  642 + }
  643 + shiftCount += countLeadingZerosHigh[ a>>24 ];
  644 + return shiftCount;
  645 +
  646 +}
  647 +
  648 +/*----------------------------------------------------------------------------
  649 +| Returns the number of leading 0 bits before the most-significant 1 bit of
  650 +| `a'. If `a' is zero, 64 is returned.
  651 +*----------------------------------------------------------------------------*/
  652 +
  653 +static int8 countLeadingZeros64( bits64 a )
  654 +{
  655 + int8 shiftCount;
  656 +
  657 + shiftCount = 0;
  658 + if ( a < ( (bits64) 1 )<<32 ) {
  659 + shiftCount += 32;
  660 + }
  661 + else {
  662 + a >>= 32;
  663 + }
  664 + shiftCount += countLeadingZeros32( a );
  665 + return shiftCount;
  666 +
  667 +}
  668 +
  669 +/*----------------------------------------------------------------------------
  670 +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  671 +| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  672 +| Otherwise, returns 0.
  673 +*----------------------------------------------------------------------------*/
  674 +
  675 +INLINE flag eq128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
  676 +{
  677 +
  678 + return ( a0 == b0 ) && ( a1 == b1 );
  679 +
  680 +}
  681 +
  682 +/*----------------------------------------------------------------------------
  683 +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  684 +| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  685 +| Otherwise, returns 0.
  686 +*----------------------------------------------------------------------------*/
  687 +
  688 +INLINE flag le128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
  689 +{
  690 +
  691 + return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
  692 +
  693 +}
  694 +
  695 +/*----------------------------------------------------------------------------
  696 +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  697 +| than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  698 +| returns 0.
  699 +*----------------------------------------------------------------------------*/
  700 +
  701 +INLINE flag lt128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
  702 +{
  703 +
  704 + return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
  705 +
  706 +}
  707 +
  708 +/*----------------------------------------------------------------------------
  709 +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
  710 +| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
  711 +| Otherwise, returns 0.
  712 +*----------------------------------------------------------------------------*/
  713 +
  714 +INLINE flag ne128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
  715 +{
  716 +
  717 + return ( a0 != b0 ) || ( a1 != b1 );
  718 +
  719 +}
  720 +
... ...
fpu/softfloat-native.c 0 → 100644
  1 +/* Native implementation of soft float functions. Only a single status
  2 + context is supported */
  3 +#include "softfloat.h"
  4 +#include <math.h>
  5 +
  6 +void set_float_rounding_mode(int val STATUS_PARAM)
  7 +{
  8 + STATUS(float_rounding_mode) = val;
  9 +#if defined(_BSD) && !defined(__APPLE__)
  10 + fpsetround(val);
  11 +#elif defined(__arm__)
  12 + /* nothing to do */
  13 +#else
  14 + fesetround(val);
  15 +#endif
  16 +}
  17 +
  18 +#ifdef FLOATX80
  19 +void set_floatx80_rounding_precision(int val STATUS_PARAM)
  20 +{
  21 + STATUS(floatx80_rounding_precision) = val;
  22 +}
  23 +#endif
  24 +
  25 +#if defined(_BSD)
  26 +#define lrint(d) ((int32_t)rint(d))
  27 +#define llrint(d) ((int64_t)rint(d))
  28 +#endif
  29 +
  30 +#if defined(__powerpc__)
  31 +
  32 +/* correct (but slow) PowerPC rint() (glibc version is incorrect) */
  33 +double qemu_rint(double x)
  34 +{
  35 + double y = 4503599627370496.0;
  36 + if (fabs(x) >= y)
  37 + return x;
  38 + if (x < 0)
  39 + y = -y;
  40 + y = (x + y) - y;
  41 + if (y == 0.0)
  42 + y = copysign(y, x);
  43 + return y;
  44 +}
  45 +
  46 +#define rint qemu_rint
  47 +#endif
  48 +
  49 +/*----------------------------------------------------------------------------
  50 +| Software IEC/IEEE integer-to-floating-point conversion routines.
  51 +*----------------------------------------------------------------------------*/
  52 +float32 int32_to_float32(int v STATUS_PARAM)
  53 +{
  54 + return (float32)v;
  55 +}
  56 +
  57 +float64 int32_to_float64(int v STATUS_PARAM)
  58 +{
  59 + return (float64)v;
  60 +}
  61 +
  62 +#ifdef FLOATX80
  63 +floatx80 int32_to_floatx80(int v STATUS_PARAM)
  64 +{
  65 + return (floatx80)v;
  66 +}
  67 +#endif
  68 +float32 int64_to_float32( int64_t v STATUS_PARAM)
  69 +{
  70 + return (float32)v;
  71 +}
  72 +float64 int64_to_float64( int64_t v STATUS_PARAM)
  73 +{
  74 + return (float64)v;
  75 +}
  76 +#ifdef FLOATX80
  77 +floatx80 int64_to_floatx80( int64_t v STATUS_PARAM)
  78 +{
  79 + return (floatx80)v;
  80 +}
  81 +#endif
  82 +
  83 +/*----------------------------------------------------------------------------
  84 +| Software IEC/IEEE single-precision conversion routines.
  85 +*----------------------------------------------------------------------------*/
  86 +int float32_to_int32( float32 a STATUS_PARAM)
  87 +{
  88 + return lrintf(a);
  89 +}
  90 +int float32_to_int32_round_to_zero( float32 a STATUS_PARAM)
  91 +{
  92 + return (int)a;
  93 +}
  94 +int64_t float32_to_int64( float32 a STATUS_PARAM)
  95 +{
  96 + return llrintf(a);
  97 +}
  98 +
  99 +int64_t float32_to_int64_round_to_zero( float32 a STATUS_PARAM)
  100 +{
  101 + return (int64_t)a;
  102 +}
  103 +
  104 +float64 float32_to_float64( float32 a STATUS_PARAM)
  105 +{
  106 + return a;
  107 +}
  108 +#ifdef FLOATX80
  109 +floatx80 float32_to_floatx80( float32 a STATUS_PARAM)
  110 +{
  111 + return a;
  112 +}
  113 +#endif
  114 +
  115 +/*----------------------------------------------------------------------------
  116 +| Software IEC/IEEE single-precision operations.
  117 +*----------------------------------------------------------------------------*/
  118 +float32 float32_round_to_int( float32 a STATUS_PARAM)
  119 +{
  120 + return rintf(a);
  121 +}
  122 +
  123 +float32 float32_sqrt( float32 a STATUS_PARAM)
  124 +{
  125 + return sqrtf(a);
  126 +}
  127 +char float32_is_signaling_nan( float32 a1)
  128 +{
  129 + float32u u;
  130 + uint32_t a;
  131 + u.f = a1;
  132 + a = u.i;
  133 + return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF );
  134 +}
  135 +
  136 +/*----------------------------------------------------------------------------
  137 +| Software IEC/IEEE double-precision conversion routines.
  138 +*----------------------------------------------------------------------------*/
  139 +int float64_to_int32( float64 a STATUS_PARAM)
  140 +{
  141 + return lrint(a);
  142 +}
  143 +int float64_to_int32_round_to_zero( float64 a STATUS_PARAM)
  144 +{
  145 + return (int)a;
  146 +}
  147 +int64_t float64_to_int64( float64 a STATUS_PARAM)
  148 +{
  149 + return llrint(a);
  150 +}
  151 +int64_t float64_to_int64_round_to_zero( float64 a STATUS_PARAM)
  152 +{
  153 + return (int64_t)a;
  154 +}
  155 +float32 float64_to_float32( float64 a STATUS_PARAM)
  156 +{
  157 + return a;
  158 +}
  159 +#ifdef FLOATX80
  160 +floatx80 float64_to_floatx80( float64 a STATUS_PARAM)
  161 +{
  162 + return a;
  163 +}
  164 +#endif
  165 +#ifdef FLOAT128
  166 +float128 float64_to_float128( float64 a STATUS_PARAM)
  167 +{
  168 + return a;
  169 +}
  170 +#endif
  171 +
  172 +/*----------------------------------------------------------------------------
  173 +| Software IEC/IEEE double-precision operations.
  174 +*----------------------------------------------------------------------------*/
  175 +float64 float64_round_to_int( float64 a STATUS_PARAM )
  176 +{
  177 +#if defined(__arm__)
  178 + switch(STATUS(float_rounding_mode)) {
  179 + default:
  180 + case float_round_nearest_even:
  181 + asm("rndd %0, %1" : "=f" (a) : "f"(a));
  182 + break;
  183 + case float_round_down:
  184 + asm("rnddm %0, %1" : "=f" (a) : "f"(a));
  185 + break;
  186 + case float_round_up:
  187 + asm("rnddp %0, %1" : "=f" (a) : "f"(a));
  188 + break;
  189 + case float_round_to_zero:
  190 + asm("rnddz %0, %1" : "=f" (a) : "f"(a));
  191 + break;
  192 + }
  193 +#else
  194 + return rint(a);
  195 +#endif
  196 +}
  197 +
  198 +float64 float64_sqrt( float64 a STATUS_PARAM)
  199 +{
  200 + return sqrt(a);
  201 +}
  202 +char float64_is_signaling_nan( float64 a1)
  203 +{
  204 + float64u u;
  205 + uint64_t a;
  206 + u.f = a1;
  207 + a = u.i;
  208 + return
  209 + ( ( ( a>>51 ) & 0xFFF ) == 0xFFE )
  210 + && ( a & LIT64( 0x0007FFFFFFFFFFFF ) );
  211 +
  212 +}
  213 +
  214 +#ifdef FLOATX80
  215 +
  216 +/*----------------------------------------------------------------------------
  217 +| Software IEC/IEEE extended double-precision conversion routines.
  218 +*----------------------------------------------------------------------------*/
  219 +int floatx80_to_int32( floatx80 a STATUS_PARAM)
  220 +{
  221 + return lrintl(a);
  222 +}
  223 +int floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM)
  224 +{
  225 + return (int)a;
  226 +}
  227 +int64_t floatx80_to_int64( floatx80 a STATUS_PARAM)
  228 +{
  229 + return llrintl(a);
  230 +}
  231 +int64_t floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM)
  232 +{
  233 + return (int64_t)a;
  234 +}
  235 +float32 floatx80_to_float32( floatx80 a STATUS_PARAM)
  236 +{
  237 + return a;
  238 +}
  239 +float64 floatx80_to_float64( floatx80 a STATUS_PARAM)
  240 +{
  241 + return a;
  242 +}
  243 +
  244 +/*----------------------------------------------------------------------------
  245 +| Software IEC/IEEE extended double-precision operations.
  246 +*----------------------------------------------------------------------------*/
  247 +floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM)
  248 +{
  249 + return rintl(a);
  250 +}
  251 +floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM)
  252 +{
  253 + return sqrtl(a);
  254 +}
  255 +char floatx80_is_signaling_nan( floatx80 a1)
  256 +{
  257 + floatx80u u;
  258 + u.f = a1;
  259 + return ( ( u.i.high & 0x7FFF ) == 0x7FFF ) && (bits64) ( u.i.low<<1 );
  260 +}
  261 +
  262 +#endif
... ...
fpu/softfloat-native.h 0 → 100644
  1 +/* Native implementation of soft float functions */
  2 +#include <math.h>
  3 +#if defined(_BSD) && !defined(__APPLE__)
  4 +#include <ieeefp.h>
  5 +#else
  6 +#include <fenv.h>
  7 +#endif
  8 +
  9 +typedef float float32;
  10 +typedef double float64;
  11 +#ifdef FLOATX80
  12 +typedef long double floatx80;
  13 +#endif
  14 +
  15 +typedef union {
  16 + float32 f;
  17 + uint32_t i;
  18 +} float32u;
  19 +typedef union {
  20 + float64 f;
  21 + uint64_t i;
  22 +} float64u;
  23 +#ifdef FLOATX80
  24 +typedef union {
  25 + floatx80 f;
  26 + struct {
  27 + uint64_t low;
  28 + uint16_t high;
  29 + } i;
  30 +} floatx80u;
  31 +#endif
  32 +
  33 +/*----------------------------------------------------------------------------
  34 +| Software IEC/IEEE floating-point rounding mode.
  35 +*----------------------------------------------------------------------------*/
  36 +#if defined(_BSD) && !defined(__APPLE__)
  37 +enum {
  38 + float_round_nearest_even = FP_RN,
  39 + float_round_down = FE_RM,
  40 + float_round_up = FE_RP,
  41 + float_round_to_zero = FE_RZ
  42 +};
  43 +#elif defined(__arm__)
  44 +enum {
  45 + float_round_nearest_even = 0,
  46 + float_round_down = 1,
  47 + float_round_up = 2,
  48 + float_round_to_zero = 3
  49 +};
  50 +#else
  51 +enum {
  52 + float_round_nearest_even = FE_TONEAREST,
  53 + float_round_down = FE_DOWNWARD,
  54 + float_round_up = FE_UPWARD,
  55 + float_round_to_zero = FE_TOWARDZERO
  56 +};
  57 +#endif
  58 +
  59 +typedef struct float_status {
  60 + signed char float_rounding_mode;
  61 +#ifdef FLOATX80
  62 + signed char floatx80_rounding_precision;
  63 +#endif
  64 +} float_status;
  65 +
  66 +void set_float_rounding_mode(int val STATUS_PARAM);
  67 +#ifdef FLOATX80
  68 +void set_floatx80_rounding_precision(int val STATUS_PARAM);
  69 +#endif
  70 +
  71 +/*----------------------------------------------------------------------------
  72 +| Software IEC/IEEE integer-to-floating-point conversion routines.
  73 +*----------------------------------------------------------------------------*/
  74 +float32 int32_to_float32( int STATUS_PARAM);
  75 +float64 int32_to_float64( int STATUS_PARAM);
  76 +#ifdef FLOATX80
  77 +floatx80 int32_to_floatx80( int STATUS_PARAM);
  78 +#endif
  79 +#ifdef FLOAT128
  80 +float128 int32_to_float128( int STATUS_PARAM);
  81 +#endif
  82 +float32 int64_to_float32( int64_t STATUS_PARAM);
  83 +float64 int64_to_float64( int64_t STATUS_PARAM);
  84 +#ifdef FLOATX80
  85 +floatx80 int64_to_floatx80( int64_t STATUS_PARAM);
  86 +#endif
  87 +#ifdef FLOAT128
  88 +float128 int64_to_float128( int64_t STATUS_PARAM);
  89 +#endif
  90 +
  91 +/*----------------------------------------------------------------------------
  92 +| Software IEC/IEEE single-precision conversion routines.
  93 +*----------------------------------------------------------------------------*/
  94 +int float32_to_int32( float32 STATUS_PARAM);
  95 +int float32_to_int32_round_to_zero( float32 STATUS_PARAM);
  96 +int64_t float32_to_int64( float32 STATUS_PARAM);
  97 +int64_t float32_to_int64_round_to_zero( float32 STATUS_PARAM);
  98 +float64 float32_to_float64( float32 STATUS_PARAM);
  99 +#ifdef FLOATX80
  100 +floatx80 float32_to_floatx80( float32 STATUS_PARAM);
  101 +#endif
  102 +#ifdef FLOAT128
  103 +float128 float32_to_float128( float32 STATUS_PARAM);
  104 +#endif
  105 +
  106 +/*----------------------------------------------------------------------------
  107 +| Software IEC/IEEE single-precision operations.
  108 +*----------------------------------------------------------------------------*/
  109 +float32 float32_round_to_int( float32 STATUS_PARAM);
  110 +INLINE float32 float32_add( float32 a, float32 b STATUS_PARAM)
  111 +{
  112 + return a + b;
  113 +}
  114 +INLINE float32 float32_sub( float32 a, float32 b STATUS_PARAM)
  115 +{
  116 + return a - b;
  117 +}
  118 +INLINE float32 float32_mul( float32 a, float32 b STATUS_PARAM)
  119 +{
  120 + return a * b;
  121 +}
  122 +INLINE float32 float32_div( float32 a, float32 b STATUS_PARAM)
  123 +{
  124 + return a / b;
  125 +}
  126 +float32 float32_rem( float32, float32 STATUS_PARAM);
  127 +float32 float32_sqrt( float32 STATUS_PARAM);
  128 +INLINE char float32_eq( float32 a, float32 b STATUS_PARAM)
  129 +{
  130 + /* XXX: incorrect because it can raise an exception */
  131 + return a == b;
  132 +}
  133 +INLINE char float32_le( float32 a, float32 b STATUS_PARAM)
  134 +{
  135 + return a <= b;
  136 +}
  137 +INLINE char float32_lt( float32 a, float32 b STATUS_PARAM)
  138 +{
  139 + return a < b;
  140 +}
  141 +INLINE char float32_eq_signaling( float32 a, float32 b STATUS_PARAM)
  142 +{
  143 + return a == b;
  144 +}
  145 +INLINE char float32_le_quiet( float32 a, float32 b STATUS_PARAM)
  146 +{
  147 + return islessequal(a, b);
  148 +}
  149 +INLINE char float32_lt_quiet( float32 a, float32 b STATUS_PARAM)
  150 +{
  151 + return isless(a, b);
  152 +}
  153 +char float32_is_signaling_nan( float32 );
  154 +
  155 +INLINE float32 float32_abs(float32 a)
  156 +{
  157 + return fabsf(a);
  158 +}
  159 +
  160 +INLINE float32 float32_chs(float32 a)
  161 +{
  162 + return -a;
  163 +}
  164 +
  165 +/*----------------------------------------------------------------------------
  166 +| Software IEC/IEEE double-precision conversion routines.
  167 +*----------------------------------------------------------------------------*/
  168 +int float64_to_int32( float64 STATUS_PARAM );
  169 +int float64_to_int32_round_to_zero( float64 STATUS_PARAM );
  170 +int64_t float64_to_int64( float64 STATUS_PARAM );
  171 +int64_t float64_to_int64_round_to_zero( float64 STATUS_PARAM );
  172 +float32 float64_to_float32( float64 STATUS_PARAM );
  173 +#ifdef FLOATX80
  174 +floatx80 float64_to_floatx80( float64 STATUS_PARAM );
  175 +#endif
  176 +#ifdef FLOAT128
  177 +float128 float64_to_float128( float64 STATUS_PARAM );
  178 +#endif
  179 +
  180 +/*----------------------------------------------------------------------------
  181 +| Software IEC/IEEE double-precision operations.
  182 +*----------------------------------------------------------------------------*/
  183 +float64 float64_round_to_int( float64 STATUS_PARAM );
  184 +INLINE float64 float64_add( float64 a, float64 b STATUS_PARAM)
  185 +{
  186 + return a + b;
  187 +}
  188 +INLINE float64 float64_sub( float64 a, float64 b STATUS_PARAM)
  189 +{
  190 + return a - b;
  191 +}
  192 +INLINE float64 float64_mul( float64 a, float64 b STATUS_PARAM)
  193 +{
  194 + return a * b;
  195 +}
  196 +INLINE float64 float64_div( float64 a, float64 b STATUS_PARAM)
  197 +{
  198 + return a / b;
  199 +}
  200 +float64 float64_rem( float64, float64 STATUS_PARAM );
  201 +float64 float64_sqrt( float64 STATUS_PARAM );
  202 +INLINE char float64_eq( float64 a, float64 b STATUS_PARAM)
  203 +{
  204 + return a == b;
  205 +}
  206 +INLINE char float64_le( float64 a, float64 b STATUS_PARAM)
  207 +{
  208 + return a <= b;
  209 +}
  210 +INLINE char float64_lt( float64 a, float64 b STATUS_PARAM)
  211 +{
  212 + return a < b;
  213 +}
  214 +INLINE char float64_eq_signaling( float64 a, float64 b STATUS_PARAM)
  215 +{
  216 + return a == b;
  217 +}
  218 +INLINE char float64_le_quiet( float64 a, float64 b STATUS_PARAM)
  219 +{
  220 + return islessequal(a, b);
  221 +}
  222 +INLINE char float64_lt_quiet( float64 a, float64 b STATUS_PARAM)
  223 +{
  224 + return isless(a, b);
  225 +
  226 +}
  227 +char float64_is_signaling_nan( float64 );
  228 +
  229 +INLINE float64 float64_abs(float64 a)
  230 +{
  231 + return fabs(a);
  232 +}
  233 +
  234 +INLINE float64 float64_chs(float64 a)
  235 +{
  236 + return -a;
  237 +}
  238 +
  239 +#ifdef FLOATX80
  240 +
  241 +/*----------------------------------------------------------------------------
  242 +| Software IEC/IEEE extended double-precision conversion routines.
  243 +*----------------------------------------------------------------------------*/
  244 +int floatx80_to_int32( floatx80 STATUS_PARAM );
  245 +int floatx80_to_int32_round_to_zero( floatx80 STATUS_PARAM );
  246 +int64_t floatx80_to_int64( floatx80 STATUS_PARAM);
  247 +int64_t floatx80_to_int64_round_to_zero( floatx80 STATUS_PARAM);
  248 +float32 floatx80_to_float32( floatx80 STATUS_PARAM );
  249 +float64 floatx80_to_float64( floatx80 STATUS_PARAM );
  250 +#ifdef FLOAT128
  251 +float128 floatx80_to_float128( floatx80 STATUS_PARAM );
  252 +#endif
  253 +
  254 +/*----------------------------------------------------------------------------
  255 +| Software IEC/IEEE extended double-precision operations.
  256 +*----------------------------------------------------------------------------*/
  257 +floatx80 floatx80_round_to_int( floatx80 STATUS_PARAM );
  258 +INLINE floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM)
  259 +{
  260 + return a + b;
  261 +}
  262 +INLINE floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM)
  263 +{
  264 + return a - b;
  265 +}
  266 +INLINE floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM)
  267 +{
  268 + return a * b;
  269 +}
  270 +INLINE floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM)
  271 +{
  272 + return a / b;
  273 +}
  274 +floatx80 floatx80_rem( floatx80, floatx80 STATUS_PARAM );
  275 +floatx80 floatx80_sqrt( floatx80 STATUS_PARAM );
  276 +INLINE char floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM)
  277 +{
  278 + return a == b;
  279 +}
  280 +INLINE char floatx80_le( floatx80 a, floatx80 b STATUS_PARAM)
  281 +{
  282 + return a <= b;
  283 +}
  284 +INLINE char floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM)
  285 +{
  286 + return a < b;
  287 +}
  288 +INLINE char floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM)
  289 +{
  290 + return a == b;
  291 +}
  292 +INLINE char floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM)
  293 +{
  294 + return islessequal(a, b);
  295 +}
  296 +INLINE char floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM)
  297 +{
  298 + return isless(a, b);
  299 +
  300 +}
  301 +char floatx80_is_signaling_nan( floatx80 );
  302 +
  303 +INLINE floatx80 floatx80_abs(floatx80 a)
  304 +{
  305 + return fabsl(a);
  306 +}
  307 +
  308 +INLINE floatx80 floatx80_chs(floatx80 a)
  309 +{
  310 + return -a;
  311 +}
  312 +#endif
... ...
fpu/softfloat-specialize.h 0 → 100644
  1 +
  2 +/*============================================================================
  3 +
  4 +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
  5 +Arithmetic Package, Release 2b.
  6 +
  7 +Written by John R. Hauser. This work was made possible in part by the
  8 +International Computer Science Institute, located at Suite 600, 1947 Center
  9 +Street, Berkeley, California 94704. Funding was partially provided by the
  10 +National Science Foundation under grant MIP-9311980. The original version
  11 +of this code was written as part of a project to build a fixed-point vector
  12 +processor in collaboration with the University of California at Berkeley,
  13 +overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  14 +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 +arithmetic/SoftFloat.html'.
  16 +
  17 +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  18 +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25 +
  26 +Derivative works are acceptable, even for commercial purposes, so long as
  27 +(1) the source code for the derivative work includes prominent notice that
  28 +the work is derivative, and (2) the source code includes prominent notice with
  29 +these four paragraphs for those parts of this code that are retained.
  30 +
  31 +=============================================================================*/
  32 +
  33 +/*----------------------------------------------------------------------------
  34 +| Underflow tininess-detection mode, statically initialized to default value.
  35 +| (The declaration in `softfloat.h' must match the `int8' type here.)
  36 +*----------------------------------------------------------------------------*/
  37 +int8 float_detect_tininess = float_tininess_after_rounding;
  38 +
  39 +/*----------------------------------------------------------------------------
  40 +| Raises the exceptions specified by `flags'. Floating-point traps can be
  41 +| defined here if desired. It is currently not possible for such a trap
  42 +| to substitute a result value. If traps are not implemented, this routine
  43 +| should be simply `float_exception_flags |= flags;'.
  44 +*----------------------------------------------------------------------------*/
  45 +
  46 +void float_raise( int8 flags STATUS_PARAM )
  47 +{
  48 +
  49 + STATUS(float_exception_flags) |= flags;
  50 +
  51 +}
  52 +
  53 +/*----------------------------------------------------------------------------
  54 +| Internal canonical NaN format.
  55 +*----------------------------------------------------------------------------*/
  56 +typedef struct {
  57 + flag sign;
  58 + bits64 high, low;
  59 +} commonNaNT;
  60 +
  61 +/*----------------------------------------------------------------------------
  62 +| The pattern for a default generated single-precision NaN.
  63 +*----------------------------------------------------------------------------*/
  64 +#define float32_default_nan 0xFFC00000
  65 +
  66 +/*----------------------------------------------------------------------------
  67 +| Returns 1 if the single-precision floating-point value `a' is a NaN;
  68 +| otherwise returns 0.
  69 +*----------------------------------------------------------------------------*/
  70 +
  71 +flag float32_is_nan( float32 a )
  72 +{
  73 +
  74 + return ( 0xFF000000 < (bits32) ( a<<1 ) );
  75 +
  76 +}
  77 +
  78 +/*----------------------------------------------------------------------------
  79 +| Returns 1 if the single-precision floating-point value `a' is a signaling
  80 +| NaN; otherwise returns 0.
  81 +*----------------------------------------------------------------------------*/
  82 +
  83 +flag float32_is_signaling_nan( float32 a )
  84 +{
  85 +
  86 + return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF );
  87 +
  88 +}
  89 +
  90 +/*----------------------------------------------------------------------------
  91 +| Returns the result of converting the single-precision floating-point NaN
  92 +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  93 +| exception is raised.
  94 +*----------------------------------------------------------------------------*/
  95 +
  96 +static commonNaNT float32ToCommonNaN( float32 a STATUS_PARAM )
  97 +{
  98 + commonNaNT z;
  99 +
  100 + if ( float32_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR );
  101 + z.sign = a>>31;
  102 + z.low = 0;
  103 + z.high = ( (bits64) a )<<41;
  104 + return z;
  105 +
  106 +}
  107 +
  108 +/*----------------------------------------------------------------------------
  109 +| Returns the result of converting the canonical NaN `a' to the single-
  110 +| precision floating-point format.
  111 +*----------------------------------------------------------------------------*/
  112 +
  113 +static float32 commonNaNToFloat32( commonNaNT a )
  114 +{
  115 +
  116 + return ( ( (bits32) a.sign )<<31 ) | 0x7FC00000 | ( a.high>>41 );
  117 +
  118 +}
  119 +
  120 +/*----------------------------------------------------------------------------
  121 +| Takes two single-precision floating-point values `a' and `b', one of which
  122 +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  123 +| signaling NaN, the invalid exception is raised.
  124 +*----------------------------------------------------------------------------*/
  125 +
  126 +static float32 propagateFloat32NaN( float32 a, float32 b STATUS_PARAM)
  127 +{
  128 + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
  129 +
  130 + aIsNaN = float32_is_nan( a );
  131 + aIsSignalingNaN = float32_is_signaling_nan( a );
  132 + bIsNaN = float32_is_nan( b );
  133 + bIsSignalingNaN = float32_is_signaling_nan( b );
  134 + a |= 0x00400000;
  135 + b |= 0x00400000;
  136 + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR);
  137 + if ( aIsSignalingNaN ) {
  138 + if ( bIsSignalingNaN ) goto returnLargerSignificand;
  139 + return bIsNaN ? b : a;
  140 + }
  141 + else if ( aIsNaN ) {
  142 + if ( bIsSignalingNaN | ! bIsNaN ) return a;
  143 + returnLargerSignificand:
  144 + if ( (bits32) ( a<<1 ) < (bits32) ( b<<1 ) ) return b;
  145 + if ( (bits32) ( b<<1 ) < (bits32) ( a<<1 ) ) return a;
  146 + return ( a < b ) ? a : b;
  147 + }
  148 + else {
  149 + return b;
  150 + }
  151 +
  152 +}
  153 +
  154 +/*----------------------------------------------------------------------------
  155 +| The pattern for a default generated double-precision NaN.
  156 +*----------------------------------------------------------------------------*/
  157 +#define float64_default_nan LIT64( 0xFFF8000000000000 )
  158 +
  159 +/*----------------------------------------------------------------------------
  160 +| Returns 1 if the double-precision floating-point value `a' is a NaN;
  161 +| otherwise returns 0.
  162 +*----------------------------------------------------------------------------*/
  163 +
  164 +flag float64_is_nan( float64 a )
  165 +{
  166 +
  167 + return ( LIT64( 0xFFE0000000000000 ) < (bits64) ( a<<1 ) );
  168 +
  169 +}
  170 +
  171 +/*----------------------------------------------------------------------------
  172 +| Returns 1 if the double-precision floating-point value `a' is a signaling
  173 +| NaN; otherwise returns 0.
  174 +*----------------------------------------------------------------------------*/
  175 +
  176 +flag float64_is_signaling_nan( float64 a )
  177 +{
  178 +
  179 + return
  180 + ( ( ( a>>51 ) & 0xFFF ) == 0xFFE )
  181 + && ( a & LIT64( 0x0007FFFFFFFFFFFF ) );
  182 +
  183 +}
  184 +
  185 +/*----------------------------------------------------------------------------
  186 +| Returns the result of converting the double-precision floating-point NaN
  187 +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  188 +| exception is raised.
  189 +*----------------------------------------------------------------------------*/
  190 +
  191 +static commonNaNT float64ToCommonNaN( float64 a STATUS_PARAM)
  192 +{
  193 + commonNaNT z;
  194 +
  195 + if ( float64_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR);
  196 + z.sign = a>>63;
  197 + z.low = 0;
  198 + z.high = a<<12;
  199 + return z;
  200 +
  201 +}
  202 +
  203 +/*----------------------------------------------------------------------------
  204 +| Returns the result of converting the canonical NaN `a' to the double-
  205 +| precision floating-point format.
  206 +*----------------------------------------------------------------------------*/
  207 +
  208 +static float64 commonNaNToFloat64( commonNaNT a )
  209 +{
  210 +
  211 + return
  212 + ( ( (bits64) a.sign )<<63 )
  213 + | LIT64( 0x7FF8000000000000 )
  214 + | ( a.high>>12 );
  215 +
  216 +}
  217 +
  218 +/*----------------------------------------------------------------------------
  219 +| Takes two double-precision floating-point values `a' and `b', one of which
  220 +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
  221 +| signaling NaN, the invalid exception is raised.
  222 +*----------------------------------------------------------------------------*/
  223 +
  224 +static float64 propagateFloat64NaN( float64 a, float64 b STATUS_PARAM)
  225 +{
  226 + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
  227 +
  228 + aIsNaN = float64_is_nan( a );
  229 + aIsSignalingNaN = float64_is_signaling_nan( a );
  230 + bIsNaN = float64_is_nan( b );
  231 + bIsSignalingNaN = float64_is_signaling_nan( b );
  232 + a |= LIT64( 0x0008000000000000 );
  233 + b |= LIT64( 0x0008000000000000 );
  234 + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR);
  235 + if ( aIsSignalingNaN ) {
  236 + if ( bIsSignalingNaN ) goto returnLargerSignificand;
  237 + return bIsNaN ? b : a;
  238 + }
  239 + else if ( aIsNaN ) {
  240 + if ( bIsSignalingNaN | ! bIsNaN ) return a;
  241 + returnLargerSignificand:
  242 + if ( (bits64) ( a<<1 ) < (bits64) ( b<<1 ) ) return b;
  243 + if ( (bits64) ( b<<1 ) < (bits64) ( a<<1 ) ) return a;
  244 + return ( a < b ) ? a : b;
  245 + }
  246 + else {
  247 + return b;
  248 + }
  249 +
  250 +}
  251 +
  252 +#ifdef FLOATX80
  253 +
  254 +/*----------------------------------------------------------------------------
  255 +| The pattern for a default generated extended double-precision NaN. The
  256 +| `high' and `low' values hold the most- and least-significant bits,
  257 +| respectively.
  258 +*----------------------------------------------------------------------------*/
  259 +#define floatx80_default_nan_high 0xFFFF
  260 +#define floatx80_default_nan_low LIT64( 0xC000000000000000 )
  261 +
  262 +/*----------------------------------------------------------------------------
  263 +| Returns 1 if the extended double-precision floating-point value `a' is a
  264 +| NaN; otherwise returns 0.
  265 +*----------------------------------------------------------------------------*/
  266 +
  267 +flag floatx80_is_nan( floatx80 a )
  268 +{
  269 +
  270 + return ( ( a.high & 0x7FFF ) == 0x7FFF ) && (bits64) ( a.low<<1 );
  271 +
  272 +}
  273 +
  274 +/*----------------------------------------------------------------------------
  275 +| Returns 1 if the extended double-precision floating-point value `a' is a
  276 +| signaling NaN; otherwise returns 0.
  277 +*----------------------------------------------------------------------------*/
  278 +
  279 +flag floatx80_is_signaling_nan( floatx80 a )
  280 +{
  281 + bits64 aLow;
  282 +
  283 + aLow = a.low & ~ LIT64( 0x4000000000000000 );
  284 + return
  285 + ( ( a.high & 0x7FFF ) == 0x7FFF )
  286 + && (bits64) ( aLow<<1 )
  287 + && ( a.low == aLow );
  288 +
  289 +}
  290 +
  291 +/*----------------------------------------------------------------------------
  292 +| Returns the result of converting the extended double-precision floating-
  293 +| point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
  294 +| invalid exception is raised.
  295 +*----------------------------------------------------------------------------*/
  296 +
  297 +static commonNaNT floatx80ToCommonNaN( floatx80 a STATUS_PARAM)
  298 +{
  299 + commonNaNT z;
  300 +
  301 + if ( floatx80_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR);
  302 + z.sign = a.high>>15;
  303 + z.low = 0;
  304 + z.high = a.low<<1;
  305 + return z;
  306 +
  307 +}
  308 +
  309 +/*----------------------------------------------------------------------------
  310 +| Returns the result of converting the canonical NaN `a' to the extended
  311 +| double-precision floating-point format.
  312 +*----------------------------------------------------------------------------*/
  313 +
  314 +static floatx80 commonNaNToFloatx80( commonNaNT a )
  315 +{
  316 + floatx80 z;
  317 +
  318 + z.low = LIT64( 0xC000000000000000 ) | ( a.high>>1 );
  319 + z.high = ( ( (bits16) a.sign )<<15 ) | 0x7FFF;
  320 + return z;
  321 +
  322 +}
  323 +
  324 +/*----------------------------------------------------------------------------
  325 +| Takes two extended double-precision floating-point values `a' and `b', one
  326 +| of which is a NaN, and returns the appropriate NaN result. If either `a' or
  327 +| `b' is a signaling NaN, the invalid exception is raised.
  328 +*----------------------------------------------------------------------------*/
  329 +
  330 +static floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b STATUS_PARAM)
  331 +{
  332 + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
  333 +
  334 + aIsNaN = floatx80_is_nan( a );
  335 + aIsSignalingNaN = floatx80_is_signaling_nan( a );
  336 + bIsNaN = floatx80_is_nan( b );
  337 + bIsSignalingNaN = floatx80_is_signaling_nan( b );
  338 + a.low |= LIT64( 0xC000000000000000 );
  339 + b.low |= LIT64( 0xC000000000000000 );
  340 + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR);
  341 + if ( aIsSignalingNaN ) {
  342 + if ( bIsSignalingNaN ) goto returnLargerSignificand;
  343 + return bIsNaN ? b : a;
  344 + }
  345 + else if ( aIsNaN ) {
  346 + if ( bIsSignalingNaN | ! bIsNaN ) return a;
  347 + returnLargerSignificand:
  348 + if ( a.low < b.low ) return b;
  349 + if ( b.low < a.low ) return a;
  350 + return ( a.high < b.high ) ? a : b;
  351 + }
  352 + else {
  353 + return b;
  354 + }
  355 +
  356 +}
  357 +
  358 +#endif
  359 +
  360 +#ifdef FLOAT128
  361 +
  362 +/*----------------------------------------------------------------------------
  363 +| The pattern for a default generated quadruple-precision NaN. The `high' and
  364 +| `low' values hold the most- and least-significant bits, respectively.
  365 +*----------------------------------------------------------------------------*/
  366 +#define float128_default_nan_high LIT64( 0xFFFF800000000000 )
  367 +#define float128_default_nan_low LIT64( 0x0000000000000000 )
  368 +
  369 +/*----------------------------------------------------------------------------
  370 +| Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
  371 +| otherwise returns 0.
  372 +*----------------------------------------------------------------------------*/
  373 +
  374 +flag float128_is_nan( float128 a )
  375 +{
  376 +
  377 + return
  378 + ( LIT64( 0xFFFE000000000000 ) <= (bits64) ( a.high<<1 ) )
  379 + && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) );
  380 +
  381 +}
  382 +
  383 +/*----------------------------------------------------------------------------
  384 +| Returns 1 if the quadruple-precision floating-point value `a' is a
  385 +| signaling NaN; otherwise returns 0.
  386 +*----------------------------------------------------------------------------*/
  387 +
  388 +flag float128_is_signaling_nan( float128 a )
  389 +{
  390 +
  391 + return
  392 + ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE )
  393 + && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) );
  394 +
  395 +}
  396 +
  397 +/*----------------------------------------------------------------------------
  398 +| Returns the result of converting the quadruple-precision floating-point NaN
  399 +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
  400 +| exception is raised.
  401 +*----------------------------------------------------------------------------*/
  402 +
  403 +static commonNaNT float128ToCommonNaN( float128 a STATUS_PARAM)
  404 +{
  405 + commonNaNT z;
  406 +
  407 + if ( float128_is_signaling_nan( a ) ) float_raise( float_flag_invalid STATUS_VAR);
  408 + z.sign = a.high>>63;
  409 + shortShift128Left( a.high, a.low, 16, &z.high, &z.low );
  410 + return z;
  411 +
  412 +}
  413 +
  414 +/*----------------------------------------------------------------------------
  415 +| Returns the result of converting the canonical NaN `a' to the quadruple-
  416 +| precision floating-point format.
  417 +*----------------------------------------------------------------------------*/
  418 +
  419 +static float128 commonNaNToFloat128( commonNaNT a )
  420 +{
  421 + float128 z;
  422 +
  423 + shift128Right( a.high, a.low, 16, &z.high, &z.low );
  424 + z.high |= ( ( (bits64) a.sign )<<63 ) | LIT64( 0x7FFF800000000000 );
  425 + return z;
  426 +
  427 +}
  428 +
  429 +/*----------------------------------------------------------------------------
  430 +| Takes two quadruple-precision floating-point values `a' and `b', one of
  431 +| which is a NaN, and returns the appropriate NaN result. If either `a' or
  432 +| `b' is a signaling NaN, the invalid exception is raised.
  433 +*----------------------------------------------------------------------------*/
  434 +
  435 +static float128 propagateFloat128NaN( float128 a, float128 b STATUS_PARAM)
  436 +{
  437 + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
  438 +
  439 + aIsNaN = float128_is_nan( a );
  440 + aIsSignalingNaN = float128_is_signaling_nan( a );
  441 + bIsNaN = float128_is_nan( b );
  442 + bIsSignalingNaN = float128_is_signaling_nan( b );
  443 + a.high |= LIT64( 0x0000800000000000 );
  444 + b.high |= LIT64( 0x0000800000000000 );
  445 + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid STATUS_VAR);
  446 + if ( aIsSignalingNaN ) {
  447 + if ( bIsSignalingNaN ) goto returnLargerSignificand;
  448 + return bIsNaN ? b : a;
  449 + }
  450 + else if ( aIsNaN ) {
  451 + if ( bIsSignalingNaN | ! bIsNaN ) return a;
  452 + returnLargerSignificand:
  453 + if ( lt128( a.high<<1, a.low, b.high<<1, b.low ) ) return b;
  454 + if ( lt128( b.high<<1, b.low, a.high<<1, a.low ) ) return a;
  455 + return ( a.high < b.high ) ? a : b;
  456 + }
  457 + else {
  458 + return b;
  459 + }
  460 +
  461 +}
  462 +
  463 +#endif
  464 +
... ...
fpu/softfloat.c 0 → 100644
  1 +
  2 +/*============================================================================
  3 +
  4 +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  5 +Package, Release 2b.
  6 +
  7 +Written by John R. Hauser. This work was made possible in part by the
  8 +International Computer Science Institute, located at Suite 600, 1947 Center
  9 +Street, Berkeley, California 94704. Funding was partially provided by the
  10 +National Science Foundation under grant MIP-9311980. The original version
  11 +of this code was written as part of a project to build a fixed-point vector
  12 +processor in collaboration with the University of California at Berkeley,
  13 +overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  14 +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 +arithmetic/SoftFloat.html'.
  16 +
  17 +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  18 +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25 +
  26 +Derivative works are acceptable, even for commercial purposes, so long as
  27 +(1) the source code for the derivative work includes prominent notice that
  28 +the work is derivative, and (2) the source code includes prominent notice with
  29 +these four paragraphs for those parts of this code that are retained.
  30 +
  31 +=============================================================================*/
  32 +
  33 +#include "softfloat.h"
  34 +
  35 +/*----------------------------------------------------------------------------
  36 +| Primitive arithmetic functions, including multi-word arithmetic, and
  37 +| division and square root approximations. (Can be specialized to target if
  38 +| desired.)
  39 +*----------------------------------------------------------------------------*/
  40 +#include "softfloat-macros.h"
  41 +
  42 +/*----------------------------------------------------------------------------
  43 +| Functions and definitions to determine: (1) whether tininess for underflow
  44 +| is detected before or after rounding by default, (2) what (if anything)
  45 +| happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46 +| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47 +| are propagated from function inputs to output. These details are target-
  48 +| specific.
  49 +*----------------------------------------------------------------------------*/
  50 +#include "softfloat-specialize.h"
  51 +
  52 +void set_float_rounding_mode(int val STATUS_PARAM)
  53 +{
  54 + STATUS(float_rounding_mode) = val;
  55 +}
  56 +
  57 +#ifdef FLOATX80
  58 +void set_floatx80_rounding_precision(int val STATUS_PARAM)
  59 +{
  60 + STATUS(floatx80_rounding_precision) = val;
  61 +}
  62 +#endif
  63 +
  64 +/*----------------------------------------------------------------------------
  65 +| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  66 +| and 7, and returns the properly rounded 32-bit integer corresponding to the
  67 +| input. If `zSign' is 1, the input is negated before being converted to an
  68 +| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  69 +| is simply rounded to an integer, with the inexact exception raised if the
  70 +| input cannot be represented exactly as an integer. However, if the fixed-
  71 +| point input is too large, the invalid exception is raised and the largest
  72 +| positive or negative integer is returned.
  73 +*----------------------------------------------------------------------------*/
  74 +
  75 +static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  76 +{
  77 + int8 roundingMode;
  78 + flag roundNearestEven;
  79 + int8 roundIncrement, roundBits;
  80 + int32 z;
  81 +
  82 + roundingMode = STATUS(float_rounding_mode);
  83 + roundNearestEven = ( roundingMode == float_round_nearest_even );
  84 + roundIncrement = 0x40;
  85 + if ( ! roundNearestEven ) {
  86 + if ( roundingMode == float_round_to_zero ) {
  87 + roundIncrement = 0;
  88 + }
  89 + else {
  90 + roundIncrement = 0x7F;
  91 + if ( zSign ) {
  92 + if ( roundingMode == float_round_up ) roundIncrement = 0;
  93 + }
  94 + else {
  95 + if ( roundingMode == float_round_down ) roundIncrement = 0;
  96 + }
  97 + }
  98 + }
  99 + roundBits = absZ & 0x7F;
  100 + absZ = ( absZ + roundIncrement )>>7;
  101 + absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  102 + z = absZ;
  103 + if ( zSign ) z = - z;
  104 + if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
  105 + float_raise( float_flag_invalid STATUS_VAR);
  106 + return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
  107 + }
  108 + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
  109 + return z;
  110 +
  111 +}
  112 +
  113 +/*----------------------------------------------------------------------------
  114 +| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  115 +| `absZ1', with binary point between bits 63 and 64 (between the input words),
  116 +| and returns the properly rounded 64-bit integer corresponding to the input.
  117 +| If `zSign' is 1, the input is negated before being converted to an integer.
  118 +| Ordinarily, the fixed-point input is simply rounded to an integer, with
  119 +| the inexact exception raised if the input cannot be represented exactly as
  120 +| an integer. However, if the fixed-point input is too large, the invalid
  121 +| exception is raised and the largest positive or negative integer is
  122 +| returned.
  123 +*----------------------------------------------------------------------------*/
  124 +
  125 +static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
  126 +{
  127 + int8 roundingMode;
  128 + flag roundNearestEven, increment;
  129 + int64 z;
  130 +
  131 + roundingMode = STATUS(float_rounding_mode);
  132 + roundNearestEven = ( roundingMode == float_round_nearest_even );
  133 + increment = ( (sbits64) absZ1 < 0 );
  134 + if ( ! roundNearestEven ) {
  135 + if ( roundingMode == float_round_to_zero ) {
  136 + increment = 0;
  137 + }
  138 + else {
  139 + if ( zSign ) {
  140 + increment = ( roundingMode == float_round_down ) && absZ1;
  141 + }
  142 + else {
  143 + increment = ( roundingMode == float_round_up ) && absZ1;
  144 + }
  145 + }
  146 + }
  147 + if ( increment ) {
  148 + ++absZ0;
  149 + if ( absZ0 == 0 ) goto overflow;
  150 + absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
  151 + }
  152 + z = absZ0;
  153 + if ( zSign ) z = - z;
  154 + if ( z && ( ( z < 0 ) ^ zSign ) ) {
  155 + overflow:
  156 + float_raise( float_flag_invalid STATUS_VAR);
  157 + return
  158 + zSign ? (sbits64) LIT64( 0x8000000000000000 )
  159 + : LIT64( 0x7FFFFFFFFFFFFFFF );
  160 + }
  161 + if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
  162 + return z;
  163 +
  164 +}
  165 +
  166 +/*----------------------------------------------------------------------------
  167 +| Returns the fraction bits of the single-precision floating-point value `a'.
  168 +*----------------------------------------------------------------------------*/
  169 +
  170 +INLINE bits32 extractFloat32Frac( float32 a )
  171 +{
  172 +
  173 + return a & 0x007FFFFF;
  174 +
  175 +}
  176 +
  177 +/*----------------------------------------------------------------------------
  178 +| Returns the exponent bits of the single-precision floating-point value `a'.
  179 +*----------------------------------------------------------------------------*/
  180 +
  181 +INLINE int16 extractFloat32Exp( float32 a )
  182 +{
  183 +
  184 + return ( a>>23 ) & 0xFF;
  185 +
  186 +}
  187 +
  188 +/*----------------------------------------------------------------------------
  189 +| Returns the sign bit of the single-precision floating-point value `a'.
  190 +*----------------------------------------------------------------------------*/
  191 +
  192 +INLINE flag extractFloat32Sign( float32 a )
  193 +{
  194 +
  195 + return a>>31;
  196 +
  197 +}
  198 +
  199 +/*----------------------------------------------------------------------------
  200 +| Normalizes the subnormal single-precision floating-point value represented
  201 +| by the denormalized significand `aSig'. The normalized exponent and
  202 +| significand are stored at the locations pointed to by `zExpPtr' and
  203 +| `zSigPtr', respectively.
  204 +*----------------------------------------------------------------------------*/
  205 +
  206 +static void
  207 + normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
  208 +{
  209 + int8 shiftCount;
  210 +
  211 + shiftCount = countLeadingZeros32( aSig ) - 8;
  212 + *zSigPtr = aSig<<shiftCount;
  213 + *zExpPtr = 1 - shiftCount;
  214 +
  215 +}
  216 +
  217 +/*----------------------------------------------------------------------------
  218 +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  219 +| single-precision floating-point value, returning the result. After being
  220 +| shifted into the proper positions, the three fields are simply added
  221 +| together to form the result. This means that any integer portion of `zSig'
  222 +| will be added into the exponent. Since a properly normalized significand
  223 +| will have an integer portion equal to 1, the `zExp' input should be 1 less
  224 +| than the desired result exponent whenever `zSig' is a complete, normalized
  225 +| significand.
  226 +*----------------------------------------------------------------------------*/
  227 +
  228 +INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
  229 +{
  230 +
  231 + return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
  232 +
  233 +}
  234 +
  235 +/*----------------------------------------------------------------------------
  236 +| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  237 +| and significand `zSig', and returns the proper single-precision floating-
  238 +| point value corresponding to the abstract input. Ordinarily, the abstract
  239 +| value is simply rounded and packed into the single-precision format, with
  240 +| the inexact exception raised if the abstract input cannot be represented
  241 +| exactly. However, if the abstract value is too large, the overflow and
  242 +| inexact exceptions are raised and an infinity or maximal finite value is
  243 +| returned. If the abstract value is too small, the input value is rounded to
  244 +| a subnormal number, and the underflow and inexact exceptions are raised if
  245 +| the abstract input cannot be represented exactly as a subnormal single-
  246 +| precision floating-point number.
  247 +| The input significand `zSig' has its binary point between bits 30
  248 +| and 29, which is 7 bits to the left of the usual location. This shifted
  249 +| significand must be normalized or smaller. If `zSig' is not normalized,
  250 +| `zExp' must be 0; in that case, the result returned is a subnormal number,
  251 +| and it must not require rounding. In the usual case that `zSig' is
  252 +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  253 +| The handling of underflow and overflow follows the IEC/IEEE Standard for
  254 +| Binary Floating-Point Arithmetic.
  255 +*----------------------------------------------------------------------------*/
  256 +
  257 +static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
  258 +{
  259 + int8 roundingMode;
  260 + flag roundNearestEven;
  261 + int8 roundIncrement, roundBits;
  262 + flag isTiny;
  263 +
  264 + roundingMode = STATUS(float_rounding_mode);
  265 + roundNearestEven = ( roundingMode == float_round_nearest_even );
  266 + roundIncrement = 0x40;
  267 + if ( ! roundNearestEven ) {
  268 + if ( roundingMode == float_round_to_zero ) {
  269 + roundIncrement = 0;
  270 + }
  271 + else {
  272 + roundIncrement = 0x7F;
  273 + if ( zSign ) {
  274 + if ( roundingMode == float_round_up ) roundIncrement = 0;
  275 + }
  276 + else {
  277 + if ( roundingMode == float_round_down ) roundIncrement = 0;
  278 + }
  279 + }
  280 + }
  281 + roundBits = zSig & 0x7F;
  282 + if ( 0xFD <= (bits16) zExp ) {
  283 + if ( ( 0xFD < zExp )
  284 + || ( ( zExp == 0xFD )
  285 + && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
  286 + ) {
  287 + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
  288 + return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
  289 + }
  290 + if ( zExp < 0 ) {
  291 + isTiny =
  292 + ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
  293 + || ( zExp < -1 )
  294 + || ( zSig + roundIncrement < 0x80000000 );
  295 + shift32RightJamming( zSig, - zExp, &zSig );
  296 + zExp = 0;
  297 + roundBits = zSig & 0x7F;
  298 + if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
  299 + }
  300 + }
  301 + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
  302 + zSig = ( zSig + roundIncrement )>>7;
  303 + zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  304 + if ( zSig == 0 ) zExp = 0;
  305 + return packFloat32( zSign, zExp, zSig );
  306 +
  307 +}
  308 +
  309 +/*----------------------------------------------------------------------------
  310 +| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  311 +| and significand `zSig', and returns the proper single-precision floating-
  312 +| point value corresponding to the abstract input. This routine is just like
  313 +| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  314 +| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  315 +| floating-point exponent.
  316 +*----------------------------------------------------------------------------*/
  317 +
  318 +static float32
  319 + normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
  320 +{
  321 + int8 shiftCount;
  322 +
  323 + shiftCount = countLeadingZeros32( zSig ) - 1;
  324 + return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
  325 +
  326 +}
  327 +
  328 +/*----------------------------------------------------------------------------
  329 +| Returns the fraction bits of the double-precision floating-point value `a'.
  330 +*----------------------------------------------------------------------------*/
  331 +
  332 +INLINE bits64 extractFloat64Frac( float64 a )
  333 +{
  334 +
  335 + return a & LIT64( 0x000FFFFFFFFFFFFF );
  336 +
  337 +}
  338 +
  339 +/*----------------------------------------------------------------------------
  340 +| Returns the exponent bits of the double-precision floating-point value `a'.
  341 +*----------------------------------------------------------------------------*/
  342 +
  343 +INLINE int16 extractFloat64Exp( float64 a )
  344 +{
  345 +
  346 + return ( a>>52 ) & 0x7FF;
  347 +
  348 +}
  349 +
  350 +/*----------------------------------------------------------------------------
  351 +| Returns the sign bit of the double-precision floating-point value `a'.
  352 +*----------------------------------------------------------------------------*/
  353 +
  354 +INLINE flag extractFloat64Sign( float64 a )
  355 +{
  356 +
  357 + return a>>63;
  358 +
  359 +}
  360 +
  361 +/*----------------------------------------------------------------------------
  362 +| Normalizes the subnormal double-precision floating-point value represented
  363 +| by the denormalized significand `aSig'. The normalized exponent and
  364 +| significand are stored at the locations pointed to by `zExpPtr' and
  365 +| `zSigPtr', respectively.
  366 +*----------------------------------------------------------------------------*/
  367 +
  368 +static void
  369 + normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
  370 +{
  371 + int8 shiftCount;
  372 +
  373 + shiftCount = countLeadingZeros64( aSig ) - 11;
  374 + *zSigPtr = aSig<<shiftCount;
  375 + *zExpPtr = 1 - shiftCount;
  376 +
  377 +}
  378 +
  379 +/*----------------------------------------------------------------------------
  380 +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  381 +| double-precision floating-point value, returning the result. After being
  382 +| shifted into the proper positions, the three fields are simply added
  383 +| together to form the result. This means that any integer portion of `zSig'
  384 +| will be added into the exponent. Since a properly normalized significand
  385 +| will have an integer portion equal to 1, the `zExp' input should be 1 less
  386 +| than the desired result exponent whenever `zSig' is a complete, normalized
  387 +| significand.
  388 +*----------------------------------------------------------------------------*/
  389 +
  390 +INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
  391 +{
  392 +
  393 + return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
  394 +
  395 +}
  396 +
  397 +/*----------------------------------------------------------------------------
  398 +| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  399 +| and significand `zSig', and returns the proper double-precision floating-
  400 +| point value corresponding to the abstract input. Ordinarily, the abstract
  401 +| value is simply rounded and packed into the double-precision format, with
  402 +| the inexact exception raised if the abstract input cannot be represented
  403 +| exactly. However, if the abstract value is too large, the overflow and
  404 +| inexact exceptions are raised and an infinity or maximal finite value is
  405 +| returned. If the abstract value is too small, the input value is rounded
  406 +| to a subnormal number, and the underflow and inexact exceptions are raised
  407 +| if the abstract input cannot be represented exactly as a subnormal double-
  408 +| precision floating-point number.
  409 +| The input significand `zSig' has its binary point between bits 62
  410 +| and 61, which is 10 bits to the left of the usual location. This shifted
  411 +| significand must be normalized or smaller. If `zSig' is not normalized,
  412 +| `zExp' must be 0; in that case, the result returned is a subnormal number,
  413 +| and it must not require rounding. In the usual case that `zSig' is
  414 +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  415 +| The handling of underflow and overflow follows the IEC/IEEE Standard for
  416 +| Binary Floating-Point Arithmetic.
  417 +*----------------------------------------------------------------------------*/
  418 +
  419 +static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
  420 +{
  421 + int8 roundingMode;
  422 + flag roundNearestEven;
  423 + int16 roundIncrement, roundBits;
  424 + flag isTiny;
  425 +
  426 + roundingMode = STATUS(float_rounding_mode);
  427 + roundNearestEven = ( roundingMode == float_round_nearest_even );
  428 + roundIncrement = 0x200;
  429 + if ( ! roundNearestEven ) {
  430 + if ( roundingMode == float_round_to_zero ) {
  431 + roundIncrement = 0;
  432 + }
  433 + else {
  434 + roundIncrement = 0x3FF;
  435 + if ( zSign ) {
  436 + if ( roundingMode == float_round_up ) roundIncrement = 0;
  437 + }
  438 + else {
  439 + if ( roundingMode == float_round_down ) roundIncrement = 0;
  440 + }
  441 + }
  442 + }
  443 + roundBits = zSig & 0x3FF;
  444 + if ( 0x7FD <= (bits16) zExp ) {
  445 + if ( ( 0x7FD < zExp )
  446 + || ( ( zExp == 0x7FD )
  447 + && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
  448 + ) {
  449 + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
  450 + return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
  451 + }
  452 + if ( zExp < 0 ) {
  453 + isTiny =
  454 + ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
  455 + || ( zExp < -1 )
  456 + || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
  457 + shift64RightJamming( zSig, - zExp, &zSig );
  458 + zExp = 0;
  459 + roundBits = zSig & 0x3FF;
  460 + if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
  461 + }
  462 + }
  463 + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
  464 + zSig = ( zSig + roundIncrement )>>10;
  465 + zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
  466 + if ( zSig == 0 ) zExp = 0;
  467 + return packFloat64( zSign, zExp, zSig );
  468 +
  469 +}
  470 +
  471 +/*----------------------------------------------------------------------------
  472 +| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  473 +| and significand `zSig', and returns the proper double-precision floating-
  474 +| point value corresponding to the abstract input. This routine is just like
  475 +| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  476 +| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  477 +| floating-point exponent.
  478 +*----------------------------------------------------------------------------*/
  479 +
  480 +static float64
  481 + normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
  482 +{
  483 + int8 shiftCount;
  484 +
  485 + shiftCount = countLeadingZeros64( zSig ) - 1;
  486 + return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
  487 +
  488 +}
  489 +
  490 +#ifdef FLOATX80
  491 +
  492 +/*----------------------------------------------------------------------------
  493 +| Returns the fraction bits of the extended double-precision floating-point
  494 +| value `a'.
  495 +*----------------------------------------------------------------------------*/
  496 +
  497 +INLINE bits64 extractFloatx80Frac( floatx80 a )
  498 +{
  499 +
  500 + return a.low;
  501 +
  502 +}
  503 +
  504 +/*----------------------------------------------------------------------------
  505 +| Returns the exponent bits of the extended double-precision floating-point
  506 +| value `a'.
  507 +*----------------------------------------------------------------------------*/
  508 +
  509 +INLINE int32 extractFloatx80Exp( floatx80 a )
  510 +{
  511 +
  512 + return a.high & 0x7FFF;
  513 +
  514 +}
  515 +
  516 +/*----------------------------------------------------------------------------
  517 +| Returns the sign bit of the extended double-precision floating-point value
  518 +| `a'.
  519 +*----------------------------------------------------------------------------*/
  520 +
  521 +INLINE flag extractFloatx80Sign( floatx80 a )
  522 +{
  523 +
  524 + return a.high>>15;
  525 +
  526 +}
  527 +
  528 +/*----------------------------------------------------------------------------
  529 +| Normalizes the subnormal extended double-precision floating-point value
  530 +| represented by the denormalized significand `aSig'. The normalized exponent
  531 +| and significand are stored at the locations pointed to by `zExpPtr' and
  532 +| `zSigPtr', respectively.
  533 +*----------------------------------------------------------------------------*/
  534 +
  535 +static void
  536 + normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
  537 +{
  538 + int8 shiftCount;
  539 +
  540 + shiftCount = countLeadingZeros64( aSig );
  541 + *zSigPtr = aSig<<shiftCount;
  542 + *zExpPtr = 1 - shiftCount;
  543 +
  544 +}
  545 +
  546 +/*----------------------------------------------------------------------------
  547 +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  548 +| extended double-precision floating-point value, returning the result.
  549 +*----------------------------------------------------------------------------*/
  550 +
  551 +INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
  552 +{
  553 + floatx80 z;
  554 +
  555 + z.low = zSig;
  556 + z.high = ( ( (bits16) zSign )<<15 ) + zExp;
  557 + return z;
  558 +
  559 +}
  560 +
  561 +/*----------------------------------------------------------------------------
  562 +| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  563 +| and extended significand formed by the concatenation of `zSig0' and `zSig1',
  564 +| and returns the proper extended double-precision floating-point value
  565 +| corresponding to the abstract input. Ordinarily, the abstract value is
  566 +| rounded and packed into the extended double-precision format, with the
  567 +| inexact exception raised if the abstract input cannot be represented
  568 +| exactly. However, if the abstract value is too large, the overflow and
  569 +| inexact exceptions are raised and an infinity or maximal finite value is
  570 +| returned. If the abstract value is too small, the input value is rounded to
  571 +| a subnormal number, and the underflow and inexact exceptions are raised if
  572 +| the abstract input cannot be represented exactly as a subnormal extended
  573 +| double-precision floating-point number.
  574 +| If `roundingPrecision' is 32 or 64, the result is rounded to the same
  575 +| number of bits as single or double precision, respectively. Otherwise, the
  576 +| result is rounded to the full precision of the extended double-precision
  577 +| format.
  578 +| The input significand must be normalized or smaller. If the input
  579 +| significand is not normalized, `zExp' must be 0; in that case, the result
  580 +| returned is a subnormal number, and it must not require rounding. The
  581 +| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  582 +| Floating-Point Arithmetic.
  583 +*----------------------------------------------------------------------------*/
  584 +
  585 +static floatx80
  586 + roundAndPackFloatx80(
  587 + int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
  588 + STATUS_PARAM)
  589 +{
  590 + int8 roundingMode;
  591 + flag roundNearestEven, increment, isTiny;
  592 + int64 roundIncrement, roundMask, roundBits;
  593 +
  594 + roundingMode = STATUS(float_rounding_mode);
  595 + roundNearestEven = ( roundingMode == float_round_nearest_even );
  596 + if ( roundingPrecision == 80 ) goto precision80;
  597 + if ( roundingPrecision == 64 ) {
  598 + roundIncrement = LIT64( 0x0000000000000400 );
  599 + roundMask = LIT64( 0x00000000000007FF );
  600 + }
  601 + else if ( roundingPrecision == 32 ) {
  602 + roundIncrement = LIT64( 0x0000008000000000 );
  603 + roundMask = LIT64( 0x000000FFFFFFFFFF );
  604 + }
  605 + else {
  606 + goto precision80;
  607 + }
  608 + zSig0 |= ( zSig1 != 0 );
  609 + if ( ! roundNearestEven ) {
  610 + if ( roundingMode == float_round_to_zero ) {
  611 + roundIncrement = 0;
  612 + }
  613 + else {
  614 + roundIncrement = roundMask;
  615 + if ( zSign ) {
  616 + if ( roundingMode == float_round_up ) roundIncrement = 0;
  617 + }
  618 + else {
  619 + if ( roundingMode == float_round_down ) roundIncrement = 0;
  620 + }
  621 + }
  622 + }
  623 + roundBits = zSig0 & roundMask;
  624 + if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
  625 + if ( ( 0x7FFE < zExp )
  626 + || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
  627 + ) {
  628 + goto overflow;
  629 + }
  630 + if ( zExp <= 0 ) {
  631 + isTiny =
  632 + ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
  633 + || ( zExp < 0 )
  634 + || ( zSig0 <= zSig0 + roundIncrement );
  635 + shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
  636 + zExp = 0;
  637 + roundBits = zSig0 & roundMask;
  638 + if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
  639 + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
  640 + zSig0 += roundIncrement;
  641 + if ( (sbits64) zSig0 < 0 ) zExp = 1;
  642 + roundIncrement = roundMask + 1;
  643 + if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
  644 + roundMask |= roundIncrement;
  645 + }
  646 + zSig0 &= ~ roundMask;
  647 + return packFloatx80( zSign, zExp, zSig0 );
  648 + }
  649 + }
  650 + if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
  651 + zSig0 += roundIncrement;
  652 + if ( zSig0 < roundIncrement ) {
  653 + ++zExp;
  654 + zSig0 = LIT64( 0x8000000000000000 );
  655 + }
  656 + roundIncrement = roundMask + 1;
  657 + if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
  658 + roundMask |= roundIncrement;
  659 + }
  660 + zSig0 &= ~ roundMask;
  661 + if ( zSig0 == 0 ) zExp = 0;
  662 + return packFloatx80( zSign, zExp, zSig0 );
  663 + precision80:
  664 + increment = ( (sbits64) zSig1 < 0 );
  665 + if ( ! roundNearestEven ) {
  666 + if ( roundingMode == float_round_to_zero ) {
  667 + increment = 0;
  668 + }
  669 + else {
  670 + if ( zSign ) {
  671 + increment = ( roundingMode == float_round_down ) && zSig1;
  672 + }
  673 + else {
  674 + increment = ( roundingMode == float_round_up ) && zSig1;
  675 + }
  676 + }
  677 + }
  678 + if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
  679 + if ( ( 0x7FFE < zExp )
  680 + || ( ( zExp == 0x7FFE )
  681 + && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
  682 + && increment
  683 + )
  684 + ) {
  685 + roundMask = 0;
  686 + overflow:
  687 + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
  688 + if ( ( roundingMode == float_round_to_zero )
  689 + || ( zSign && ( roundingMode == float_round_up ) )
  690 + || ( ! zSign && ( roundingMode == float_round_down ) )
  691 + ) {
  692 + return packFloatx80( zSign, 0x7FFE, ~ roundMask );
  693 + }
  694 + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  695 + }
  696 + if ( zExp <= 0 ) {
  697 + isTiny =
  698 + ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
  699 + || ( zExp < 0 )
  700 + || ! increment
  701 + || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
  702 + shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
  703 + zExp = 0;
  704 + if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
  705 + if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
  706 + if ( roundNearestEven ) {
  707 + increment = ( (sbits64) zSig1 < 0 );
  708 + }
  709 + else {
  710 + if ( zSign ) {
  711 + increment = ( roundingMode == float_round_down ) && zSig1;
  712 + }
  713 + else {
  714 + increment = ( roundingMode == float_round_up ) && zSig1;
  715 + }
  716 + }
  717 + if ( increment ) {
  718 + ++zSig0;
  719 + zSig0 &=
  720 + ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
  721 + if ( (sbits64) zSig0 < 0 ) zExp = 1;
  722 + }
  723 + return packFloatx80( zSign, zExp, zSig0 );
  724 + }
  725 + }
  726 + if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
  727 + if ( increment ) {
  728 + ++zSig0;
  729 + if ( zSig0 == 0 ) {
  730 + ++zExp;
  731 + zSig0 = LIT64( 0x8000000000000000 );
  732 + }
  733 + else {
  734 + zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
  735 + }
  736 + }
  737 + else {
  738 + if ( zSig0 == 0 ) zExp = 0;
  739 + }
  740 + return packFloatx80( zSign, zExp, zSig0 );
  741 +
  742 +}
  743 +
  744 +/*----------------------------------------------------------------------------
  745 +| Takes an abstract floating-point value having sign `zSign', exponent
  746 +| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  747 +| and returns the proper extended double-precision floating-point value
  748 +| corresponding to the abstract input. This routine is just like
  749 +| `roundAndPackFloatx80' except that the input significand does not have to be
  750 +| normalized.
  751 +*----------------------------------------------------------------------------*/
  752 +
  753 +static floatx80
  754 + normalizeRoundAndPackFloatx80(
  755 + int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
  756 + STATUS_PARAM)
  757 +{
  758 + int8 shiftCount;
  759 +
  760 + if ( zSig0 == 0 ) {
  761 + zSig0 = zSig1;
  762 + zSig1 = 0;
  763 + zExp -= 64;
  764 + }
  765 + shiftCount = countLeadingZeros64( zSig0 );
  766 + shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  767 + zExp -= shiftCount;
  768 + return
  769 + roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
  770 +
  771 +}
  772 +
  773 +#endif
  774 +
  775 +#ifdef FLOAT128
  776 +
  777 +/*----------------------------------------------------------------------------
  778 +| Returns the least-significant 64 fraction bits of the quadruple-precision
  779 +| floating-point value `a'.
  780 +*----------------------------------------------------------------------------*/
  781 +
  782 +INLINE bits64 extractFloat128Frac1( float128 a )
  783 +{
  784 +
  785 + return a.low;
  786 +
  787 +}
  788 +
  789 +/*----------------------------------------------------------------------------
  790 +| Returns the most-significant 48 fraction bits of the quadruple-precision
  791 +| floating-point value `a'.
  792 +*----------------------------------------------------------------------------*/
  793 +
  794 +INLINE bits64 extractFloat128Frac0( float128 a )
  795 +{
  796 +
  797 + return a.high & LIT64( 0x0000FFFFFFFFFFFF );
  798 +
  799 +}
  800 +
  801 +/*----------------------------------------------------------------------------
  802 +| Returns the exponent bits of the quadruple-precision floating-point value
  803 +| `a'.
  804 +*----------------------------------------------------------------------------*/
  805 +
  806 +INLINE int32 extractFloat128Exp( float128 a )
  807 +{
  808 +
  809 + return ( a.high>>48 ) & 0x7FFF;
  810 +
  811 +}
  812 +
  813 +/*----------------------------------------------------------------------------
  814 +| Returns the sign bit of the quadruple-precision floating-point value `a'.
  815 +*----------------------------------------------------------------------------*/
  816 +
  817 +INLINE flag extractFloat128Sign( float128 a )
  818 +{
  819 +
  820 + return a.high>>63;
  821 +
  822 +}
  823 +
  824 +/*----------------------------------------------------------------------------
  825 +| Normalizes the subnormal quadruple-precision floating-point value
  826 +| represented by the denormalized significand formed by the concatenation of
  827 +| `aSig0' and `aSig1'. The normalized exponent is stored at the location
  828 +| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
  829 +| significand are stored at the location pointed to by `zSig0Ptr', and the
  830 +| least significant 64 bits of the normalized significand are stored at the
  831 +| location pointed to by `zSig1Ptr'.
  832 +*----------------------------------------------------------------------------*/
  833 +
  834 +static void
  835 + normalizeFloat128Subnormal(
  836 + bits64 aSig0,
  837 + bits64 aSig1,
  838 + int32 *zExpPtr,
  839 + bits64 *zSig0Ptr,
  840 + bits64 *zSig1Ptr
  841 + )
  842 +{
  843 + int8 shiftCount;
  844 +
  845 + if ( aSig0 == 0 ) {
  846 + shiftCount = countLeadingZeros64( aSig1 ) - 15;
  847 + if ( shiftCount < 0 ) {
  848 + *zSig0Ptr = aSig1>>( - shiftCount );
  849 + *zSig1Ptr = aSig1<<( shiftCount & 63 );
  850 + }
  851 + else {
  852 + *zSig0Ptr = aSig1<<shiftCount;
  853 + *zSig1Ptr = 0;
  854 + }
  855 + *zExpPtr = - shiftCount - 63;
  856 + }
  857 + else {
  858 + shiftCount = countLeadingZeros64( aSig0 ) - 15;
  859 + shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  860 + *zExpPtr = 1 - shiftCount;
  861 + }
  862 +
  863 +}
  864 +
  865 +/*----------------------------------------------------------------------------
  866 +| Packs the sign `zSign', the exponent `zExp', and the significand formed
  867 +| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  868 +| floating-point value, returning the result. After being shifted into the
  869 +| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  870 +| added together to form the most significant 32 bits of the result. This
  871 +| means that any integer portion of `zSig0' will be added into the exponent.
  872 +| Since a properly normalized significand will have an integer portion equal
  873 +| to 1, the `zExp' input should be 1 less than the desired result exponent
  874 +| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  875 +| significand.
  876 +*----------------------------------------------------------------------------*/
  877 +
  878 +INLINE float128
  879 + packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
  880 +{
  881 + float128 z;
  882 +
  883 + z.low = zSig1;
  884 + z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
  885 + return z;
  886 +
  887 +}
  888 +
  889 +/*----------------------------------------------------------------------------
  890 +| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  891 +| and extended significand formed by the concatenation of `zSig0', `zSig1',
  892 +| and `zSig2', and returns the proper quadruple-precision floating-point value
  893 +| corresponding to the abstract input. Ordinarily, the abstract value is
  894 +| simply rounded and packed into the quadruple-precision format, with the
  895 +| inexact exception raised if the abstract input cannot be represented
  896 +| exactly. However, if the abstract value is too large, the overflow and
  897 +| inexact exceptions are raised and an infinity or maximal finite value is
  898 +| returned. If the abstract value is too small, the input value is rounded to
  899 +| a subnormal number, and the underflow and inexact exceptions are raised if
  900 +| the abstract input cannot be represented exactly as a subnormal quadruple-
  901 +| precision floating-point number.
  902 +| The input significand must be normalized or smaller. If the input
  903 +| significand is not normalized, `zExp' must be 0; in that case, the result
  904 +| returned is a subnormal number, and it must not require rounding. In the
  905 +| usual case that the input significand is normalized, `zExp' must be 1 less
  906 +| than the ``true'' floating-point exponent. The handling of underflow and
  907 +| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  908 +*----------------------------------------------------------------------------*/
  909 +
  910 +static float128
  911 + roundAndPackFloat128(
  912 + flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
  913 +{
  914 + int8 roundingMode;
  915 + flag roundNearestEven, increment, isTiny;
  916 +
  917 + roundingMode = STATUS(float_rounding_mode);
  918 + roundNearestEven = ( roundingMode == float_round_nearest_even );
  919 + increment = ( (sbits64) zSig2 < 0 );
  920 + if ( ! roundNearestEven ) {
  921 + if ( roundingMode == float_round_to_zero ) {
  922 + increment = 0;
  923 + }
  924 + else {
  925 + if ( zSign ) {
  926 + increment = ( roundingMode == float_round_down ) && zSig2;
  927 + }
  928 + else {
  929 + increment = ( roundingMode == float_round_up ) && zSig2;
  930 + }
  931 + }
  932 + }
  933 + if ( 0x7FFD <= (bits32) zExp ) {
  934 + if ( ( 0x7FFD < zExp )
  935 + || ( ( zExp == 0x7FFD )
  936 + && eq128(
  937 + LIT64( 0x0001FFFFFFFFFFFF ),
  938 + LIT64( 0xFFFFFFFFFFFFFFFF ),
  939 + zSig0,
  940 + zSig1
  941 + )
  942 + && increment
  943 + )
  944 + ) {
  945 + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
  946 + if ( ( roundingMode == float_round_to_zero )
  947 + || ( zSign && ( roundingMode == float_round_up ) )
  948 + || ( ! zSign && ( roundingMode == float_round_down ) )
  949 + ) {
  950 + return
  951 + packFloat128(
  952 + zSign,
  953 + 0x7FFE,
  954 + LIT64( 0x0000FFFFFFFFFFFF ),
  955 + LIT64( 0xFFFFFFFFFFFFFFFF )
  956 + );
  957 + }
  958 + return packFloat128( zSign, 0x7FFF, 0, 0 );
  959 + }
  960 + if ( zExp < 0 ) {
  961 + isTiny =
  962 + ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
  963 + || ( zExp < -1 )
  964 + || ! increment
  965 + || lt128(
  966 + zSig0,
  967 + zSig1,
  968 + LIT64( 0x0001FFFFFFFFFFFF ),
  969 + LIT64( 0xFFFFFFFFFFFFFFFF )
  970 + );
  971 + shift128ExtraRightJamming(
  972 + zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
  973 + zExp = 0;
  974 + if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
  975 + if ( roundNearestEven ) {
  976 + increment = ( (sbits64) zSig2 < 0 );
  977 + }
  978 + else {
  979 + if ( zSign ) {
  980 + increment = ( roundingMode == float_round_down ) && zSig2;
  981 + }
  982 + else {
  983 + increment = ( roundingMode == float_round_up ) && zSig2;
  984 + }
  985 + }
  986 + }
  987 + }
  988 + if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
  989 + if ( increment ) {
  990 + add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
  991 + zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
  992 + }
  993 + else {
  994 + if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
  995 + }
  996 + return packFloat128( zSign, zExp, zSig0, zSig1 );
  997 +
  998 +}
  999 +
  1000 +/*----------------------------------------------------------------------------
  1001 +| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1002 +| and significand formed by the concatenation of `zSig0' and `zSig1', and
  1003 +| returns the proper quadruple-precision floating-point value corresponding
  1004 +| to the abstract input. This routine is just like `roundAndPackFloat128'
  1005 +| except that the input significand has fewer bits and does not have to be
  1006 +| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  1007 +| point exponent.
  1008 +*----------------------------------------------------------------------------*/
  1009 +
  1010 +static float128
  1011 + normalizeRoundAndPackFloat128(
  1012 + flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
  1013 +{
  1014 + int8 shiftCount;
  1015 + bits64 zSig2;
  1016 +
  1017 + if ( zSig0 == 0 ) {
  1018 + zSig0 = zSig1;
  1019 + zSig1 = 0;
  1020 + zExp -= 64;
  1021 + }
  1022 + shiftCount = countLeadingZeros64( zSig0 ) - 15;
  1023 + if ( 0 <= shiftCount ) {
  1024 + zSig2 = 0;
  1025 + shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  1026 + }
  1027 + else {
  1028 + shift128ExtraRightJamming(
  1029 + zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
  1030 + }
  1031 + zExp -= shiftCount;
  1032 + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
  1033 +
  1034 +}
  1035 +
  1036 +#endif
  1037 +
  1038 +/*----------------------------------------------------------------------------
  1039 +| Returns the result of converting the 32-bit two's complement integer `a'
  1040 +| to the single-precision floating-point format. The conversion is performed
  1041 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1042 +*----------------------------------------------------------------------------*/
  1043 +
  1044 +float32 int32_to_float32( int32 a STATUS_PARAM )
  1045 +{
  1046 + flag zSign;
  1047 +
  1048 + if ( a == 0 ) return 0;
  1049 + if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
  1050 + zSign = ( a < 0 );
  1051 + return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
  1052 +
  1053 +}
  1054 +
  1055 +/*----------------------------------------------------------------------------
  1056 +| Returns the result of converting the 32-bit two's complement integer `a'
  1057 +| to the double-precision floating-point format. The conversion is performed
  1058 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1059 +*----------------------------------------------------------------------------*/
  1060 +
  1061 +float64 int32_to_float64( int32 a STATUS_PARAM )
  1062 +{
  1063 + flag zSign;
  1064 + uint32 absA;
  1065 + int8 shiftCount;
  1066 + bits64 zSig;
  1067 +
  1068 + if ( a == 0 ) return 0;
  1069 + zSign = ( a < 0 );
  1070 + absA = zSign ? - a : a;
  1071 + shiftCount = countLeadingZeros32( absA ) + 21;
  1072 + zSig = absA;
  1073 + return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
  1074 +
  1075 +}
  1076 +
  1077 +#ifdef FLOATX80
  1078 +
  1079 +/*----------------------------------------------------------------------------
  1080 +| Returns the result of converting the 32-bit two's complement integer `a'
  1081 +| to the extended double-precision floating-point format. The conversion
  1082 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  1083 +| Arithmetic.
  1084 +*----------------------------------------------------------------------------*/
  1085 +
  1086 +floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
  1087 +{
  1088 + flag zSign;
  1089 + uint32 absA;
  1090 + int8 shiftCount;
  1091 + bits64 zSig;
  1092 +
  1093 + if ( a == 0 ) return packFloatx80( 0, 0, 0 );
  1094 + zSign = ( a < 0 );
  1095 + absA = zSign ? - a : a;
  1096 + shiftCount = countLeadingZeros32( absA ) + 32;
  1097 + zSig = absA;
  1098 + return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
  1099 +
  1100 +}
  1101 +
  1102 +#endif
  1103 +
  1104 +#ifdef FLOAT128
  1105 +
  1106 +/*----------------------------------------------------------------------------
  1107 +| Returns the result of converting the 32-bit two's complement integer `a' to
  1108 +| the quadruple-precision floating-point format. The conversion is performed
  1109 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1110 +*----------------------------------------------------------------------------*/
  1111 +
  1112 +float128 int32_to_float128( int32 a STATUS_PARAM )
  1113 +{
  1114 + flag zSign;
  1115 + uint32 absA;
  1116 + int8 shiftCount;
  1117 + bits64 zSig0;
  1118 +
  1119 + if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
  1120 + zSign = ( a < 0 );
  1121 + absA = zSign ? - a : a;
  1122 + shiftCount = countLeadingZeros32( absA ) + 17;
  1123 + zSig0 = absA;
  1124 + return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
  1125 +
  1126 +}
  1127 +
  1128 +#endif
  1129 +
  1130 +/*----------------------------------------------------------------------------
  1131 +| Returns the result of converting the 64-bit two's complement integer `a'
  1132 +| to the single-precision floating-point format. The conversion is performed
  1133 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1134 +*----------------------------------------------------------------------------*/
  1135 +
  1136 +float32 int64_to_float32( int64 a STATUS_PARAM )
  1137 +{
  1138 + flag zSign;
  1139 + uint64 absA;
  1140 + int8 shiftCount;
  1141 +
  1142 + if ( a == 0 ) return 0;
  1143 + zSign = ( a < 0 );
  1144 + absA = zSign ? - a : a;
  1145 + shiftCount = countLeadingZeros64( absA ) - 40;
  1146 + if ( 0 <= shiftCount ) {
  1147 + return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
  1148 + }
  1149 + else {
  1150 + shiftCount += 7;
  1151 + if ( shiftCount < 0 ) {
  1152 + shift64RightJamming( absA, - shiftCount, &absA );
  1153 + }
  1154 + else {
  1155 + absA <<= shiftCount;
  1156 + }
  1157 + return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
  1158 + }
  1159 +
  1160 +}
  1161 +
  1162 +/*----------------------------------------------------------------------------
  1163 +| Returns the result of converting the 64-bit two's complement integer `a'
  1164 +| to the double-precision floating-point format. The conversion is performed
  1165 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1166 +*----------------------------------------------------------------------------*/
  1167 +
  1168 +float64 int64_to_float64( int64 a STATUS_PARAM )
  1169 +{
  1170 + flag zSign;
  1171 +
  1172 + if ( a == 0 ) return 0;
  1173 + if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
  1174 + return packFloat64( 1, 0x43E, 0 );
  1175 + }
  1176 + zSign = ( a < 0 );
  1177 + return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
  1178 +
  1179 +}
  1180 +
  1181 +#ifdef FLOATX80
  1182 +
  1183 +/*----------------------------------------------------------------------------
  1184 +| Returns the result of converting the 64-bit two's complement integer `a'
  1185 +| to the extended double-precision floating-point format. The conversion
  1186 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  1187 +| Arithmetic.
  1188 +*----------------------------------------------------------------------------*/
  1189 +
  1190 +floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
  1191 +{
  1192 + flag zSign;
  1193 + uint64 absA;
  1194 + int8 shiftCount;
  1195 +
  1196 + if ( a == 0 ) return packFloatx80( 0, 0, 0 );
  1197 + zSign = ( a < 0 );
  1198 + absA = zSign ? - a : a;
  1199 + shiftCount = countLeadingZeros64( absA );
  1200 + return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
  1201 +
  1202 +}
  1203 +
  1204 +#endif
  1205 +
  1206 +#ifdef FLOAT128
  1207 +
  1208 +/*----------------------------------------------------------------------------
  1209 +| Returns the result of converting the 64-bit two's complement integer `a' to
  1210 +| the quadruple-precision floating-point format. The conversion is performed
  1211 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1212 +*----------------------------------------------------------------------------*/
  1213 +
  1214 +float128 int64_to_float128( int64 a STATUS_PARAM )
  1215 +{
  1216 + flag zSign;
  1217 + uint64 absA;
  1218 + int8 shiftCount;
  1219 + int32 zExp;
  1220 + bits64 zSig0, zSig1;
  1221 +
  1222 + if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
  1223 + zSign = ( a < 0 );
  1224 + absA = zSign ? - a : a;
  1225 + shiftCount = countLeadingZeros64( absA ) + 49;
  1226 + zExp = 0x406E - shiftCount;
  1227 + if ( 64 <= shiftCount ) {
  1228 + zSig1 = 0;
  1229 + zSig0 = absA;
  1230 + shiftCount -= 64;
  1231 + }
  1232 + else {
  1233 + zSig1 = absA;
  1234 + zSig0 = 0;
  1235 + }
  1236 + shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  1237 + return packFloat128( zSign, zExp, zSig0, zSig1 );
  1238 +
  1239 +}
  1240 +
  1241 +#endif
  1242 +
  1243 +/*----------------------------------------------------------------------------
  1244 +| Returns the result of converting the single-precision floating-point value
  1245 +| `a' to the 32-bit two's complement integer format. The conversion is
  1246 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  1247 +| Arithmetic---which means in particular that the conversion is rounded
  1248 +| according to the current rounding mode. If `a' is a NaN, the largest
  1249 +| positive integer is returned. Otherwise, if the conversion overflows, the
  1250 +| largest integer with the same sign as `a' is returned.
  1251 +*----------------------------------------------------------------------------*/
  1252 +
  1253 +int32 float32_to_int32( float32 a STATUS_PARAM )
  1254 +{
  1255 + flag aSign;
  1256 + int16 aExp, shiftCount;
  1257 + bits32 aSig;
  1258 + bits64 aSig64;
  1259 +
  1260 + aSig = extractFloat32Frac( a );
  1261 + aExp = extractFloat32Exp( a );
  1262 + aSign = extractFloat32Sign( a );
  1263 + if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
  1264 + if ( aExp ) aSig |= 0x00800000;
  1265 + shiftCount = 0xAF - aExp;
  1266 + aSig64 = aSig;
  1267 + aSig64 <<= 32;
  1268 + if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
  1269 + return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
  1270 +
  1271 +}
  1272 +
  1273 +/*----------------------------------------------------------------------------
  1274 +| Returns the result of converting the single-precision floating-point value
  1275 +| `a' to the 32-bit two's complement integer format. The conversion is
  1276 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  1277 +| Arithmetic, except that the conversion is always rounded toward zero.
  1278 +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  1279 +| the conversion overflows, the largest integer with the same sign as `a' is
  1280 +| returned.
  1281 +*----------------------------------------------------------------------------*/
  1282 +
  1283 +int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
  1284 +{
  1285 + flag aSign;
  1286 + int16 aExp, shiftCount;
  1287 + bits32 aSig;
  1288 + int32 z;
  1289 +
  1290 + aSig = extractFloat32Frac( a );
  1291 + aExp = extractFloat32Exp( a );
  1292 + aSign = extractFloat32Sign( a );
  1293 + shiftCount = aExp - 0x9E;
  1294 + if ( 0 <= shiftCount ) {
  1295 + if ( a != 0xCF000000 ) {
  1296 + float_raise( float_flag_invalid STATUS_VAR);
  1297 + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
  1298 + }
  1299 + return (sbits32) 0x80000000;
  1300 + }
  1301 + else if ( aExp <= 0x7E ) {
  1302 + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
  1303 + return 0;
  1304 + }
  1305 + aSig = ( aSig | 0x00800000 )<<8;
  1306 + z = aSig>>( - shiftCount );
  1307 + if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
  1308 + STATUS(float_exception_flags) |= float_flag_inexact;
  1309 + }
  1310 + if ( aSign ) z = - z;
  1311 + return z;
  1312 +
  1313 +}
  1314 +
  1315 +/*----------------------------------------------------------------------------
  1316 +| Returns the result of converting the single-precision floating-point value
  1317 +| `a' to the 64-bit two's complement integer format. The conversion is
  1318 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  1319 +| Arithmetic---which means in particular that the conversion is rounded
  1320 +| according to the current rounding mode. If `a' is a NaN, the largest
  1321 +| positive integer is returned. Otherwise, if the conversion overflows, the
  1322 +| largest integer with the same sign as `a' is returned.
  1323 +*----------------------------------------------------------------------------*/
  1324 +
  1325 +int64 float32_to_int64( float32 a STATUS_PARAM )
  1326 +{
  1327 + flag aSign;
  1328 + int16 aExp, shiftCount;
  1329 + bits32 aSig;
  1330 + bits64 aSig64, aSigExtra;
  1331 +
  1332 + aSig = extractFloat32Frac( a );
  1333 + aExp = extractFloat32Exp( a );
  1334 + aSign = extractFloat32Sign( a );
  1335 + shiftCount = 0xBE - aExp;
  1336 + if ( shiftCount < 0 ) {
  1337 + float_raise( float_flag_invalid STATUS_VAR);
  1338 + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
  1339 + return LIT64( 0x7FFFFFFFFFFFFFFF );
  1340 + }
  1341 + return (sbits64) LIT64( 0x8000000000000000 );
  1342 + }
  1343 + if ( aExp ) aSig |= 0x00800000;
  1344 + aSig64 = aSig;
  1345 + aSig64 <<= 40;
  1346 + shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
  1347 + return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
  1348 +
  1349 +}
  1350 +
  1351 +/*----------------------------------------------------------------------------
  1352 +| Returns the result of converting the single-precision floating-point value
  1353 +| `a' to the 64-bit two's complement integer format. The conversion is
  1354 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  1355 +| Arithmetic, except that the conversion is always rounded toward zero. If
  1356 +| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  1357 +| conversion overflows, the largest integer with the same sign as `a' is
  1358 +| returned.
  1359 +*----------------------------------------------------------------------------*/
  1360 +
  1361 +int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
  1362 +{
  1363 + flag aSign;
  1364 + int16 aExp, shiftCount;
  1365 + bits32 aSig;
  1366 + bits64 aSig64;
  1367 + int64 z;
  1368 +
  1369 + aSig = extractFloat32Frac( a );
  1370 + aExp = extractFloat32Exp( a );
  1371 + aSign = extractFloat32Sign( a );
  1372 + shiftCount = aExp - 0xBE;
  1373 + if ( 0 <= shiftCount ) {
  1374 + if ( a != 0xDF000000 ) {
  1375 + float_raise( float_flag_invalid STATUS_VAR);
  1376 + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
  1377 + return LIT64( 0x7FFFFFFFFFFFFFFF );
  1378 + }
  1379 + }
  1380 + return (sbits64) LIT64( 0x8000000000000000 );
  1381 + }
  1382 + else if ( aExp <= 0x7E ) {
  1383 + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
  1384 + return 0;
  1385 + }
  1386 + aSig64 = aSig | 0x00800000;
  1387 + aSig64 <<= 40;
  1388 + z = aSig64>>( - shiftCount );
  1389 + if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
  1390 + STATUS(float_exception_flags) |= float_flag_inexact;
  1391 + }
  1392 + if ( aSign ) z = - z;
  1393 + return z;
  1394 +
  1395 +}
  1396 +
  1397 +/*----------------------------------------------------------------------------
  1398 +| Returns the result of converting the single-precision floating-point value
  1399 +| `a' to the double-precision floating-point format. The conversion is
  1400 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  1401 +| Arithmetic.
  1402 +*----------------------------------------------------------------------------*/
  1403 +
  1404 +float64 float32_to_float64( float32 a STATUS_PARAM )
  1405 +{
  1406 + flag aSign;
  1407 + int16 aExp;
  1408 + bits32 aSig;
  1409 +
  1410 + aSig = extractFloat32Frac( a );
  1411 + aExp = extractFloat32Exp( a );
  1412 + aSign = extractFloat32Sign( a );
  1413 + if ( aExp == 0xFF ) {
  1414 + if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
  1415 + return packFloat64( aSign, 0x7FF, 0 );
  1416 + }
  1417 + if ( aExp == 0 ) {
  1418 + if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
  1419 + normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  1420 + --aExp;
  1421 + }
  1422 + return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
  1423 +
  1424 +}
  1425 +
  1426 +#ifdef FLOATX80
  1427 +
  1428 +/*----------------------------------------------------------------------------
  1429 +| Returns the result of converting the single-precision floating-point value
  1430 +| `a' to the extended double-precision floating-point format. The conversion
  1431 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  1432 +| Arithmetic.
  1433 +*----------------------------------------------------------------------------*/
  1434 +
  1435 +floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
  1436 +{
  1437 + flag aSign;
  1438 + int16 aExp;
  1439 + bits32 aSig;
  1440 +
  1441 + aSig = extractFloat32Frac( a );
  1442 + aExp = extractFloat32Exp( a );
  1443 + aSign = extractFloat32Sign( a );
  1444 + if ( aExp == 0xFF ) {
  1445 + if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
  1446 + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  1447 + }
  1448 + if ( aExp == 0 ) {
  1449 + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
  1450 + normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  1451 + }
  1452 + aSig |= 0x00800000;
  1453 + return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
  1454 +
  1455 +}
  1456 +
  1457 +#endif
  1458 +
  1459 +#ifdef FLOAT128
  1460 +
  1461 +/*----------------------------------------------------------------------------
  1462 +| Returns the result of converting the single-precision floating-point value
  1463 +| `a' to the double-precision floating-point format. The conversion is
  1464 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  1465 +| Arithmetic.
  1466 +*----------------------------------------------------------------------------*/
  1467 +
  1468 +float128 float32_to_float128( float32 a STATUS_PARAM )
  1469 +{
  1470 + flag aSign;
  1471 + int16 aExp;
  1472 + bits32 aSig;
  1473 +
  1474 + aSig = extractFloat32Frac( a );
  1475 + aExp = extractFloat32Exp( a );
  1476 + aSign = extractFloat32Sign( a );
  1477 + if ( aExp == 0xFF ) {
  1478 + if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
  1479 + return packFloat128( aSign, 0x7FFF, 0, 0 );
  1480 + }
  1481 + if ( aExp == 0 ) {
  1482 + if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
  1483 + normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  1484 + --aExp;
  1485 + }
  1486 + return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
  1487 +
  1488 +}
  1489 +
  1490 +#endif
  1491 +
  1492 +/*----------------------------------------------------------------------------
  1493 +| Rounds the single-precision floating-point value `a' to an integer, and
  1494 +| returns the result as a single-precision floating-point value. The
  1495 +| operation is performed according to the IEC/IEEE Standard for Binary
  1496 +| Floating-Point Arithmetic.
  1497 +*----------------------------------------------------------------------------*/
  1498 +
  1499 +float32 float32_round_to_int( float32 a STATUS_PARAM)
  1500 +{
  1501 + flag aSign;
  1502 + int16 aExp;
  1503 + bits32 lastBitMask, roundBitsMask;
  1504 + int8 roundingMode;
  1505 + float32 z;
  1506 +
  1507 + aExp = extractFloat32Exp( a );
  1508 + if ( 0x96 <= aExp ) {
  1509 + if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
  1510 + return propagateFloat32NaN( a, a STATUS_VAR );
  1511 + }
  1512 + return a;
  1513 + }
  1514 + if ( aExp <= 0x7E ) {
  1515 + if ( (bits32) ( a<<1 ) == 0 ) return a;
  1516 + STATUS(float_exception_flags) |= float_flag_inexact;
  1517 + aSign = extractFloat32Sign( a );
  1518 + switch ( STATUS(float_rounding_mode) ) {
  1519 + case float_round_nearest_even:
  1520 + if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
  1521 + return packFloat32( aSign, 0x7F, 0 );
  1522 + }
  1523 + break;
  1524 + case float_round_down:
  1525 + return aSign ? 0xBF800000 : 0;
  1526 + case float_round_up:
  1527 + return aSign ? 0x80000000 : 0x3F800000;
  1528 + }
  1529 + return packFloat32( aSign, 0, 0 );
  1530 + }
  1531 + lastBitMask = 1;
  1532 + lastBitMask <<= 0x96 - aExp;
  1533 + roundBitsMask = lastBitMask - 1;
  1534 + z = a;
  1535 + roundingMode = STATUS(float_rounding_mode);
  1536 + if ( roundingMode == float_round_nearest_even ) {
  1537 + z += lastBitMask>>1;
  1538 + if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
  1539 + }
  1540 + else if ( roundingMode != float_round_to_zero ) {
  1541 + if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
  1542 + z += roundBitsMask;
  1543 + }
  1544 + }
  1545 + z &= ~ roundBitsMask;
  1546 + if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
  1547 + return z;
  1548 +
  1549 +}
  1550 +
  1551 +/*----------------------------------------------------------------------------
  1552 +| Returns the result of adding the absolute values of the single-precision
  1553 +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  1554 +| before being returned. `zSign' is ignored if the result is a NaN.
  1555 +| The addition is performed according to the IEC/IEEE Standard for Binary
  1556 +| Floating-Point Arithmetic.
  1557 +*----------------------------------------------------------------------------*/
  1558 +
  1559 +static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
  1560 +{
  1561 + int16 aExp, bExp, zExp;
  1562 + bits32 aSig, bSig, zSig;
  1563 + int16 expDiff;
  1564 +
  1565 + aSig = extractFloat32Frac( a );
  1566 + aExp = extractFloat32Exp( a );
  1567 + bSig = extractFloat32Frac( b );
  1568 + bExp = extractFloat32Exp( b );
  1569 + expDiff = aExp - bExp;
  1570 + aSig <<= 6;
  1571 + bSig <<= 6;
  1572 + if ( 0 < expDiff ) {
  1573 + if ( aExp == 0xFF ) {
  1574 + if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1575 + return a;
  1576 + }
  1577 + if ( bExp == 0 ) {
  1578 + --expDiff;
  1579 + }
  1580 + else {
  1581 + bSig |= 0x20000000;
  1582 + }
  1583 + shift32RightJamming( bSig, expDiff, &bSig );
  1584 + zExp = aExp;
  1585 + }
  1586 + else if ( expDiff < 0 ) {
  1587 + if ( bExp == 0xFF ) {
  1588 + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1589 + return packFloat32( zSign, 0xFF, 0 );
  1590 + }
  1591 + if ( aExp == 0 ) {
  1592 + ++expDiff;
  1593 + }
  1594 + else {
  1595 + aSig |= 0x20000000;
  1596 + }
  1597 + shift32RightJamming( aSig, - expDiff, &aSig );
  1598 + zExp = bExp;
  1599 + }
  1600 + else {
  1601 + if ( aExp == 0xFF ) {
  1602 + if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1603 + return a;
  1604 + }
  1605 + if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
  1606 + zSig = 0x40000000 + aSig + bSig;
  1607 + zExp = aExp;
  1608 + goto roundAndPack;
  1609 + }
  1610 + aSig |= 0x20000000;
  1611 + zSig = ( aSig + bSig )<<1;
  1612 + --zExp;
  1613 + if ( (sbits32) zSig < 0 ) {
  1614 + zSig = aSig + bSig;
  1615 + ++zExp;
  1616 + }
  1617 + roundAndPack:
  1618 + return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
  1619 +
  1620 +}
  1621 +
  1622 +/*----------------------------------------------------------------------------
  1623 +| Returns the result of subtracting the absolute values of the single-
  1624 +| precision floating-point values `a' and `b'. If `zSign' is 1, the
  1625 +| difference is negated before being returned. `zSign' is ignored if the
  1626 +| result is a NaN. The subtraction is performed according to the IEC/IEEE
  1627 +| Standard for Binary Floating-Point Arithmetic.
  1628 +*----------------------------------------------------------------------------*/
  1629 +
  1630 +static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
  1631 +{
  1632 + int16 aExp, bExp, zExp;
  1633 + bits32 aSig, bSig, zSig;
  1634 + int16 expDiff;
  1635 +
  1636 + aSig = extractFloat32Frac( a );
  1637 + aExp = extractFloat32Exp( a );
  1638 + bSig = extractFloat32Frac( b );
  1639 + bExp = extractFloat32Exp( b );
  1640 + expDiff = aExp - bExp;
  1641 + aSig <<= 7;
  1642 + bSig <<= 7;
  1643 + if ( 0 < expDiff ) goto aExpBigger;
  1644 + if ( expDiff < 0 ) goto bExpBigger;
  1645 + if ( aExp == 0xFF ) {
  1646 + if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1647 + float_raise( float_flag_invalid STATUS_VAR);
  1648 + return float32_default_nan;
  1649 + }
  1650 + if ( aExp == 0 ) {
  1651 + aExp = 1;
  1652 + bExp = 1;
  1653 + }
  1654 + if ( bSig < aSig ) goto aBigger;
  1655 + if ( aSig < bSig ) goto bBigger;
  1656 + return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
  1657 + bExpBigger:
  1658 + if ( bExp == 0xFF ) {
  1659 + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1660 + return packFloat32( zSign ^ 1, 0xFF, 0 );
  1661 + }
  1662 + if ( aExp == 0 ) {
  1663 + ++expDiff;
  1664 + }
  1665 + else {
  1666 + aSig |= 0x40000000;
  1667 + }
  1668 + shift32RightJamming( aSig, - expDiff, &aSig );
  1669 + bSig |= 0x40000000;
  1670 + bBigger:
  1671 + zSig = bSig - aSig;
  1672 + zExp = bExp;
  1673 + zSign ^= 1;
  1674 + goto normalizeRoundAndPack;
  1675 + aExpBigger:
  1676 + if ( aExp == 0xFF ) {
  1677 + if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1678 + return a;
  1679 + }
  1680 + if ( bExp == 0 ) {
  1681 + --expDiff;
  1682 + }
  1683 + else {
  1684 + bSig |= 0x40000000;
  1685 + }
  1686 + shift32RightJamming( bSig, expDiff, &bSig );
  1687 + aSig |= 0x40000000;
  1688 + aBigger:
  1689 + zSig = aSig - bSig;
  1690 + zExp = aExp;
  1691 + normalizeRoundAndPack:
  1692 + --zExp;
  1693 + return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
  1694 +
  1695 +}
  1696 +
  1697 +/*----------------------------------------------------------------------------
  1698 +| Returns the result of adding the single-precision floating-point values `a'
  1699 +| and `b'. The operation is performed according to the IEC/IEEE Standard for
  1700 +| Binary Floating-Point Arithmetic.
  1701 +*----------------------------------------------------------------------------*/
  1702 +
  1703 +float32 float32_add( float32 a, float32 b STATUS_PARAM )
  1704 +{
  1705 + flag aSign, bSign;
  1706 +
  1707 + aSign = extractFloat32Sign( a );
  1708 + bSign = extractFloat32Sign( b );
  1709 + if ( aSign == bSign ) {
  1710 + return addFloat32Sigs( a, b, aSign STATUS_VAR);
  1711 + }
  1712 + else {
  1713 + return subFloat32Sigs( a, b, aSign STATUS_VAR );
  1714 + }
  1715 +
  1716 +}
  1717 +
  1718 +/*----------------------------------------------------------------------------
  1719 +| Returns the result of subtracting the single-precision floating-point values
  1720 +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  1721 +| for Binary Floating-Point Arithmetic.
  1722 +*----------------------------------------------------------------------------*/
  1723 +
  1724 +float32 float32_sub( float32 a, float32 b STATUS_PARAM )
  1725 +{
  1726 + flag aSign, bSign;
  1727 +
  1728 + aSign = extractFloat32Sign( a );
  1729 + bSign = extractFloat32Sign( b );
  1730 + if ( aSign == bSign ) {
  1731 + return subFloat32Sigs( a, b, aSign STATUS_VAR );
  1732 + }
  1733 + else {
  1734 + return addFloat32Sigs( a, b, aSign STATUS_VAR );
  1735 + }
  1736 +
  1737 +}
  1738 +
  1739 +/*----------------------------------------------------------------------------
  1740 +| Returns the result of multiplying the single-precision floating-point values
  1741 +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  1742 +| for Binary Floating-Point Arithmetic.
  1743 +*----------------------------------------------------------------------------*/
  1744 +
  1745 +float32 float32_mul( float32 a, float32 b STATUS_PARAM )
  1746 +{
  1747 + flag aSign, bSign, zSign;
  1748 + int16 aExp, bExp, zExp;
  1749 + bits32 aSig, bSig;
  1750 + bits64 zSig64;
  1751 + bits32 zSig;
  1752 +
  1753 + aSig = extractFloat32Frac( a );
  1754 + aExp = extractFloat32Exp( a );
  1755 + aSign = extractFloat32Sign( a );
  1756 + bSig = extractFloat32Frac( b );
  1757 + bExp = extractFloat32Exp( b );
  1758 + bSign = extractFloat32Sign( b );
  1759 + zSign = aSign ^ bSign;
  1760 + if ( aExp == 0xFF ) {
  1761 + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
  1762 + return propagateFloat32NaN( a, b STATUS_VAR );
  1763 + }
  1764 + if ( ( bExp | bSig ) == 0 ) {
  1765 + float_raise( float_flag_invalid STATUS_VAR);
  1766 + return float32_default_nan;
  1767 + }
  1768 + return packFloat32( zSign, 0xFF, 0 );
  1769 + }
  1770 + if ( bExp == 0xFF ) {
  1771 + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1772 + if ( ( aExp | aSig ) == 0 ) {
  1773 + float_raise( float_flag_invalid STATUS_VAR);
  1774 + return float32_default_nan;
  1775 + }
  1776 + return packFloat32( zSign, 0xFF, 0 );
  1777 + }
  1778 + if ( aExp == 0 ) {
  1779 + if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
  1780 + normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  1781 + }
  1782 + if ( bExp == 0 ) {
  1783 + if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
  1784 + normalizeFloat32Subnormal( bSig, &bExp, &bSig );
  1785 + }
  1786 + zExp = aExp + bExp - 0x7F;
  1787 + aSig = ( aSig | 0x00800000 )<<7;
  1788 + bSig = ( bSig | 0x00800000 )<<8;
  1789 + shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
  1790 + zSig = zSig64;
  1791 + if ( 0 <= (sbits32) ( zSig<<1 ) ) {
  1792 + zSig <<= 1;
  1793 + --zExp;
  1794 + }
  1795 + return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
  1796 +
  1797 +}
  1798 +
  1799 +/*----------------------------------------------------------------------------
  1800 +| Returns the result of dividing the single-precision floating-point value `a'
  1801 +| by the corresponding value `b'. The operation is performed according to the
  1802 +| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1803 +*----------------------------------------------------------------------------*/
  1804 +
  1805 +float32 float32_div( float32 a, float32 b STATUS_PARAM )
  1806 +{
  1807 + flag aSign, bSign, zSign;
  1808 + int16 aExp, bExp, zExp;
  1809 + bits32 aSig, bSig, zSig;
  1810 +
  1811 + aSig = extractFloat32Frac( a );
  1812 + aExp = extractFloat32Exp( a );
  1813 + aSign = extractFloat32Sign( a );
  1814 + bSig = extractFloat32Frac( b );
  1815 + bExp = extractFloat32Exp( b );
  1816 + bSign = extractFloat32Sign( b );
  1817 + zSign = aSign ^ bSign;
  1818 + if ( aExp == 0xFF ) {
  1819 + if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1820 + if ( bExp == 0xFF ) {
  1821 + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1822 + float_raise( float_flag_invalid STATUS_VAR);
  1823 + return float32_default_nan;
  1824 + }
  1825 + return packFloat32( zSign, 0xFF, 0 );
  1826 + }
  1827 + if ( bExp == 0xFF ) {
  1828 + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1829 + return packFloat32( zSign, 0, 0 );
  1830 + }
  1831 + if ( bExp == 0 ) {
  1832 + if ( bSig == 0 ) {
  1833 + if ( ( aExp | aSig ) == 0 ) {
  1834 + float_raise( float_flag_invalid STATUS_VAR);
  1835 + return float32_default_nan;
  1836 + }
  1837 + float_raise( float_flag_divbyzero STATUS_VAR);
  1838 + return packFloat32( zSign, 0xFF, 0 );
  1839 + }
  1840 + normalizeFloat32Subnormal( bSig, &bExp, &bSig );
  1841 + }
  1842 + if ( aExp == 0 ) {
  1843 + if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
  1844 + normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  1845 + }
  1846 + zExp = aExp - bExp + 0x7D;
  1847 + aSig = ( aSig | 0x00800000 )<<7;
  1848 + bSig = ( bSig | 0x00800000 )<<8;
  1849 + if ( bSig <= ( aSig + aSig ) ) {
  1850 + aSig >>= 1;
  1851 + ++zExp;
  1852 + }
  1853 + zSig = ( ( (bits64) aSig )<<32 ) / bSig;
  1854 + if ( ( zSig & 0x3F ) == 0 ) {
  1855 + zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
  1856 + }
  1857 + return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
  1858 +
  1859 +}
  1860 +
  1861 +/*----------------------------------------------------------------------------
  1862 +| Returns the remainder of the single-precision floating-point value `a'
  1863 +| with respect to the corresponding value `b'. The operation is performed
  1864 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1865 +*----------------------------------------------------------------------------*/
  1866 +
  1867 +float32 float32_rem( float32 a, float32 b STATUS_PARAM )
  1868 +{
  1869 + flag aSign, bSign, zSign;
  1870 + int16 aExp, bExp, expDiff;
  1871 + bits32 aSig, bSig;
  1872 + bits32 q;
  1873 + bits64 aSig64, bSig64, q64;
  1874 + bits32 alternateASig;
  1875 + sbits32 sigMean;
  1876 +
  1877 + aSig = extractFloat32Frac( a );
  1878 + aExp = extractFloat32Exp( a );
  1879 + aSign = extractFloat32Sign( a );
  1880 + bSig = extractFloat32Frac( b );
  1881 + bExp = extractFloat32Exp( b );
  1882 + bSign = extractFloat32Sign( b );
  1883 + if ( aExp == 0xFF ) {
  1884 + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
  1885 + return propagateFloat32NaN( a, b STATUS_VAR );
  1886 + }
  1887 + float_raise( float_flag_invalid STATUS_VAR);
  1888 + return float32_default_nan;
  1889 + }
  1890 + if ( bExp == 0xFF ) {
  1891 + if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
  1892 + return a;
  1893 + }
  1894 + if ( bExp == 0 ) {
  1895 + if ( bSig == 0 ) {
  1896 + float_raise( float_flag_invalid STATUS_VAR);
  1897 + return float32_default_nan;
  1898 + }
  1899 + normalizeFloat32Subnormal( bSig, &bExp, &bSig );
  1900 + }
  1901 + if ( aExp == 0 ) {
  1902 + if ( aSig == 0 ) return a;
  1903 + normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  1904 + }
  1905 + expDiff = aExp - bExp;
  1906 + aSig |= 0x00800000;
  1907 + bSig |= 0x00800000;
  1908 + if ( expDiff < 32 ) {
  1909 + aSig <<= 8;
  1910 + bSig <<= 8;
  1911 + if ( expDiff < 0 ) {
  1912 + if ( expDiff < -1 ) return a;
  1913 + aSig >>= 1;
  1914 + }
  1915 + q = ( bSig <= aSig );
  1916 + if ( q ) aSig -= bSig;
  1917 + if ( 0 < expDiff ) {
  1918 + q = ( ( (bits64) aSig )<<32 ) / bSig;
  1919 + q >>= 32 - expDiff;
  1920 + bSig >>= 2;
  1921 + aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
  1922 + }
  1923 + else {
  1924 + aSig >>= 2;
  1925 + bSig >>= 2;
  1926 + }
  1927 + }
  1928 + else {
  1929 + if ( bSig <= aSig ) aSig -= bSig;
  1930 + aSig64 = ( (bits64) aSig )<<40;
  1931 + bSig64 = ( (bits64) bSig )<<40;
  1932 + expDiff -= 64;
  1933 + while ( 0 < expDiff ) {
  1934 + q64 = estimateDiv128To64( aSig64, 0, bSig64 );
  1935 + q64 = ( 2 < q64 ) ? q64 - 2 : 0;
  1936 + aSig64 = - ( ( bSig * q64 )<<38 );
  1937 + expDiff -= 62;
  1938 + }
  1939 + expDiff += 64;
  1940 + q64 = estimateDiv128To64( aSig64, 0, bSig64 );
  1941 + q64 = ( 2 < q64 ) ? q64 - 2 : 0;
  1942 + q = q64>>( 64 - expDiff );
  1943 + bSig <<= 6;
  1944 + aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
  1945 + }
  1946 + do {
  1947 + alternateASig = aSig;
  1948 + ++q;
  1949 + aSig -= bSig;
  1950 + } while ( 0 <= (sbits32) aSig );
  1951 + sigMean = aSig + alternateASig;
  1952 + if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
  1953 + aSig = alternateASig;
  1954 + }
  1955 + zSign = ( (sbits32) aSig < 0 );
  1956 + if ( zSign ) aSig = - aSig;
  1957 + return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
  1958 +
  1959 +}
  1960 +
  1961 +/*----------------------------------------------------------------------------
  1962 +| Returns the square root of the single-precision floating-point value `a'.
  1963 +| The operation is performed according to the IEC/IEEE Standard for Binary
  1964 +| Floating-Point Arithmetic.
  1965 +*----------------------------------------------------------------------------*/
  1966 +
  1967 +float32 float32_sqrt( float32 a STATUS_PARAM )
  1968 +{
  1969 + flag aSign;
  1970 + int16 aExp, zExp;
  1971 + bits32 aSig, zSig;
  1972 + bits64 rem, term;
  1973 +
  1974 + aSig = extractFloat32Frac( a );
  1975 + aExp = extractFloat32Exp( a );
  1976 + aSign = extractFloat32Sign( a );
  1977 + if ( aExp == 0xFF ) {
  1978 + if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
  1979 + if ( ! aSign ) return a;
  1980 + float_raise( float_flag_invalid STATUS_VAR);
  1981 + return float32_default_nan;
  1982 + }
  1983 + if ( aSign ) {
  1984 + if ( ( aExp | aSig ) == 0 ) return a;
  1985 + float_raise( float_flag_invalid STATUS_VAR);
  1986 + return float32_default_nan;
  1987 + }
  1988 + if ( aExp == 0 ) {
  1989 + if ( aSig == 0 ) return 0;
  1990 + normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  1991 + }
  1992 + zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
  1993 + aSig = ( aSig | 0x00800000 )<<8;
  1994 + zSig = estimateSqrt32( aExp, aSig ) + 2;
  1995 + if ( ( zSig & 0x7F ) <= 5 ) {
  1996 + if ( zSig < 2 ) {
  1997 + zSig = 0x7FFFFFFF;
  1998 + goto roundAndPack;
  1999 + }
  2000 + aSig >>= aExp & 1;
  2001 + term = ( (bits64) zSig ) * zSig;
  2002 + rem = ( ( (bits64) aSig )<<32 ) - term;
  2003 + while ( (sbits64) rem < 0 ) {
  2004 + --zSig;
  2005 + rem += ( ( (bits64) zSig )<<1 ) | 1;
  2006 + }
  2007 + zSig |= ( rem != 0 );
  2008 + }
  2009 + shift32RightJamming( zSig, 1, &zSig );
  2010 + roundAndPack:
  2011 + return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
  2012 +
  2013 +}
  2014 +
  2015 +/*----------------------------------------------------------------------------
  2016 +| Returns 1 if the single-precision floating-point value `a' is equal to
  2017 +| the corresponding value `b', and 0 otherwise. The comparison is performed
  2018 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2019 +*----------------------------------------------------------------------------*/
  2020 +
  2021 +flag float32_eq( float32 a, float32 b STATUS_PARAM )
  2022 +{
  2023 +
  2024 + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2025 + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2026 + ) {
  2027 + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
  2028 + float_raise( float_flag_invalid STATUS_VAR);
  2029 + }
  2030 + return 0;
  2031 + }
  2032 + return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
  2033 +
  2034 +}
  2035 +
  2036 +/*----------------------------------------------------------------------------
  2037 +| Returns 1 if the single-precision floating-point value `a' is less than
  2038 +| or equal to the corresponding value `b', and 0 otherwise. The comparison
  2039 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2040 +| Arithmetic.
  2041 +*----------------------------------------------------------------------------*/
  2042 +
  2043 +flag float32_le( float32 a, float32 b STATUS_PARAM )
  2044 +{
  2045 + flag aSign, bSign;
  2046 +
  2047 + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2048 + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2049 + ) {
  2050 + float_raise( float_flag_invalid STATUS_VAR);
  2051 + return 0;
  2052 + }
  2053 + aSign = extractFloat32Sign( a );
  2054 + bSign = extractFloat32Sign( b );
  2055 + if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
  2056 + return ( a == b ) || ( aSign ^ ( a < b ) );
  2057 +
  2058 +}
  2059 +
  2060 +/*----------------------------------------------------------------------------
  2061 +| Returns 1 if the single-precision floating-point value `a' is less than
  2062 +| the corresponding value `b', and 0 otherwise. The comparison is performed
  2063 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2064 +*----------------------------------------------------------------------------*/
  2065 +
  2066 +flag float32_lt( float32 a, float32 b STATUS_PARAM )
  2067 +{
  2068 + flag aSign, bSign;
  2069 +
  2070 + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2071 + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2072 + ) {
  2073 + float_raise( float_flag_invalid STATUS_VAR);
  2074 + return 0;
  2075 + }
  2076 + aSign = extractFloat32Sign( a );
  2077 + bSign = extractFloat32Sign( b );
  2078 + if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
  2079 + return ( a != b ) && ( aSign ^ ( a < b ) );
  2080 +
  2081 +}
  2082 +
  2083 +/*----------------------------------------------------------------------------
  2084 +| Returns 1 if the single-precision floating-point value `a' is equal to
  2085 +| the corresponding value `b', and 0 otherwise. The invalid exception is
  2086 +| raised if either operand is a NaN. Otherwise, the comparison is performed
  2087 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2088 +*----------------------------------------------------------------------------*/
  2089 +
  2090 +flag float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
  2091 +{
  2092 +
  2093 + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2094 + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2095 + ) {
  2096 + float_raise( float_flag_invalid STATUS_VAR);
  2097 + return 0;
  2098 + }
  2099 + return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
  2100 +
  2101 +}
  2102 +
  2103 +/*----------------------------------------------------------------------------
  2104 +| Returns 1 if the single-precision floating-point value `a' is less than or
  2105 +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  2106 +| cause an exception. Otherwise, the comparison is performed according to the
  2107 +| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2108 +*----------------------------------------------------------------------------*/
  2109 +
  2110 +flag float32_le_quiet( float32 a, float32 b STATUS_PARAM )
  2111 +{
  2112 + flag aSign, bSign;
  2113 +
  2114 + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2115 + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2116 + ) {
  2117 + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
  2118 + float_raise( float_flag_invalid STATUS_VAR);
  2119 + }
  2120 + return 0;
  2121 + }
  2122 + aSign = extractFloat32Sign( a );
  2123 + bSign = extractFloat32Sign( b );
  2124 + if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
  2125 + return ( a == b ) || ( aSign ^ ( a < b ) );
  2126 +
  2127 +}
  2128 +
  2129 +/*----------------------------------------------------------------------------
  2130 +| Returns 1 if the single-precision floating-point value `a' is less than
  2131 +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  2132 +| exception. Otherwise, the comparison is performed according to the IEC/IEEE
  2133 +| Standard for Binary Floating-Point Arithmetic.
  2134 +*----------------------------------------------------------------------------*/
  2135 +
  2136 +flag float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
  2137 +{
  2138 + flag aSign, bSign;
  2139 +
  2140 + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2141 + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2142 + ) {
  2143 + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
  2144 + float_raise( float_flag_invalid STATUS_VAR);
  2145 + }
  2146 + return 0;
  2147 + }
  2148 + aSign = extractFloat32Sign( a );
  2149 + bSign = extractFloat32Sign( b );
  2150 + if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
  2151 + return ( a != b ) && ( aSign ^ ( a < b ) );
  2152 +
  2153 +}
  2154 +
  2155 +/*----------------------------------------------------------------------------
  2156 +| Returns the result of converting the double-precision floating-point value
  2157 +| `a' to the 32-bit two's complement integer format. The conversion is
  2158 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  2159 +| Arithmetic---which means in particular that the conversion is rounded
  2160 +| according to the current rounding mode. If `a' is a NaN, the largest
  2161 +| positive integer is returned. Otherwise, if the conversion overflows, the
  2162 +| largest integer with the same sign as `a' is returned.
  2163 +*----------------------------------------------------------------------------*/
  2164 +
  2165 +int32 float64_to_int32( float64 a STATUS_PARAM )
  2166 +{
  2167 + flag aSign;
  2168 + int16 aExp, shiftCount;
  2169 + bits64 aSig;
  2170 +
  2171 + aSig = extractFloat64Frac( a );
  2172 + aExp = extractFloat64Exp( a );
  2173 + aSign = extractFloat64Sign( a );
  2174 + if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
  2175 + if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
  2176 + shiftCount = 0x42C - aExp;
  2177 + if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
  2178 + return roundAndPackInt32( aSign, aSig STATUS_VAR );
  2179 +
  2180 +}
  2181 +
  2182 +/*----------------------------------------------------------------------------
  2183 +| Returns the result of converting the double-precision floating-point value
  2184 +| `a' to the 32-bit two's complement integer format. The conversion is
  2185 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  2186 +| Arithmetic, except that the conversion is always rounded toward zero.
  2187 +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2188 +| the conversion overflows, the largest integer with the same sign as `a' is
  2189 +| returned.
  2190 +*----------------------------------------------------------------------------*/
  2191 +
  2192 +int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
  2193 +{
  2194 + flag aSign;
  2195 + int16 aExp, shiftCount;
  2196 + bits64 aSig, savedASig;
  2197 + int32 z;
  2198 +
  2199 + aSig = extractFloat64Frac( a );
  2200 + aExp = extractFloat64Exp( a );
  2201 + aSign = extractFloat64Sign( a );
  2202 + if ( 0x41E < aExp ) {
  2203 + if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
  2204 + goto invalid;
  2205 + }
  2206 + else if ( aExp < 0x3FF ) {
  2207 + if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
  2208 + return 0;
  2209 + }
  2210 + aSig |= LIT64( 0x0010000000000000 );
  2211 + shiftCount = 0x433 - aExp;
  2212 + savedASig = aSig;
  2213 + aSig >>= shiftCount;
  2214 + z = aSig;
  2215 + if ( aSign ) z = - z;
  2216 + if ( ( z < 0 ) ^ aSign ) {
  2217 + invalid:
  2218 + float_raise( float_flag_invalid STATUS_VAR);
  2219 + return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
  2220 + }
  2221 + if ( ( aSig<<shiftCount ) != savedASig ) {
  2222 + STATUS(float_exception_flags) |= float_flag_inexact;
  2223 + }
  2224 + return z;
  2225 +
  2226 +}
  2227 +
  2228 +/*----------------------------------------------------------------------------
  2229 +| Returns the result of converting the double-precision floating-point value
  2230 +| `a' to the 64-bit two's complement integer format. The conversion is
  2231 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  2232 +| Arithmetic---which means in particular that the conversion is rounded
  2233 +| according to the current rounding mode. If `a' is a NaN, the largest
  2234 +| positive integer is returned. Otherwise, if the conversion overflows, the
  2235 +| largest integer with the same sign as `a' is returned.
  2236 +*----------------------------------------------------------------------------*/
  2237 +
  2238 +int64 float64_to_int64( float64 a STATUS_PARAM )
  2239 +{
  2240 + flag aSign;
  2241 + int16 aExp, shiftCount;
  2242 + bits64 aSig, aSigExtra;
  2243 +
  2244 + aSig = extractFloat64Frac( a );
  2245 + aExp = extractFloat64Exp( a );
  2246 + aSign = extractFloat64Sign( a );
  2247 + if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
  2248 + shiftCount = 0x433 - aExp;
  2249 + if ( shiftCount <= 0 ) {
  2250 + if ( 0x43E < aExp ) {
  2251 + float_raise( float_flag_invalid STATUS_VAR);
  2252 + if ( ! aSign
  2253 + || ( ( aExp == 0x7FF )
  2254 + && ( aSig != LIT64( 0x0010000000000000 ) ) )
  2255 + ) {
  2256 + return LIT64( 0x7FFFFFFFFFFFFFFF );
  2257 + }
  2258 + return (sbits64) LIT64( 0x8000000000000000 );
  2259 + }
  2260 + aSigExtra = 0;
  2261 + aSig <<= - shiftCount;
  2262 + }
  2263 + else {
  2264 + shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
  2265 + }
  2266 + return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
  2267 +
  2268 +}
  2269 +
  2270 +/*----------------------------------------------------------------------------
  2271 +| Returns the result of converting the double-precision floating-point value
  2272 +| `a' to the 64-bit two's complement integer format. The conversion is
  2273 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  2274 +| Arithmetic, except that the conversion is always rounded toward zero.
  2275 +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2276 +| the conversion overflows, the largest integer with the same sign as `a' is
  2277 +| returned.
  2278 +*----------------------------------------------------------------------------*/
  2279 +
  2280 +int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
  2281 +{
  2282 + flag aSign;
  2283 + int16 aExp, shiftCount;
  2284 + bits64 aSig;
  2285 + int64 z;
  2286 +
  2287 + aSig = extractFloat64Frac( a );
  2288 + aExp = extractFloat64Exp( a );
  2289 + aSign = extractFloat64Sign( a );
  2290 + if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
  2291 + shiftCount = aExp - 0x433;
  2292 + if ( 0 <= shiftCount ) {
  2293 + if ( 0x43E <= aExp ) {
  2294 + if ( a != LIT64( 0xC3E0000000000000 ) ) {
  2295 + float_raise( float_flag_invalid STATUS_VAR);
  2296 + if ( ! aSign
  2297 + || ( ( aExp == 0x7FF )
  2298 + && ( aSig != LIT64( 0x0010000000000000 ) ) )
  2299 + ) {
  2300 + return LIT64( 0x7FFFFFFFFFFFFFFF );
  2301 + }
  2302 + }
  2303 + return (sbits64) LIT64( 0x8000000000000000 );
  2304 + }
  2305 + z = aSig<<shiftCount;
  2306 + }
  2307 + else {
  2308 + if ( aExp < 0x3FE ) {
  2309 + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
  2310 + return 0;
  2311 + }
  2312 + z = aSig>>( - shiftCount );
  2313 + if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
  2314 + STATUS(float_exception_flags) |= float_flag_inexact;
  2315 + }
  2316 + }
  2317 + if ( aSign ) z = - z;
  2318 + return z;
  2319 +
  2320 +}
  2321 +
  2322 +/*----------------------------------------------------------------------------
  2323 +| Returns the result of converting the double-precision floating-point value
  2324 +| `a' to the single-precision floating-point format. The conversion is
  2325 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  2326 +| Arithmetic.
  2327 +*----------------------------------------------------------------------------*/
  2328 +
  2329 +float32 float64_to_float32( float64 a STATUS_PARAM )
  2330 +{
  2331 + flag aSign;
  2332 + int16 aExp;
  2333 + bits64 aSig;
  2334 + bits32 zSig;
  2335 +
  2336 + aSig = extractFloat64Frac( a );
  2337 + aExp = extractFloat64Exp( a );
  2338 + aSign = extractFloat64Sign( a );
  2339 + if ( aExp == 0x7FF ) {
  2340 + if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
  2341 + return packFloat32( aSign, 0xFF, 0 );
  2342 + }
  2343 + shift64RightJamming( aSig, 22, &aSig );
  2344 + zSig = aSig;
  2345 + if ( aExp || zSig ) {
  2346 + zSig |= 0x40000000;
  2347 + aExp -= 0x381;
  2348 + }
  2349 + return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
  2350 +
  2351 +}
  2352 +
  2353 +#ifdef FLOATX80
  2354 +
  2355 +/*----------------------------------------------------------------------------
  2356 +| Returns the result of converting the double-precision floating-point value
  2357 +| `a' to the extended double-precision floating-point format. The conversion
  2358 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  2359 +| Arithmetic.
  2360 +*----------------------------------------------------------------------------*/
  2361 +
  2362 +floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
  2363 +{
  2364 + flag aSign;
  2365 + int16 aExp;
  2366 + bits64 aSig;
  2367 +
  2368 + aSig = extractFloat64Frac( a );
  2369 + aExp = extractFloat64Exp( a );
  2370 + aSign = extractFloat64Sign( a );
  2371 + if ( aExp == 0x7FF ) {
  2372 + if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
  2373 + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  2374 + }
  2375 + if ( aExp == 0 ) {
  2376 + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
  2377 + normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  2378 + }
  2379 + return
  2380 + packFloatx80(
  2381 + aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
  2382 +
  2383 +}
  2384 +
  2385 +#endif
  2386 +
  2387 +#ifdef FLOAT128
  2388 +
  2389 +/*----------------------------------------------------------------------------
  2390 +| Returns the result of converting the double-precision floating-point value
  2391 +| `a' to the quadruple-precision floating-point format. The conversion is
  2392 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  2393 +| Arithmetic.
  2394 +*----------------------------------------------------------------------------*/
  2395 +
  2396 +float128 float64_to_float128( float64 a STATUS_PARAM )
  2397 +{
  2398 + flag aSign;
  2399 + int16 aExp;
  2400 + bits64 aSig, zSig0, zSig1;
  2401 +
  2402 + aSig = extractFloat64Frac( a );
  2403 + aExp = extractFloat64Exp( a );
  2404 + aSign = extractFloat64Sign( a );
  2405 + if ( aExp == 0x7FF ) {
  2406 + if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
  2407 + return packFloat128( aSign, 0x7FFF, 0, 0 );
  2408 + }
  2409 + if ( aExp == 0 ) {
  2410 + if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
  2411 + normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  2412 + --aExp;
  2413 + }
  2414 + shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
  2415 + return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
  2416 +
  2417 +}
  2418 +
  2419 +#endif
  2420 +
  2421 +/*----------------------------------------------------------------------------
  2422 +| Rounds the double-precision floating-point value `a' to an integer, and
  2423 +| returns the result as a double-precision floating-point value. The
  2424 +| operation is performed according to the IEC/IEEE Standard for Binary
  2425 +| Floating-Point Arithmetic.
  2426 +*----------------------------------------------------------------------------*/
  2427 +
  2428 +float64 float64_round_to_int( float64 a STATUS_PARAM )
  2429 +{
  2430 + flag aSign;
  2431 + int16 aExp;
  2432 + bits64 lastBitMask, roundBitsMask;
  2433 + int8 roundingMode;
  2434 + float64 z;
  2435 +
  2436 + aExp = extractFloat64Exp( a );
  2437 + if ( 0x433 <= aExp ) {
  2438 + if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
  2439 + return propagateFloat64NaN( a, a STATUS_VAR );
  2440 + }
  2441 + return a;
  2442 + }
  2443 + if ( aExp < 0x3FF ) {
  2444 + if ( (bits64) ( a<<1 ) == 0 ) return a;
  2445 + STATUS(float_exception_flags) |= float_flag_inexact;
  2446 + aSign = extractFloat64Sign( a );
  2447 + switch ( STATUS(float_rounding_mode) ) {
  2448 + case float_round_nearest_even:
  2449 + if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
  2450 + return packFloat64( aSign, 0x3FF, 0 );
  2451 + }
  2452 + break;
  2453 + case float_round_down:
  2454 + return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
  2455 + case float_round_up:
  2456 + return
  2457 + aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
  2458 + }
  2459 + return packFloat64( aSign, 0, 0 );
  2460 + }
  2461 + lastBitMask = 1;
  2462 + lastBitMask <<= 0x433 - aExp;
  2463 + roundBitsMask = lastBitMask - 1;
  2464 + z = a;
  2465 + roundingMode = STATUS(float_rounding_mode);
  2466 + if ( roundingMode == float_round_nearest_even ) {
  2467 + z += lastBitMask>>1;
  2468 + if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
  2469 + }
  2470 + else if ( roundingMode != float_round_to_zero ) {
  2471 + if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
  2472 + z += roundBitsMask;
  2473 + }
  2474 + }
  2475 + z &= ~ roundBitsMask;
  2476 + if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
  2477 + return z;
  2478 +
  2479 +}
  2480 +
  2481 +/*----------------------------------------------------------------------------
  2482 +| Returns the result of adding the absolute values of the double-precision
  2483 +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  2484 +| before being returned. `zSign' is ignored if the result is a NaN.
  2485 +| The addition is performed according to the IEC/IEEE Standard for Binary
  2486 +| Floating-Point Arithmetic.
  2487 +*----------------------------------------------------------------------------*/
  2488 +
  2489 +static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
  2490 +{
  2491 + int16 aExp, bExp, zExp;
  2492 + bits64 aSig, bSig, zSig;
  2493 + int16 expDiff;
  2494 +
  2495 + aSig = extractFloat64Frac( a );
  2496 + aExp = extractFloat64Exp( a );
  2497 + bSig = extractFloat64Frac( b );
  2498 + bExp = extractFloat64Exp( b );
  2499 + expDiff = aExp - bExp;
  2500 + aSig <<= 9;
  2501 + bSig <<= 9;
  2502 + if ( 0 < expDiff ) {
  2503 + if ( aExp == 0x7FF ) {
  2504 + if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2505 + return a;
  2506 + }
  2507 + if ( bExp == 0 ) {
  2508 + --expDiff;
  2509 + }
  2510 + else {
  2511 + bSig |= LIT64( 0x2000000000000000 );
  2512 + }
  2513 + shift64RightJamming( bSig, expDiff, &bSig );
  2514 + zExp = aExp;
  2515 + }
  2516 + else if ( expDiff < 0 ) {
  2517 + if ( bExp == 0x7FF ) {
  2518 + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2519 + return packFloat64( zSign, 0x7FF, 0 );
  2520 + }
  2521 + if ( aExp == 0 ) {
  2522 + ++expDiff;
  2523 + }
  2524 + else {
  2525 + aSig |= LIT64( 0x2000000000000000 );
  2526 + }
  2527 + shift64RightJamming( aSig, - expDiff, &aSig );
  2528 + zExp = bExp;
  2529 + }
  2530 + else {
  2531 + if ( aExp == 0x7FF ) {
  2532 + if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2533 + return a;
  2534 + }
  2535 + if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
  2536 + zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
  2537 + zExp = aExp;
  2538 + goto roundAndPack;
  2539 + }
  2540 + aSig |= LIT64( 0x2000000000000000 );
  2541 + zSig = ( aSig + bSig )<<1;
  2542 + --zExp;
  2543 + if ( (sbits64) zSig < 0 ) {
  2544 + zSig = aSig + bSig;
  2545 + ++zExp;
  2546 + }
  2547 + roundAndPack:
  2548 + return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
  2549 +
  2550 +}
  2551 +
  2552 +/*----------------------------------------------------------------------------
  2553 +| Returns the result of subtracting the absolute values of the double-
  2554 +| precision floating-point values `a' and `b'. If `zSign' is 1, the
  2555 +| difference is negated before being returned. `zSign' is ignored if the
  2556 +| result is a NaN. The subtraction is performed according to the IEC/IEEE
  2557 +| Standard for Binary Floating-Point Arithmetic.
  2558 +*----------------------------------------------------------------------------*/
  2559 +
  2560 +static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
  2561 +{
  2562 + int16 aExp, bExp, zExp;
  2563 + bits64 aSig, bSig, zSig;
  2564 + int16 expDiff;
  2565 +
  2566 + aSig = extractFloat64Frac( a );
  2567 + aExp = extractFloat64Exp( a );
  2568 + bSig = extractFloat64Frac( b );
  2569 + bExp = extractFloat64Exp( b );
  2570 + expDiff = aExp - bExp;
  2571 + aSig <<= 10;
  2572 + bSig <<= 10;
  2573 + if ( 0 < expDiff ) goto aExpBigger;
  2574 + if ( expDiff < 0 ) goto bExpBigger;
  2575 + if ( aExp == 0x7FF ) {
  2576 + if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2577 + float_raise( float_flag_invalid STATUS_VAR);
  2578 + return float64_default_nan;
  2579 + }
  2580 + if ( aExp == 0 ) {
  2581 + aExp = 1;
  2582 + bExp = 1;
  2583 + }
  2584 + if ( bSig < aSig ) goto aBigger;
  2585 + if ( aSig < bSig ) goto bBigger;
  2586 + return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
  2587 + bExpBigger:
  2588 + if ( bExp == 0x7FF ) {
  2589 + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2590 + return packFloat64( zSign ^ 1, 0x7FF, 0 );
  2591 + }
  2592 + if ( aExp == 0 ) {
  2593 + ++expDiff;
  2594 + }
  2595 + else {
  2596 + aSig |= LIT64( 0x4000000000000000 );
  2597 + }
  2598 + shift64RightJamming( aSig, - expDiff, &aSig );
  2599 + bSig |= LIT64( 0x4000000000000000 );
  2600 + bBigger:
  2601 + zSig = bSig - aSig;
  2602 + zExp = bExp;
  2603 + zSign ^= 1;
  2604 + goto normalizeRoundAndPack;
  2605 + aExpBigger:
  2606 + if ( aExp == 0x7FF ) {
  2607 + if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2608 + return a;
  2609 + }
  2610 + if ( bExp == 0 ) {
  2611 + --expDiff;
  2612 + }
  2613 + else {
  2614 + bSig |= LIT64( 0x4000000000000000 );
  2615 + }
  2616 + shift64RightJamming( bSig, expDiff, &bSig );
  2617 + aSig |= LIT64( 0x4000000000000000 );
  2618 + aBigger:
  2619 + zSig = aSig - bSig;
  2620 + zExp = aExp;
  2621 + normalizeRoundAndPack:
  2622 + --zExp;
  2623 + return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
  2624 +
  2625 +}
  2626 +
  2627 +/*----------------------------------------------------------------------------
  2628 +| Returns the result of adding the double-precision floating-point values `a'
  2629 +| and `b'. The operation is performed according to the IEC/IEEE Standard for
  2630 +| Binary Floating-Point Arithmetic.
  2631 +*----------------------------------------------------------------------------*/
  2632 +
  2633 +float64 float64_add( float64 a, float64 b STATUS_PARAM )
  2634 +{
  2635 + flag aSign, bSign;
  2636 +
  2637 + aSign = extractFloat64Sign( a );
  2638 + bSign = extractFloat64Sign( b );
  2639 + if ( aSign == bSign ) {
  2640 + return addFloat64Sigs( a, b, aSign STATUS_VAR );
  2641 + }
  2642 + else {
  2643 + return subFloat64Sigs( a, b, aSign STATUS_VAR );
  2644 + }
  2645 +
  2646 +}
  2647 +
  2648 +/*----------------------------------------------------------------------------
  2649 +| Returns the result of subtracting the double-precision floating-point values
  2650 +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  2651 +| for Binary Floating-Point Arithmetic.
  2652 +*----------------------------------------------------------------------------*/
  2653 +
  2654 +float64 float64_sub( float64 a, float64 b STATUS_PARAM )
  2655 +{
  2656 + flag aSign, bSign;
  2657 +
  2658 + aSign = extractFloat64Sign( a );
  2659 + bSign = extractFloat64Sign( b );
  2660 + if ( aSign == bSign ) {
  2661 + return subFloat64Sigs( a, b, aSign STATUS_VAR );
  2662 + }
  2663 + else {
  2664 + return addFloat64Sigs( a, b, aSign STATUS_VAR );
  2665 + }
  2666 +
  2667 +}
  2668 +
  2669 +/*----------------------------------------------------------------------------
  2670 +| Returns the result of multiplying the double-precision floating-point values
  2671 +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  2672 +| for Binary Floating-Point Arithmetic.
  2673 +*----------------------------------------------------------------------------*/
  2674 +
  2675 +float64 float64_mul( float64 a, float64 b STATUS_PARAM )
  2676 +{
  2677 + flag aSign, bSign, zSign;
  2678 + int16 aExp, bExp, zExp;
  2679 + bits64 aSig, bSig, zSig0, zSig1;
  2680 +
  2681 + aSig = extractFloat64Frac( a );
  2682 + aExp = extractFloat64Exp( a );
  2683 + aSign = extractFloat64Sign( a );
  2684 + bSig = extractFloat64Frac( b );
  2685 + bExp = extractFloat64Exp( b );
  2686 + bSign = extractFloat64Sign( b );
  2687 + zSign = aSign ^ bSign;
  2688 + if ( aExp == 0x7FF ) {
  2689 + if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
  2690 + return propagateFloat64NaN( a, b STATUS_VAR );
  2691 + }
  2692 + if ( ( bExp | bSig ) == 0 ) {
  2693 + float_raise( float_flag_invalid STATUS_VAR);
  2694 + return float64_default_nan;
  2695 + }
  2696 + return packFloat64( zSign, 0x7FF, 0 );
  2697 + }
  2698 + if ( bExp == 0x7FF ) {
  2699 + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2700 + if ( ( aExp | aSig ) == 0 ) {
  2701 + float_raise( float_flag_invalid STATUS_VAR);
  2702 + return float64_default_nan;
  2703 + }
  2704 + return packFloat64( zSign, 0x7FF, 0 );
  2705 + }
  2706 + if ( aExp == 0 ) {
  2707 + if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
  2708 + normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  2709 + }
  2710 + if ( bExp == 0 ) {
  2711 + if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
  2712 + normalizeFloat64Subnormal( bSig, &bExp, &bSig );
  2713 + }
  2714 + zExp = aExp + bExp - 0x3FF;
  2715 + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
  2716 + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
  2717 + mul64To128( aSig, bSig, &zSig0, &zSig1 );
  2718 + zSig0 |= ( zSig1 != 0 );
  2719 + if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
  2720 + zSig0 <<= 1;
  2721 + --zExp;
  2722 + }
  2723 + return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
  2724 +
  2725 +}
  2726 +
  2727 +/*----------------------------------------------------------------------------
  2728 +| Returns the result of dividing the double-precision floating-point value `a'
  2729 +| by the corresponding value `b'. The operation is performed according to
  2730 +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2731 +*----------------------------------------------------------------------------*/
  2732 +
  2733 +float64 float64_div( float64 a, float64 b STATUS_PARAM )
  2734 +{
  2735 + flag aSign, bSign, zSign;
  2736 + int16 aExp, bExp, zExp;
  2737 + bits64 aSig, bSig, zSig;
  2738 + bits64 rem0, rem1;
  2739 + bits64 term0, term1;
  2740 +
  2741 + aSig = extractFloat64Frac( a );
  2742 + aExp = extractFloat64Exp( a );
  2743 + aSign = extractFloat64Sign( a );
  2744 + bSig = extractFloat64Frac( b );
  2745 + bExp = extractFloat64Exp( b );
  2746 + bSign = extractFloat64Sign( b );
  2747 + zSign = aSign ^ bSign;
  2748 + if ( aExp == 0x7FF ) {
  2749 + if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2750 + if ( bExp == 0x7FF ) {
  2751 + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2752 + float_raise( float_flag_invalid STATUS_VAR);
  2753 + return float64_default_nan;
  2754 + }
  2755 + return packFloat64( zSign, 0x7FF, 0 );
  2756 + }
  2757 + if ( bExp == 0x7FF ) {
  2758 + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2759 + return packFloat64( zSign, 0, 0 );
  2760 + }
  2761 + if ( bExp == 0 ) {
  2762 + if ( bSig == 0 ) {
  2763 + if ( ( aExp | aSig ) == 0 ) {
  2764 + float_raise( float_flag_invalid STATUS_VAR);
  2765 + return float64_default_nan;
  2766 + }
  2767 + float_raise( float_flag_divbyzero STATUS_VAR);
  2768 + return packFloat64( zSign, 0x7FF, 0 );
  2769 + }
  2770 + normalizeFloat64Subnormal( bSig, &bExp, &bSig );
  2771 + }
  2772 + if ( aExp == 0 ) {
  2773 + if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
  2774 + normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  2775 + }
  2776 + zExp = aExp - bExp + 0x3FD;
  2777 + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
  2778 + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
  2779 + if ( bSig <= ( aSig + aSig ) ) {
  2780 + aSig >>= 1;
  2781 + ++zExp;
  2782 + }
  2783 + zSig = estimateDiv128To64( aSig, 0, bSig );
  2784 + if ( ( zSig & 0x1FF ) <= 2 ) {
  2785 + mul64To128( bSig, zSig, &term0, &term1 );
  2786 + sub128( aSig, 0, term0, term1, &rem0, &rem1 );
  2787 + while ( (sbits64) rem0 < 0 ) {
  2788 + --zSig;
  2789 + add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
  2790 + }
  2791 + zSig |= ( rem1 != 0 );
  2792 + }
  2793 + return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
  2794 +
  2795 +}
  2796 +
  2797 +/*----------------------------------------------------------------------------
  2798 +| Returns the remainder of the double-precision floating-point value `a'
  2799 +| with respect to the corresponding value `b'. The operation is performed
  2800 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2801 +*----------------------------------------------------------------------------*/
  2802 +
  2803 +float64 float64_rem( float64 a, float64 b STATUS_PARAM )
  2804 +{
  2805 + flag aSign, bSign, zSign;
  2806 + int16 aExp, bExp, expDiff;
  2807 + bits64 aSig, bSig;
  2808 + bits64 q, alternateASig;
  2809 + sbits64 sigMean;
  2810 +
  2811 + aSig = extractFloat64Frac( a );
  2812 + aExp = extractFloat64Exp( a );
  2813 + aSign = extractFloat64Sign( a );
  2814 + bSig = extractFloat64Frac( b );
  2815 + bExp = extractFloat64Exp( b );
  2816 + bSign = extractFloat64Sign( b );
  2817 + if ( aExp == 0x7FF ) {
  2818 + if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
  2819 + return propagateFloat64NaN( a, b STATUS_VAR );
  2820 + }
  2821 + float_raise( float_flag_invalid STATUS_VAR);
  2822 + return float64_default_nan;
  2823 + }
  2824 + if ( bExp == 0x7FF ) {
  2825 + if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
  2826 + return a;
  2827 + }
  2828 + if ( bExp == 0 ) {
  2829 + if ( bSig == 0 ) {
  2830 + float_raise( float_flag_invalid STATUS_VAR);
  2831 + return float64_default_nan;
  2832 + }
  2833 + normalizeFloat64Subnormal( bSig, &bExp, &bSig );
  2834 + }
  2835 + if ( aExp == 0 ) {
  2836 + if ( aSig == 0 ) return a;
  2837 + normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  2838 + }
  2839 + expDiff = aExp - bExp;
  2840 + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
  2841 + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
  2842 + if ( expDiff < 0 ) {
  2843 + if ( expDiff < -1 ) return a;
  2844 + aSig >>= 1;
  2845 + }
  2846 + q = ( bSig <= aSig );
  2847 + if ( q ) aSig -= bSig;
  2848 + expDiff -= 64;
  2849 + while ( 0 < expDiff ) {
  2850 + q = estimateDiv128To64( aSig, 0, bSig );
  2851 + q = ( 2 < q ) ? q - 2 : 0;
  2852 + aSig = - ( ( bSig>>2 ) * q );
  2853 + expDiff -= 62;
  2854 + }
  2855 + expDiff += 64;
  2856 + if ( 0 < expDiff ) {
  2857 + q = estimateDiv128To64( aSig, 0, bSig );
  2858 + q = ( 2 < q ) ? q - 2 : 0;
  2859 + q >>= 64 - expDiff;
  2860 + bSig >>= 2;
  2861 + aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
  2862 + }
  2863 + else {
  2864 + aSig >>= 2;
  2865 + bSig >>= 2;
  2866 + }
  2867 + do {
  2868 + alternateASig = aSig;
  2869 + ++q;
  2870 + aSig -= bSig;
  2871 + } while ( 0 <= (sbits64) aSig );
  2872 + sigMean = aSig + alternateASig;
  2873 + if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
  2874 + aSig = alternateASig;
  2875 + }
  2876 + zSign = ( (sbits64) aSig < 0 );
  2877 + if ( zSign ) aSig = - aSig;
  2878 + return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
  2879 +
  2880 +}
  2881 +
  2882 +/*----------------------------------------------------------------------------
  2883 +| Returns the square root of the double-precision floating-point value `a'.
  2884 +| The operation is performed according to the IEC/IEEE Standard for Binary
  2885 +| Floating-Point Arithmetic.
  2886 +*----------------------------------------------------------------------------*/
  2887 +
  2888 +float64 float64_sqrt( float64 a STATUS_PARAM )
  2889 +{
  2890 + flag aSign;
  2891 + int16 aExp, zExp;
  2892 + bits64 aSig, zSig, doubleZSig;
  2893 + bits64 rem0, rem1, term0, term1;
  2894 +
  2895 + aSig = extractFloat64Frac( a );
  2896 + aExp = extractFloat64Exp( a );
  2897 + aSign = extractFloat64Sign( a );
  2898 + if ( aExp == 0x7FF ) {
  2899 + if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
  2900 + if ( ! aSign ) return a;
  2901 + float_raise( float_flag_invalid STATUS_VAR);
  2902 + return float64_default_nan;
  2903 + }
  2904 + if ( aSign ) {
  2905 + if ( ( aExp | aSig ) == 0 ) return a;
  2906 + float_raise( float_flag_invalid STATUS_VAR);
  2907 + return float64_default_nan;
  2908 + }
  2909 + if ( aExp == 0 ) {
  2910 + if ( aSig == 0 ) return 0;
  2911 + normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  2912 + }
  2913 + zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
  2914 + aSig |= LIT64( 0x0010000000000000 );
  2915 + zSig = estimateSqrt32( aExp, aSig>>21 );
  2916 + aSig <<= 9 - ( aExp & 1 );
  2917 + zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
  2918 + if ( ( zSig & 0x1FF ) <= 5 ) {
  2919 + doubleZSig = zSig<<1;
  2920 + mul64To128( zSig, zSig, &term0, &term1 );
  2921 + sub128( aSig, 0, term0, term1, &rem0, &rem1 );
  2922 + while ( (sbits64) rem0 < 0 ) {
  2923 + --zSig;
  2924 + doubleZSig -= 2;
  2925 + add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
  2926 + }
  2927 + zSig |= ( ( rem0 | rem1 ) != 0 );
  2928 + }
  2929 + return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
  2930 +
  2931 +}
  2932 +
  2933 +/*----------------------------------------------------------------------------
  2934 +| Returns 1 if the double-precision floating-point value `a' is equal to the
  2935 +| corresponding value `b', and 0 otherwise. The comparison is performed
  2936 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2937 +*----------------------------------------------------------------------------*/
  2938 +
  2939 +flag float64_eq( float64 a, float64 b STATUS_PARAM )
  2940 +{
  2941 +
  2942 + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  2943 + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  2944 + ) {
  2945 + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
  2946 + float_raise( float_flag_invalid STATUS_VAR);
  2947 + }
  2948 + return 0;
  2949 + }
  2950 + return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
  2951 +
  2952 +}
  2953 +
  2954 +/*----------------------------------------------------------------------------
  2955 +| Returns 1 if the double-precision floating-point value `a' is less than or
  2956 +| equal to the corresponding value `b', and 0 otherwise. The comparison is
  2957 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  2958 +| Arithmetic.
  2959 +*----------------------------------------------------------------------------*/
  2960 +
  2961 +flag float64_le( float64 a, float64 b STATUS_PARAM )
  2962 +{
  2963 + flag aSign, bSign;
  2964 +
  2965 + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  2966 + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  2967 + ) {
  2968 + float_raise( float_flag_invalid STATUS_VAR);
  2969 + return 0;
  2970 + }
  2971 + aSign = extractFloat64Sign( a );
  2972 + bSign = extractFloat64Sign( b );
  2973 + if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
  2974 + return ( a == b ) || ( aSign ^ ( a < b ) );
  2975 +
  2976 +}
  2977 +
  2978 +/*----------------------------------------------------------------------------
  2979 +| Returns 1 if the double-precision floating-point value `a' is less than
  2980 +| the corresponding value `b', and 0 otherwise. The comparison is performed
  2981 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2982 +*----------------------------------------------------------------------------*/
  2983 +
  2984 +flag float64_lt( float64 a, float64 b STATUS_PARAM )
  2985 +{
  2986 + flag aSign, bSign;
  2987 +
  2988 + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  2989 + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  2990 + ) {
  2991 + float_raise( float_flag_invalid STATUS_VAR);
  2992 + return 0;
  2993 + }
  2994 + aSign = extractFloat64Sign( a );
  2995 + bSign = extractFloat64Sign( b );
  2996 + if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
  2997 + return ( a != b ) && ( aSign ^ ( a < b ) );
  2998 +
  2999 +}
  3000 +
  3001 +/*----------------------------------------------------------------------------
  3002 +| Returns 1 if the double-precision floating-point value `a' is equal to the
  3003 +| corresponding value `b', and 0 otherwise. The invalid exception is raised
  3004 +| if either operand is a NaN. Otherwise, the comparison is performed
  3005 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3006 +*----------------------------------------------------------------------------*/
  3007 +
  3008 +flag float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
  3009 +{
  3010 +
  3011 + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  3012 + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  3013 + ) {
  3014 + float_raise( float_flag_invalid STATUS_VAR);
  3015 + return 0;
  3016 + }
  3017 + return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
  3018 +
  3019 +}
  3020 +
  3021 +/*----------------------------------------------------------------------------
  3022 +| Returns 1 if the double-precision floating-point value `a' is less than or
  3023 +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  3024 +| cause an exception. Otherwise, the comparison is performed according to the
  3025 +| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3026 +*----------------------------------------------------------------------------*/
  3027 +
  3028 +flag float64_le_quiet( float64 a, float64 b STATUS_PARAM )
  3029 +{
  3030 + flag aSign, bSign;
  3031 +
  3032 + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  3033 + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  3034 + ) {
  3035 + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
  3036 + float_raise( float_flag_invalid STATUS_VAR);
  3037 + }
  3038 + return 0;
  3039 + }
  3040 + aSign = extractFloat64Sign( a );
  3041 + bSign = extractFloat64Sign( b );
  3042 + if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
  3043 + return ( a == b ) || ( aSign ^ ( a < b ) );
  3044 +
  3045 +}
  3046 +
  3047 +/*----------------------------------------------------------------------------
  3048 +| Returns 1 if the double-precision floating-point value `a' is less than
  3049 +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  3050 +| exception. Otherwise, the comparison is performed according to the IEC/IEEE
  3051 +| Standard for Binary Floating-Point Arithmetic.
  3052 +*----------------------------------------------------------------------------*/
  3053 +
  3054 +flag float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
  3055 +{
  3056 + flag aSign, bSign;
  3057 +
  3058 + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  3059 + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  3060 + ) {
  3061 + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
  3062 + float_raise( float_flag_invalid STATUS_VAR);
  3063 + }
  3064 + return 0;
  3065 + }
  3066 + aSign = extractFloat64Sign( a );
  3067 + bSign = extractFloat64Sign( b );
  3068 + if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
  3069 + return ( a != b ) && ( aSign ^ ( a < b ) );
  3070 +
  3071 +}
  3072 +
  3073 +#ifdef FLOATX80
  3074 +
  3075 +/*----------------------------------------------------------------------------
  3076 +| Returns the result of converting the extended double-precision floating-
  3077 +| point value `a' to the 32-bit two's complement integer format. The
  3078 +| conversion is performed according to the IEC/IEEE Standard for Binary
  3079 +| Floating-Point Arithmetic---which means in particular that the conversion
  3080 +| is rounded according to the current rounding mode. If `a' is a NaN, the
  3081 +| largest positive integer is returned. Otherwise, if the conversion
  3082 +| overflows, the largest integer with the same sign as `a' is returned.
  3083 +*----------------------------------------------------------------------------*/
  3084 +
  3085 +int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
  3086 +{
  3087 + flag aSign;
  3088 + int32 aExp, shiftCount;
  3089 + bits64 aSig;
  3090 +
  3091 + aSig = extractFloatx80Frac( a );
  3092 + aExp = extractFloatx80Exp( a );
  3093 + aSign = extractFloatx80Sign( a );
  3094 + if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
  3095 + shiftCount = 0x4037 - aExp;
  3096 + if ( shiftCount <= 0 ) shiftCount = 1;
  3097 + shift64RightJamming( aSig, shiftCount, &aSig );
  3098 + return roundAndPackInt32( aSign, aSig STATUS_VAR );
  3099 +
  3100 +}
  3101 +
  3102 +/*----------------------------------------------------------------------------
  3103 +| Returns the result of converting the extended double-precision floating-
  3104 +| point value `a' to the 32-bit two's complement integer format. The
  3105 +| conversion is performed according to the IEC/IEEE Standard for Binary
  3106 +| Floating-Point Arithmetic, except that the conversion is always rounded
  3107 +| toward zero. If `a' is a NaN, the largest positive integer is returned.
  3108 +| Otherwise, if the conversion overflows, the largest integer with the same
  3109 +| sign as `a' is returned.
  3110 +*----------------------------------------------------------------------------*/
  3111 +
  3112 +int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
  3113 +{
  3114 + flag aSign;
  3115 + int32 aExp, shiftCount;
  3116 + bits64 aSig, savedASig;
  3117 + int32 z;
  3118 +
  3119 + aSig = extractFloatx80Frac( a );
  3120 + aExp = extractFloatx80Exp( a );
  3121 + aSign = extractFloatx80Sign( a );
  3122 + if ( 0x401E < aExp ) {
  3123 + if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
  3124 + goto invalid;
  3125 + }
  3126 + else if ( aExp < 0x3FFF ) {
  3127 + if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
  3128 + return 0;
  3129 + }
  3130 + shiftCount = 0x403E - aExp;
  3131 + savedASig = aSig;
  3132 + aSig >>= shiftCount;
  3133 + z = aSig;
  3134 + if ( aSign ) z = - z;
  3135 + if ( ( z < 0 ) ^ aSign ) {
  3136 + invalid:
  3137 + float_raise( float_flag_invalid STATUS_VAR);
  3138 + return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
  3139 + }
  3140 + if ( ( aSig<<shiftCount ) != savedASig ) {
  3141 + STATUS(float_exception_flags) |= float_flag_inexact;
  3142 + }
  3143 + return z;
  3144 +
  3145 +}
  3146 +
  3147 +/*----------------------------------------------------------------------------
  3148 +| Returns the result of converting the extended double-precision floating-
  3149 +| point value `a' to the 64-bit two's complement integer format. The
  3150 +| conversion is performed according to the IEC/IEEE Standard for Binary
  3151 +| Floating-Point Arithmetic---which means in particular that the conversion
  3152 +| is rounded according to the current rounding mode. If `a' is a NaN,
  3153 +| the largest positive integer is returned. Otherwise, if the conversion
  3154 +| overflows, the largest integer with the same sign as `a' is returned.
  3155 +*----------------------------------------------------------------------------*/
  3156 +
  3157 +int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
  3158 +{
  3159 + flag aSign;
  3160 + int32 aExp, shiftCount;
  3161 + bits64 aSig, aSigExtra;
  3162 +
  3163 + aSig = extractFloatx80Frac( a );
  3164 + aExp = extractFloatx80Exp( a );
  3165 + aSign = extractFloatx80Sign( a );
  3166 + shiftCount = 0x403E - aExp;
  3167 + if ( shiftCount <= 0 ) {
  3168 + if ( shiftCount ) {
  3169 + float_raise( float_flag_invalid STATUS_VAR);
  3170 + if ( ! aSign
  3171 + || ( ( aExp == 0x7FFF )
  3172 + && ( aSig != LIT64( 0x8000000000000000 ) ) )
  3173 + ) {
  3174 + return LIT64( 0x7FFFFFFFFFFFFFFF );
  3175 + }
  3176 + return (sbits64) LIT64( 0x8000000000000000 );
  3177 + }
  3178 + aSigExtra = 0;
  3179 + }
  3180 + else {
  3181 + shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
  3182 + }
  3183 + return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
  3184 +
  3185 +}
  3186 +
  3187 +/*----------------------------------------------------------------------------
  3188 +| Returns the result of converting the extended double-precision floating-
  3189 +| point value `a' to the 64-bit two's complement integer format. The
  3190 +| conversion is performed according to the IEC/IEEE Standard for Binary
  3191 +| Floating-Point Arithmetic, except that the conversion is always rounded
  3192 +| toward zero. If `a' is a NaN, the largest positive integer is returned.
  3193 +| Otherwise, if the conversion overflows, the largest integer with the same
  3194 +| sign as `a' is returned.
  3195 +*----------------------------------------------------------------------------*/
  3196 +
  3197 +int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
  3198 +{
  3199 + flag aSign;
  3200 + int32 aExp, shiftCount;
  3201 + bits64 aSig;
  3202 + int64 z;
  3203 +
  3204 + aSig = extractFloatx80Frac( a );
  3205 + aExp = extractFloatx80Exp( a );
  3206 + aSign = extractFloatx80Sign( a );
  3207 + shiftCount = aExp - 0x403E;
  3208 + if ( 0 <= shiftCount ) {
  3209 + aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
  3210 + if ( ( a.high != 0xC03E ) || aSig ) {
  3211 + float_raise( float_flag_invalid STATUS_VAR);
  3212 + if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
  3213 + return LIT64( 0x7FFFFFFFFFFFFFFF );
  3214 + }
  3215 + }
  3216 + return (sbits64) LIT64( 0x8000000000000000 );
  3217 + }
  3218 + else if ( aExp < 0x3FFF ) {
  3219 + if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
  3220 + return 0;
  3221 + }
  3222 + z = aSig>>( - shiftCount );
  3223 + if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
  3224 + STATUS(float_exception_flags) |= float_flag_inexact;
  3225 + }
  3226 + if ( aSign ) z = - z;
  3227 + return z;
  3228 +
  3229 +}
  3230 +
  3231 +/*----------------------------------------------------------------------------
  3232 +| Returns the result of converting the extended double-precision floating-
  3233 +| point value `a' to the single-precision floating-point format. The
  3234 +| conversion is performed according to the IEC/IEEE Standard for Binary
  3235 +| Floating-Point Arithmetic.
  3236 +*----------------------------------------------------------------------------*/
  3237 +
  3238 +float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
  3239 +{
  3240 + flag aSign;
  3241 + int32 aExp;
  3242 + bits64 aSig;
  3243 +
  3244 + aSig = extractFloatx80Frac( a );
  3245 + aExp = extractFloatx80Exp( a );
  3246 + aSign = extractFloatx80Sign( a );
  3247 + if ( aExp == 0x7FFF ) {
  3248 + if ( (bits64) ( aSig<<1 ) ) {
  3249 + return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
  3250 + }
  3251 + return packFloat32( aSign, 0xFF, 0 );
  3252 + }
  3253 + shift64RightJamming( aSig, 33, &aSig );
  3254 + if ( aExp || aSig ) aExp -= 0x3F81;
  3255 + return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
  3256 +
  3257 +}
  3258 +
  3259 +/*----------------------------------------------------------------------------
  3260 +| Returns the result of converting the extended double-precision floating-
  3261 +| point value `a' to the double-precision floating-point format. The
  3262 +| conversion is performed according to the IEC/IEEE Standard for Binary
  3263 +| Floating-Point Arithmetic.
  3264 +*----------------------------------------------------------------------------*/
  3265 +
  3266 +float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
  3267 +{
  3268 + flag aSign;
  3269 + int32 aExp;
  3270 + bits64 aSig, zSig;
  3271 +
  3272 + aSig = extractFloatx80Frac( a );
  3273 + aExp = extractFloatx80Exp( a );
  3274 + aSign = extractFloatx80Sign( a );
  3275 + if ( aExp == 0x7FFF ) {
  3276 + if ( (bits64) ( aSig<<1 ) ) {
  3277 + return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
  3278 + }
  3279 + return packFloat64( aSign, 0x7FF, 0 );
  3280 + }
  3281 + shift64RightJamming( aSig, 1, &zSig );
  3282 + if ( aExp || aSig ) aExp -= 0x3C01;
  3283 + return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
  3284 +
  3285 +}
  3286 +
  3287 +#ifdef FLOAT128
  3288 +
  3289 +/*----------------------------------------------------------------------------
  3290 +| Returns the result of converting the extended double-precision floating-
  3291 +| point value `a' to the quadruple-precision floating-point format. The
  3292 +| conversion is performed according to the IEC/IEEE Standard for Binary
  3293 +| Floating-Point Arithmetic.
  3294 +*----------------------------------------------------------------------------*/
  3295 +
  3296 +float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
  3297 +{
  3298 + flag aSign;
  3299 + int16 aExp;
  3300 + bits64 aSig, zSig0, zSig1;
  3301 +
  3302 + aSig = extractFloatx80Frac( a );
  3303 + aExp = extractFloatx80Exp( a );
  3304 + aSign = extractFloatx80Sign( a );
  3305 + if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
  3306 + return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
  3307 + }
  3308 + shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
  3309 + return packFloat128( aSign, aExp, zSig0, zSig1 );
  3310 +
  3311 +}
  3312 +
  3313 +#endif
  3314 +
  3315 +/*----------------------------------------------------------------------------
  3316 +| Rounds the extended double-precision floating-point value `a' to an integer,
  3317 +| and returns the result as an extended quadruple-precision floating-point
  3318 +| value. The operation is performed according to the IEC/IEEE Standard for
  3319 +| Binary Floating-Point Arithmetic.
  3320 +*----------------------------------------------------------------------------*/
  3321 +
  3322 +floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
  3323 +{
  3324 + flag aSign;
  3325 + int32 aExp;
  3326 + bits64 lastBitMask, roundBitsMask;
  3327 + int8 roundingMode;
  3328 + floatx80 z;
  3329 +
  3330 + aExp = extractFloatx80Exp( a );
  3331 + if ( 0x403E <= aExp ) {
  3332 + if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
  3333 + return propagateFloatx80NaN( a, a STATUS_VAR );
  3334 + }
  3335 + return a;
  3336 + }
  3337 + if ( aExp < 0x3FFF ) {
  3338 + if ( ( aExp == 0 )
  3339 + && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
  3340 + return a;
  3341 + }
  3342 + STATUS(float_exception_flags) |= float_flag_inexact;
  3343 + aSign = extractFloatx80Sign( a );
  3344 + switch ( STATUS(float_rounding_mode) ) {
  3345 + case float_round_nearest_even:
  3346 + if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
  3347 + ) {
  3348 + return
  3349 + packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
  3350 + }
  3351 + break;
  3352 + case float_round_down:
  3353 + return
  3354 + aSign ?
  3355 + packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
  3356 + : packFloatx80( 0, 0, 0 );
  3357 + case float_round_up:
  3358 + return
  3359 + aSign ? packFloatx80( 1, 0, 0 )
  3360 + : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
  3361 + }
  3362 + return packFloatx80( aSign, 0, 0 );
  3363 + }
  3364 + lastBitMask = 1;
  3365 + lastBitMask <<= 0x403E - aExp;
  3366 + roundBitsMask = lastBitMask - 1;
  3367 + z = a;
  3368 + roundingMode = STATUS(float_rounding_mode);
  3369 + if ( roundingMode == float_round_nearest_even ) {
  3370 + z.low += lastBitMask>>1;
  3371 + if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
  3372 + }
  3373 + else if ( roundingMode != float_round_to_zero ) {
  3374 + if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
  3375 + z.low += roundBitsMask;
  3376 + }
  3377 + }
  3378 + z.low &= ~ roundBitsMask;
  3379 + if ( z.low == 0 ) {
  3380 + ++z.high;
  3381 + z.low = LIT64( 0x8000000000000000 );
  3382 + }
  3383 + if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
  3384 + return z;
  3385 +
  3386 +}
  3387 +
  3388 +/*----------------------------------------------------------------------------
  3389 +| Returns the result of adding the absolute values of the extended double-
  3390 +| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  3391 +| negated before being returned. `zSign' is ignored if the result is a NaN.
  3392 +| The addition is performed according to the IEC/IEEE Standard for Binary
  3393 +| Floating-Point Arithmetic.
  3394 +*----------------------------------------------------------------------------*/
  3395 +
  3396 +static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
  3397 +{
  3398 + int32 aExp, bExp, zExp;
  3399 + bits64 aSig, bSig, zSig0, zSig1;
  3400 + int32 expDiff;
  3401 +
  3402 + aSig = extractFloatx80Frac( a );
  3403 + aExp = extractFloatx80Exp( a );
  3404 + bSig = extractFloatx80Frac( b );
  3405 + bExp = extractFloatx80Exp( b );
  3406 + expDiff = aExp - bExp;
  3407 + if ( 0 < expDiff ) {
  3408 + if ( aExp == 0x7FFF ) {
  3409 + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
  3410 + return a;
  3411 + }
  3412 + if ( bExp == 0 ) --expDiff;
  3413 + shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
  3414 + zExp = aExp;
  3415 + }
  3416 + else if ( expDiff < 0 ) {
  3417 + if ( bExp == 0x7FFF ) {
  3418 + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
  3419 + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  3420 + }
  3421 + if ( aExp == 0 ) ++expDiff;
  3422 + shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
  3423 + zExp = bExp;
  3424 + }
  3425 + else {
  3426 + if ( aExp == 0x7FFF ) {
  3427 + if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
  3428 + return propagateFloatx80NaN( a, b STATUS_VAR );
  3429 + }
  3430 + return a;
  3431 + }
  3432 + zSig1 = 0;
  3433 + zSig0 = aSig + bSig;
  3434 + if ( aExp == 0 ) {
  3435 + normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
  3436 + goto roundAndPack;
  3437 + }
  3438 + zExp = aExp;
  3439 + goto shiftRight1;
  3440 + }
  3441 + zSig0 = aSig + bSig;
  3442 + if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
  3443 + shiftRight1:
  3444 + shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
  3445 + zSig0 |= LIT64( 0x8000000000000000 );
  3446 + ++zExp;
  3447 + roundAndPack:
  3448 + return
  3449 + roundAndPackFloatx80(
  3450 + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
  3451 +
  3452 +}
  3453 +
  3454 +/*----------------------------------------------------------------------------
  3455 +| Returns the result of subtracting the absolute values of the extended
  3456 +| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  3457 +| difference is negated before being returned. `zSign' is ignored if the
  3458 +| result is a NaN. The subtraction is performed according to the IEC/IEEE
  3459 +| Standard for Binary Floating-Point Arithmetic.
  3460 +*----------------------------------------------------------------------------*/
  3461 +
  3462 +static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
  3463 +{
  3464 + int32 aExp, bExp, zExp;
  3465 + bits64 aSig, bSig, zSig0, zSig1;
  3466 + int32 expDiff;
  3467 + floatx80 z;
  3468 +
  3469 + aSig = extractFloatx80Frac( a );
  3470 + aExp = extractFloatx80Exp( a );
  3471 + bSig = extractFloatx80Frac( b );
  3472 + bExp = extractFloatx80Exp( b );
  3473 + expDiff = aExp - bExp;
  3474 + if ( 0 < expDiff ) goto aExpBigger;
  3475 + if ( expDiff < 0 ) goto bExpBigger;
  3476 + if ( aExp == 0x7FFF ) {
  3477 + if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
  3478 + return propagateFloatx80NaN( a, b STATUS_VAR );
  3479 + }
  3480 + float_raise( float_flag_invalid STATUS_VAR);
  3481 + z.low = floatx80_default_nan_low;
  3482 + z.high = floatx80_default_nan_high;
  3483 + return z;
  3484 + }
  3485 + if ( aExp == 0 ) {
  3486 + aExp = 1;
  3487 + bExp = 1;
  3488 + }
  3489 + zSig1 = 0;
  3490 + if ( bSig < aSig ) goto aBigger;
  3491 + if ( aSig < bSig ) goto bBigger;
  3492 + return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
  3493 + bExpBigger:
  3494 + if ( bExp == 0x7FFF ) {
  3495 + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
  3496 + return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
  3497 + }
  3498 + if ( aExp == 0 ) ++expDiff;
  3499 + shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
  3500 + bBigger:
  3501 + sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
  3502 + zExp = bExp;
  3503 + zSign ^= 1;
  3504 + goto normalizeRoundAndPack;
  3505 + aExpBigger:
  3506 + if ( aExp == 0x7FFF ) {
  3507 + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
  3508 + return a;
  3509 + }
  3510 + if ( bExp == 0 ) --expDiff;
  3511 + shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
  3512 + aBigger:
  3513 + sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
  3514 + zExp = aExp;
  3515 + normalizeRoundAndPack:
  3516 + return
  3517 + normalizeRoundAndPackFloatx80(
  3518 + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
  3519 +
  3520 +}
  3521 +
  3522 +/*----------------------------------------------------------------------------
  3523 +| Returns the result of adding the extended double-precision floating-point
  3524 +| values `a' and `b'. The operation is performed according to the IEC/IEEE
  3525 +| Standard for Binary Floating-Point Arithmetic.
  3526 +*----------------------------------------------------------------------------*/
  3527 +
  3528 +floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
  3529 +{
  3530 + flag aSign, bSign;
  3531 +
  3532 + aSign = extractFloatx80Sign( a );
  3533 + bSign = extractFloatx80Sign( b );
  3534 + if ( aSign == bSign ) {
  3535 + return addFloatx80Sigs( a, b, aSign STATUS_VAR );
  3536 + }
  3537 + else {
  3538 + return subFloatx80Sigs( a, b, aSign STATUS_VAR );
  3539 + }
  3540 +
  3541 +}
  3542 +
  3543 +/*----------------------------------------------------------------------------
  3544 +| Returns the result of subtracting the extended double-precision floating-
  3545 +| point values `a' and `b'. The operation is performed according to the
  3546 +| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3547 +*----------------------------------------------------------------------------*/
  3548 +
  3549 +floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
  3550 +{
  3551 + flag aSign, bSign;
  3552 +
  3553 + aSign = extractFloatx80Sign( a );
  3554 + bSign = extractFloatx80Sign( b );
  3555 + if ( aSign == bSign ) {
  3556 + return subFloatx80Sigs( a, b, aSign STATUS_VAR );
  3557 + }
  3558 + else {
  3559 + return addFloatx80Sigs( a, b, aSign STATUS_VAR );
  3560 + }
  3561 +
  3562 +}
  3563 +
  3564 +/*----------------------------------------------------------------------------
  3565 +| Returns the result of multiplying the extended double-precision floating-
  3566 +| point values `a' and `b'. The operation is performed according to the
  3567 +| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3568 +*----------------------------------------------------------------------------*/
  3569 +
  3570 +floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
  3571 +{
  3572 + flag aSign, bSign, zSign;
  3573 + int32 aExp, bExp, zExp;
  3574 + bits64 aSig, bSig, zSig0, zSig1;
  3575 + floatx80 z;
  3576 +
  3577 + aSig = extractFloatx80Frac( a );
  3578 + aExp = extractFloatx80Exp( a );
  3579 + aSign = extractFloatx80Sign( a );
  3580 + bSig = extractFloatx80Frac( b );
  3581 + bExp = extractFloatx80Exp( b );
  3582 + bSign = extractFloatx80Sign( b );
  3583 + zSign = aSign ^ bSign;
  3584 + if ( aExp == 0x7FFF ) {
  3585 + if ( (bits64) ( aSig<<1 )
  3586 + || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
  3587 + return propagateFloatx80NaN( a, b STATUS_VAR );
  3588 + }
  3589 + if ( ( bExp | bSig ) == 0 ) goto invalid;
  3590 + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  3591 + }
  3592 + if ( bExp == 0x7FFF ) {
  3593 + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
  3594 + if ( ( aExp | aSig ) == 0 ) {
  3595 + invalid:
  3596 + float_raise( float_flag_invalid STATUS_VAR);
  3597 + z.low = floatx80_default_nan_low;
  3598 + z.high = floatx80_default_nan_high;
  3599 + return z;
  3600 + }
  3601 + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  3602 + }
  3603 + if ( aExp == 0 ) {
  3604 + if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
  3605 + normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
  3606 + }
  3607 + if ( bExp == 0 ) {
  3608 + if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
  3609 + normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  3610 + }
  3611 + zExp = aExp + bExp - 0x3FFE;
  3612 + mul64To128( aSig, bSig, &zSig0, &zSig1 );
  3613 + if ( 0 < (sbits64) zSig0 ) {
  3614 + shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
  3615 + --zExp;
  3616 + }
  3617 + return
  3618 + roundAndPackFloatx80(
  3619 + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
  3620 +
  3621 +}
  3622 +
  3623 +/*----------------------------------------------------------------------------
  3624 +| Returns the result of dividing the extended double-precision floating-point
  3625 +| value `a' by the corresponding value `b'. The operation is performed
  3626 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3627 +*----------------------------------------------------------------------------*/
  3628 +
  3629 +floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
  3630 +{
  3631 + flag aSign, bSign, zSign;
  3632 + int32 aExp, bExp, zExp;
  3633 + bits64 aSig, bSig, zSig0, zSig1;
  3634 + bits64 rem0, rem1, rem2, term0, term1, term2;
  3635 + floatx80 z;
  3636 +
  3637 + aSig = extractFloatx80Frac( a );
  3638 + aExp = extractFloatx80Exp( a );
  3639 + aSign = extractFloatx80Sign( a );
  3640 + bSig = extractFloatx80Frac( b );
  3641 + bExp = extractFloatx80Exp( b );
  3642 + bSign = extractFloatx80Sign( b );
  3643 + zSign = aSign ^ bSign;
  3644 + if ( aExp == 0x7FFF ) {
  3645 + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
  3646 + if ( bExp == 0x7FFF ) {
  3647 + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
  3648 + goto invalid;
  3649 + }
  3650 + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  3651 + }
  3652 + if ( bExp == 0x7FFF ) {
  3653 + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
  3654 + return packFloatx80( zSign, 0, 0 );
  3655 + }
  3656 + if ( bExp == 0 ) {
  3657 + if ( bSig == 0 ) {
  3658 + if ( ( aExp | aSig ) == 0 ) {
  3659 + invalid:
  3660 + float_raise( float_flag_invalid STATUS_VAR);
  3661 + z.low = floatx80_default_nan_low;
  3662 + z.high = floatx80_default_nan_high;
  3663 + return z;
  3664 + }
  3665 + float_raise( float_flag_divbyzero STATUS_VAR);
  3666 + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  3667 + }
  3668 + normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  3669 + }
  3670 + if ( aExp == 0 ) {
  3671 + if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
  3672 + normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
  3673 + }
  3674 + zExp = aExp - bExp + 0x3FFE;
  3675 + rem1 = 0;
  3676 + if ( bSig <= aSig ) {
  3677 + shift128Right( aSig, 0, 1, &aSig, &rem1 );
  3678 + ++zExp;
  3679 + }
  3680 + zSig0 = estimateDiv128To64( aSig, rem1, bSig );
  3681 + mul64To128( bSig, zSig0, &term0, &term1 );
  3682 + sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
  3683 + while ( (sbits64) rem0 < 0 ) {
  3684 + --zSig0;
  3685 + add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
  3686 + }
  3687 + zSig1 = estimateDiv128To64( rem1, 0, bSig );
  3688 + if ( (bits64) ( zSig1<<1 ) <= 8 ) {
  3689 + mul64To128( bSig, zSig1, &term1, &term2 );
  3690 + sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  3691 + while ( (sbits64) rem1 < 0 ) {
  3692 + --zSig1;
  3693 + add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
  3694 + }
  3695 + zSig1 |= ( ( rem1 | rem2 ) != 0 );
  3696 + }
  3697 + return
  3698 + roundAndPackFloatx80(
  3699 + STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
  3700 +
  3701 +}
  3702 +
  3703 +/*----------------------------------------------------------------------------
  3704 +| Returns the remainder of the extended double-precision floating-point value
  3705 +| `a' with respect to the corresponding value `b'. The operation is performed
  3706 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3707 +*----------------------------------------------------------------------------*/
  3708 +
  3709 +floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
  3710 +{
  3711 + flag aSign, bSign, zSign;
  3712 + int32 aExp, bExp, expDiff;
  3713 + bits64 aSig0, aSig1, bSig;
  3714 + bits64 q, term0, term1, alternateASig0, alternateASig1;
  3715 + floatx80 z;
  3716 +
  3717 + aSig0 = extractFloatx80Frac( a );
  3718 + aExp = extractFloatx80Exp( a );
  3719 + aSign = extractFloatx80Sign( a );
  3720 + bSig = extractFloatx80Frac( b );
  3721 + bExp = extractFloatx80Exp( b );
  3722 + bSign = extractFloatx80Sign( b );
  3723 + if ( aExp == 0x7FFF ) {
  3724 + if ( (bits64) ( aSig0<<1 )
  3725 + || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
  3726 + return propagateFloatx80NaN( a, b STATUS_VAR );
  3727 + }
  3728 + goto invalid;
  3729 + }
  3730 + if ( bExp == 0x7FFF ) {
  3731 + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
  3732 + return a;
  3733 + }
  3734 + if ( bExp == 0 ) {
  3735 + if ( bSig == 0 ) {
  3736 + invalid:
  3737 + float_raise( float_flag_invalid STATUS_VAR);
  3738 + z.low = floatx80_default_nan_low;
  3739 + z.high = floatx80_default_nan_high;
  3740 + return z;
  3741 + }
  3742 + normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  3743 + }
  3744 + if ( aExp == 0 ) {
  3745 + if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
  3746 + normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
  3747 + }
  3748 + bSig |= LIT64( 0x8000000000000000 );
  3749 + zSign = aSign;
  3750 + expDiff = aExp - bExp;
  3751 + aSig1 = 0;
  3752 + if ( expDiff < 0 ) {
  3753 + if ( expDiff < -1 ) return a;
  3754 + shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
  3755 + expDiff = 0;
  3756 + }
  3757 + q = ( bSig <= aSig0 );
  3758 + if ( q ) aSig0 -= bSig;
  3759 + expDiff -= 64;
  3760 + while ( 0 < expDiff ) {
  3761 + q = estimateDiv128To64( aSig0, aSig1, bSig );
  3762 + q = ( 2 < q ) ? q - 2 : 0;
  3763 + mul64To128( bSig, q, &term0, &term1 );
  3764 + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  3765 + shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
  3766 + expDiff -= 62;
  3767 + }
  3768 + expDiff += 64;
  3769 + if ( 0 < expDiff ) {
  3770 + q = estimateDiv128To64( aSig0, aSig1, bSig );
  3771 + q = ( 2 < q ) ? q - 2 : 0;
  3772 + q >>= 64 - expDiff;
  3773 + mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
  3774 + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  3775 + shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
  3776 + while ( le128( term0, term1, aSig0, aSig1 ) ) {
  3777 + ++q;
  3778 + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  3779 + }
  3780 + }
  3781 + else {
  3782 + term1 = 0;
  3783 + term0 = bSig;
  3784 + }
  3785 + sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
  3786 + if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  3787 + || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  3788 + && ( q & 1 ) )
  3789 + ) {
  3790 + aSig0 = alternateASig0;
  3791 + aSig1 = alternateASig1;
  3792 + zSign = ! zSign;
  3793 + }
  3794 + return
  3795 + normalizeRoundAndPackFloatx80(
  3796 + 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
  3797 +
  3798 +}
  3799 +
  3800 +/*----------------------------------------------------------------------------
  3801 +| Returns the square root of the extended double-precision floating-point
  3802 +| value `a'. The operation is performed according to the IEC/IEEE Standard
  3803 +| for Binary Floating-Point Arithmetic.
  3804 +*----------------------------------------------------------------------------*/
  3805 +
  3806 +floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
  3807 +{
  3808 + flag aSign;
  3809 + int32 aExp, zExp;
  3810 + bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
  3811 + bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
  3812 + floatx80 z;
  3813 +
  3814 + aSig0 = extractFloatx80Frac( a );
  3815 + aExp = extractFloatx80Exp( a );
  3816 + aSign = extractFloatx80Sign( a );
  3817 + if ( aExp == 0x7FFF ) {
  3818 + if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
  3819 + if ( ! aSign ) return a;
  3820 + goto invalid;
  3821 + }
  3822 + if ( aSign ) {
  3823 + if ( ( aExp | aSig0 ) == 0 ) return a;
  3824 + invalid:
  3825 + float_raise( float_flag_invalid STATUS_VAR);
  3826 + z.low = floatx80_default_nan_low;
  3827 + z.high = floatx80_default_nan_high;
  3828 + return z;
  3829 + }
  3830 + if ( aExp == 0 ) {
  3831 + if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
  3832 + normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
  3833 + }
  3834 + zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
  3835 + zSig0 = estimateSqrt32( aExp, aSig0>>32 );
  3836 + shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
  3837 + zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
  3838 + doubleZSig0 = zSig0<<1;
  3839 + mul64To128( zSig0, zSig0, &term0, &term1 );
  3840 + sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
  3841 + while ( (sbits64) rem0 < 0 ) {
  3842 + --zSig0;
  3843 + doubleZSig0 -= 2;
  3844 + add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
  3845 + }
  3846 + zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
  3847 + if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
  3848 + if ( zSig1 == 0 ) zSig1 = 1;
  3849 + mul64To128( doubleZSig0, zSig1, &term1, &term2 );
  3850 + sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  3851 + mul64To128( zSig1, zSig1, &term2, &term3 );
  3852 + sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
  3853 + while ( (sbits64) rem1 < 0 ) {
  3854 + --zSig1;
  3855 + shortShift128Left( 0, zSig1, 1, &term2, &term3 );
  3856 + term3 |= 1;
  3857 + term2 |= doubleZSig0;
  3858 + add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
  3859 + }
  3860 + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
  3861 + }
  3862 + shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
  3863 + zSig0 |= doubleZSig0;
  3864 + return
  3865 + roundAndPackFloatx80(
  3866 + STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
  3867 +
  3868 +}
  3869 +
  3870 +/*----------------------------------------------------------------------------
  3871 +| Returns 1 if the extended double-precision floating-point value `a' is
  3872 +| equal to the corresponding value `b', and 0 otherwise. The comparison is
  3873 +| performed according to the IEC/IEEE Standard for Binary Floating-Point
  3874 +| Arithmetic.
  3875 +*----------------------------------------------------------------------------*/
  3876 +
  3877 +flag floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
  3878 +{
  3879 +
  3880 + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
  3881 + && (bits64) ( extractFloatx80Frac( a )<<1 ) )
  3882 + || ( ( extractFloatx80Exp( b ) == 0x7FFF )
  3883 + && (bits64) ( extractFloatx80Frac( b )<<1 ) )
  3884 + ) {
  3885 + if ( floatx80_is_signaling_nan( a )
  3886 + || floatx80_is_signaling_nan( b ) ) {
  3887 + float_raise( float_flag_invalid STATUS_VAR);
  3888 + }
  3889 + return 0;
  3890 + }
  3891 + return
  3892 + ( a.low == b.low )
  3893 + && ( ( a.high == b.high )
  3894 + || ( ( a.low == 0 )
  3895 + && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
  3896 + );
  3897 +
  3898 +}
  3899 +
  3900 +/*----------------------------------------------------------------------------
  3901 +| Returns 1 if the extended double-precision floating-point value `a' is
  3902 +| less than or equal to the corresponding value `b', and 0 otherwise. The
  3903 +| comparison is performed according to the IEC/IEEE Standard for Binary
  3904 +| Floating-Point Arithmetic.
  3905 +*----------------------------------------------------------------------------*/
  3906 +
  3907 +flag floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
  3908 +{
  3909 + flag aSign, bSign;
  3910 +
  3911 + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
  3912 + && (bits64) ( extractFloatx80Frac( a )<<1 ) )
  3913 + || ( ( extractFloatx80Exp( b ) == 0x7FFF )
  3914 + && (bits64) ( extractFloatx80Frac( b )<<1 ) )
  3915 + ) {
  3916 + float_raise( float_flag_invalid STATUS_VAR);
  3917 + return 0;
  3918 + }
  3919 + aSign = extractFloatx80Sign( a );
  3920 + bSign = extractFloatx80Sign( b );
  3921 + if ( aSign != bSign ) {
  3922 + return
  3923 + aSign
  3924 + || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  3925 + == 0 );
  3926 + }
  3927 + return
  3928 + aSign ? le128( b.high, b.low, a.high, a.low )
  3929 + : le128( a.high, a.low, b.high, b.low );
  3930 +
  3931 +}
  3932 +
  3933 +/*----------------------------------------------------------------------------
  3934 +| Returns 1 if the extended double-precision floating-point value `a' is
  3935 +| less than the corresponding value `b', and 0 otherwise. The comparison
  3936 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3937 +| Arithmetic.
  3938 +*----------------------------------------------------------------------------*/
  3939 +
  3940 +flag floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
  3941 +{
  3942 + flag aSign, bSign;
  3943 +
  3944 + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
  3945 + && (bits64) ( extractFloatx80Frac( a )<<1 ) )
  3946 + || ( ( extractFloatx80Exp( b ) == 0x7FFF )
  3947 + && (bits64) ( extractFloatx80Frac( b )<<1 ) )
  3948 + ) {
  3949 + float_raise( float_flag_invalid STATUS_VAR);
  3950 + return 0;
  3951 + }
  3952 + aSign = extractFloatx80Sign( a );
  3953 + bSign = extractFloatx80Sign( b );
  3954 + if ( aSign != bSign ) {
  3955 + return
  3956 + aSign
  3957 + && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  3958 + != 0 );
  3959 + }
  3960 + return
  3961 + aSign ? lt128( b.high, b.low, a.high, a.low )
  3962 + : lt128( a.high, a.low, b.high, b.low );
  3963 +
  3964 +}
  3965 +
  3966 +/*----------------------------------------------------------------------------
  3967 +| Returns 1 if the extended double-precision floating-point value `a' is equal
  3968 +| to the corresponding value `b', and 0 otherwise. The invalid exception is
  3969 +| raised if either operand is a NaN. Otherwise, the comparison is performed
  3970 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3971 +*----------------------------------------------------------------------------*/
  3972 +
  3973 +flag floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
  3974 +{
  3975 +
  3976 + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
  3977 + && (bits64) ( extractFloatx80Frac( a )<<1 ) )
  3978 + || ( ( extractFloatx80Exp( b ) == 0x7FFF )
  3979 + && (bits64) ( extractFloatx80Frac( b )<<1 ) )
  3980 + ) {
  3981 + float_raise( float_flag_invalid STATUS_VAR);
  3982 + return 0;
  3983 + }
  3984 + return
  3985 + ( a.low == b.low )
  3986 + && ( ( a.high == b.high )
  3987 + || ( ( a.low == 0 )
  3988 + && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
  3989 + );
  3990 +
  3991 +}
  3992 +
  3993 +/*----------------------------------------------------------------------------
  3994 +| Returns 1 if the extended double-precision floating-point value `a' is less
  3995 +| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  3996 +| do not cause an exception. Otherwise, the comparison is performed according
  3997 +| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3998 +*----------------------------------------------------------------------------*/
  3999 +
  4000 +flag floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
  4001 +{
  4002 + flag aSign, bSign;
  4003 +
  4004 + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
  4005 + && (bits64) ( extractFloatx80Frac( a )<<1 ) )
  4006 + || ( ( extractFloatx80Exp( b ) == 0x7FFF )
  4007 + && (bits64) ( extractFloatx80Frac( b )<<1 ) )
  4008 + ) {
  4009 + if ( floatx80_is_signaling_nan( a )
  4010 + || floatx80_is_signaling_nan( b ) ) {
  4011 + float_raise( float_flag_invalid STATUS_VAR);
  4012 + }
  4013 + return 0;
  4014 + }
  4015 + aSign = extractFloatx80Sign( a );
  4016 + bSign = extractFloatx80Sign( b );
  4017 + if ( aSign != bSign ) {
  4018 + return
  4019 + aSign
  4020 + || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  4021 + == 0 );
  4022 + }
  4023 + return
  4024 + aSign ? le128( b.high, b.low, a.high, a.low )
  4025 + : le128( a.high, a.low, b.high, b.low );
  4026 +
  4027 +}
  4028 +
  4029 +/*----------------------------------------------------------------------------
  4030 +| Returns 1 if the extended double-precision floating-point value `a' is less
  4031 +| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  4032 +| an exception. Otherwise, the comparison is performed according to the
  4033 +| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4034 +*----------------------------------------------------------------------------*/
  4035 +
  4036 +flag floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
  4037 +{
  4038 + flag aSign, bSign;
  4039 +
  4040 + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
  4041 + && (bits64) ( extractFloatx80Frac( a )<<1 ) )
  4042 + || ( ( extractFloatx80Exp( b ) == 0x7FFF )
  4043 + && (bits64) ( extractFloatx80Frac( b )<<1 ) )
  4044 + ) {
  4045 + if ( floatx80_is_signaling_nan( a )
  4046 + || floatx80_is_signaling_nan( b ) ) {
  4047 + float_raise( float_flag_invalid STATUS_VAR);
  4048 + }
  4049 + return 0;
  4050 + }
  4051 + aSign = extractFloatx80Sign( a );
  4052 + bSign = extractFloatx80Sign( b );
  4053 + if ( aSign != bSign ) {
  4054 + return
  4055 + aSign
  4056 + && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  4057 + != 0 );
  4058 + }
  4059 + return
  4060 + aSign ? lt128( b.high, b.low, a.high, a.low )
  4061 + : lt128( a.high, a.low, b.high, b.low );
  4062 +
  4063 +}
  4064 +
  4065 +#endif
  4066 +
  4067 +#ifdef FLOAT128
  4068 +
  4069 +/*----------------------------------------------------------------------------
  4070 +| Returns the result of converting the quadruple-precision floating-point
  4071 +| value `a' to the 32-bit two's complement integer format. The conversion
  4072 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4073 +| Arithmetic---which means in particular that the conversion is rounded
  4074 +| according to the current rounding mode. If `a' is a NaN, the largest
  4075 +| positive integer is returned. Otherwise, if the conversion overflows, the
  4076 +| largest integer with the same sign as `a' is returned.
  4077 +*----------------------------------------------------------------------------*/
  4078 +
  4079 +int32 float128_to_int32( float128 a STATUS_PARAM )
  4080 +{
  4081 + flag aSign;
  4082 + int32 aExp, shiftCount;
  4083 + bits64 aSig0, aSig1;
  4084 +
  4085 + aSig1 = extractFloat128Frac1( a );
  4086 + aSig0 = extractFloat128Frac0( a );
  4087 + aExp = extractFloat128Exp( a );
  4088 + aSign = extractFloat128Sign( a );
  4089 + if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
  4090 + if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
  4091 + aSig0 |= ( aSig1 != 0 );
  4092 + shiftCount = 0x4028 - aExp;
  4093 + if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
  4094 + return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
  4095 +
  4096 +}
  4097 +
  4098 +/*----------------------------------------------------------------------------
  4099 +| Returns the result of converting the quadruple-precision floating-point
  4100 +| value `a' to the 32-bit two's complement integer format. The conversion
  4101 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4102 +| Arithmetic, except that the conversion is always rounded toward zero. If
  4103 +| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  4104 +| conversion overflows, the largest integer with the same sign as `a' is
  4105 +| returned.
  4106 +*----------------------------------------------------------------------------*/
  4107 +
  4108 +int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
  4109 +{
  4110 + flag aSign;
  4111 + int32 aExp, shiftCount;
  4112 + bits64 aSig0, aSig1, savedASig;
  4113 + int32 z;
  4114 +
  4115 + aSig1 = extractFloat128Frac1( a );
  4116 + aSig0 = extractFloat128Frac0( a );
  4117 + aExp = extractFloat128Exp( a );
  4118 + aSign = extractFloat128Sign( a );
  4119 + aSig0 |= ( aSig1 != 0 );
  4120 + if ( 0x401E < aExp ) {
  4121 + if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
  4122 + goto invalid;
  4123 + }
  4124 + else if ( aExp < 0x3FFF ) {
  4125 + if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
  4126 + return 0;
  4127 + }
  4128 + aSig0 |= LIT64( 0x0001000000000000 );
  4129 + shiftCount = 0x402F - aExp;
  4130 + savedASig = aSig0;
  4131 + aSig0 >>= shiftCount;
  4132 + z = aSig0;
  4133 + if ( aSign ) z = - z;
  4134 + if ( ( z < 0 ) ^ aSign ) {
  4135 + invalid:
  4136 + float_raise( float_flag_invalid STATUS_VAR);
  4137 + return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
  4138 + }
  4139 + if ( ( aSig0<<shiftCount ) != savedASig ) {
  4140 + STATUS(float_exception_flags) |= float_flag_inexact;
  4141 + }
  4142 + return z;
  4143 +
  4144 +}
  4145 +
  4146 +/*----------------------------------------------------------------------------
  4147 +| Returns the result of converting the quadruple-precision floating-point
  4148 +| value `a' to the 64-bit two's complement integer format. The conversion
  4149 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4150 +| Arithmetic---which means in particular that the conversion is rounded
  4151 +| according to the current rounding mode. If `a' is a NaN, the largest
  4152 +| positive integer is returned. Otherwise, if the conversion overflows, the
  4153 +| largest integer with the same sign as `a' is returned.
  4154 +*----------------------------------------------------------------------------*/
  4155 +
  4156 +int64 float128_to_int64( float128 a STATUS_PARAM )
  4157 +{
  4158 + flag aSign;
  4159 + int32 aExp, shiftCount;
  4160 + bits64 aSig0, aSig1;
  4161 +
  4162 + aSig1 = extractFloat128Frac1( a );
  4163 + aSig0 = extractFloat128Frac0( a );
  4164 + aExp = extractFloat128Exp( a );
  4165 + aSign = extractFloat128Sign( a );
  4166 + if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
  4167 + shiftCount = 0x402F - aExp;
  4168 + if ( shiftCount <= 0 ) {
  4169 + if ( 0x403E < aExp ) {
  4170 + float_raise( float_flag_invalid STATUS_VAR);
  4171 + if ( ! aSign
  4172 + || ( ( aExp == 0x7FFF )
  4173 + && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
  4174 + )
  4175 + ) {
  4176 + return LIT64( 0x7FFFFFFFFFFFFFFF );
  4177 + }
  4178 + return (sbits64) LIT64( 0x8000000000000000 );
  4179 + }
  4180 + shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
  4181 + }
  4182 + else {
  4183 + shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
  4184 + }
  4185 + return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
  4186 +
  4187 +}
  4188 +
  4189 +/*----------------------------------------------------------------------------
  4190 +| Returns the result of converting the quadruple-precision floating-point
  4191 +| value `a' to the 64-bit two's complement integer format. The conversion
  4192 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4193 +| Arithmetic, except that the conversion is always rounded toward zero.
  4194 +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  4195 +| the conversion overflows, the largest integer with the same sign as `a' is
  4196 +| returned.
  4197 +*----------------------------------------------------------------------------*/
  4198 +
  4199 +int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
  4200 +{
  4201 + flag aSign;
  4202 + int32 aExp, shiftCount;
  4203 + bits64 aSig0, aSig1;
  4204 + int64 z;
  4205 +
  4206 + aSig1 = extractFloat128Frac1( a );
  4207 + aSig0 = extractFloat128Frac0( a );
  4208 + aExp = extractFloat128Exp( a );
  4209 + aSign = extractFloat128Sign( a );
  4210 + if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
  4211 + shiftCount = aExp - 0x402F;
  4212 + if ( 0 < shiftCount ) {
  4213 + if ( 0x403E <= aExp ) {
  4214 + aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
  4215 + if ( ( a.high == LIT64( 0xC03E000000000000 ) )
  4216 + && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
  4217 + if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
  4218 + }
  4219 + else {
  4220 + float_raise( float_flag_invalid STATUS_VAR);
  4221 + if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
  4222 + return LIT64( 0x7FFFFFFFFFFFFFFF );
  4223 + }
  4224 + }
  4225 + return (sbits64) LIT64( 0x8000000000000000 );
  4226 + }
  4227 + z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
  4228 + if ( (bits64) ( aSig1<<shiftCount ) ) {
  4229 + STATUS(float_exception_flags) |= float_flag_inexact;
  4230 + }
  4231 + }
  4232 + else {
  4233 + if ( aExp < 0x3FFF ) {
  4234 + if ( aExp | aSig0 | aSig1 ) {
  4235 + STATUS(float_exception_flags) |= float_flag_inexact;
  4236 + }
  4237 + return 0;
  4238 + }
  4239 + z = aSig0>>( - shiftCount );
  4240 + if ( aSig1
  4241 + || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
  4242 + STATUS(float_exception_flags) |= float_flag_inexact;
  4243 + }
  4244 + }
  4245 + if ( aSign ) z = - z;
  4246 + return z;
  4247 +
  4248 +}
  4249 +
  4250 +/*----------------------------------------------------------------------------
  4251 +| Returns the result of converting the quadruple-precision floating-point
  4252 +| value `a' to the single-precision floating-point format. The conversion
  4253 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4254 +| Arithmetic.
  4255 +*----------------------------------------------------------------------------*/
  4256 +
  4257 +float32 float128_to_float32( float128 a STATUS_PARAM )
  4258 +{
  4259 + flag aSign;
  4260 + int32 aExp;
  4261 + bits64 aSig0, aSig1;
  4262 + bits32 zSig;
  4263 +
  4264 + aSig1 = extractFloat128Frac1( a );
  4265 + aSig0 = extractFloat128Frac0( a );
  4266 + aExp = extractFloat128Exp( a );
  4267 + aSign = extractFloat128Sign( a );
  4268 + if ( aExp == 0x7FFF ) {
  4269 + if ( aSig0 | aSig1 ) {
  4270 + return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
  4271 + }
  4272 + return packFloat32( aSign, 0xFF, 0 );
  4273 + }
  4274 + aSig0 |= ( aSig1 != 0 );
  4275 + shift64RightJamming( aSig0, 18, &aSig0 );
  4276 + zSig = aSig0;
  4277 + if ( aExp || zSig ) {
  4278 + zSig |= 0x40000000;
  4279 + aExp -= 0x3F81;
  4280 + }
  4281 + return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
  4282 +
  4283 +}
  4284 +
  4285 +/*----------------------------------------------------------------------------
  4286 +| Returns the result of converting the quadruple-precision floating-point
  4287 +| value `a' to the double-precision floating-point format. The conversion
  4288 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  4289 +| Arithmetic.
  4290 +*----------------------------------------------------------------------------*/
  4291 +
  4292 +float64 float128_to_float64( float128 a STATUS_PARAM )
  4293 +{
  4294 + flag aSign;
  4295 + int32 aExp;
  4296 + bits64 aSig0, aSig1;
  4297 +
  4298 + aSig1 = extractFloat128Frac1( a );
  4299 + aSig0 = extractFloat128Frac0( a );
  4300 + aExp = extractFloat128Exp( a );
  4301 + aSign = extractFloat128Sign( a );
  4302 + if ( aExp == 0x7FFF ) {
  4303 + if ( aSig0 | aSig1 ) {
  4304 + return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
  4305 + }
  4306 + return packFloat64( aSign, 0x7FF, 0 );
  4307 + }
  4308 + shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
  4309 + aSig0 |= ( aSig1 != 0 );
  4310 + if ( aExp || aSig0 ) {
  4311 + aSig0 |= LIT64( 0x4000000000000000 );
  4312 + aExp -= 0x3C01;
  4313 + }
  4314 + return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
  4315 +
  4316 +}
  4317 +
  4318 +#ifdef FLOATX80
  4319 +
  4320 +/*----------------------------------------------------------------------------
  4321 +| Returns the result of converting the quadruple-precision floating-point
  4322 +| value `a' to the extended double-precision floating-point format. The
  4323 +| conversion is performed according to the IEC/IEEE Standard for Binary
  4324 +| Floating-Point Arithmetic.
  4325 +*----------------------------------------------------------------------------*/
  4326 +
  4327 +floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
  4328 +{
  4329 + flag aSign;
  4330 + int32 aExp;
  4331 + bits64 aSig0, aSig1;
  4332 +
  4333 + aSig1 = extractFloat128Frac1( a );
  4334 + aSig0 = extractFloat128Frac0( a );
  4335 + aExp = extractFloat128Exp( a );
  4336 + aSign = extractFloat128Sign( a );
  4337 + if ( aExp == 0x7FFF ) {
  4338 + if ( aSig0 | aSig1 ) {
  4339 + return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
  4340 + }
  4341 + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  4342 + }
  4343 + if ( aExp == 0 ) {
  4344 + if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
  4345 + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  4346 + }
  4347 + else {
  4348 + aSig0 |= LIT64( 0x0001000000000000 );
  4349 + }
  4350 + shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
  4351 + return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
  4352 +
  4353 +}
  4354 +
  4355 +#endif
  4356 +
  4357 +/*----------------------------------------------------------------------------
  4358 +| Rounds the quadruple-precision floating-point value `a' to an integer, and
  4359 +| returns the result as a quadruple-precision floating-point value. The
  4360 +| operation is performed according to the IEC/IEEE Standard for Binary
  4361 +| Floating-Point Arithmetic.
  4362 +*----------------------------------------------------------------------------*/
  4363 +
  4364 +float128 float128_round_to_int( float128 a STATUS_PARAM )
  4365 +{
  4366 + flag aSign;
  4367 + int32 aExp;
  4368 + bits64 lastBitMask, roundBitsMask;
  4369 + int8 roundingMode;
  4370 + float128 z;
  4371 +
  4372 + aExp = extractFloat128Exp( a );
  4373 + if ( 0x402F <= aExp ) {
  4374 + if ( 0x406F <= aExp ) {
  4375 + if ( ( aExp == 0x7FFF )
  4376 + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
  4377 + ) {
  4378 + return propagateFloat128NaN( a, a STATUS_VAR );
  4379 + }
  4380 + return a;
  4381 + }
  4382 + lastBitMask = 1;
  4383 + lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
  4384 + roundBitsMask = lastBitMask - 1;
  4385 + z = a;
  4386 + roundingMode = STATUS(float_rounding_mode);
  4387 + if ( roundingMode == float_round_nearest_even ) {
  4388 + if ( lastBitMask ) {
  4389 + add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
  4390 + if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
  4391 + }
  4392 + else {
  4393 + if ( (sbits64) z.low < 0 ) {
  4394 + ++z.high;
  4395 + if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
  4396 + }
  4397 + }
  4398 + }
  4399 + else if ( roundingMode != float_round_to_zero ) {
  4400 + if ( extractFloat128Sign( z )
  4401 + ^ ( roundingMode == float_round_up ) ) {
  4402 + add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
  4403 + }
  4404 + }
  4405 + z.low &= ~ roundBitsMask;
  4406 + }
  4407 + else {
  4408 + if ( aExp < 0x3FFF ) {
  4409 + if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
  4410 + STATUS(float_exception_flags) |= float_flag_inexact;
  4411 + aSign = extractFloat128Sign( a );
  4412 + switch ( STATUS(float_rounding_mode) ) {
  4413 + case float_round_nearest_even:
  4414 + if ( ( aExp == 0x3FFE )
  4415 + && ( extractFloat128Frac0( a )
  4416 + | extractFloat128Frac1( a ) )
  4417 + ) {
  4418 + return packFloat128( aSign, 0x3FFF, 0, 0 );
  4419 + }
  4420 + break;
  4421 + case float_round_down:
  4422 + return
  4423 + aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
  4424 + : packFloat128( 0, 0, 0, 0 );
  4425 + case float_round_up:
  4426 + return
  4427 + aSign ? packFloat128( 1, 0, 0, 0 )
  4428 + : packFloat128( 0, 0x3FFF, 0, 0 );
  4429 + }
  4430 + return packFloat128( aSign, 0, 0, 0 );
  4431 + }
  4432 + lastBitMask = 1;
  4433 + lastBitMask <<= 0x402F - aExp;
  4434 + roundBitsMask = lastBitMask - 1;
  4435 + z.low = 0;
  4436 + z.high = a.high;
  4437 + roundingMode = STATUS(float_rounding_mode);
  4438 + if ( roundingMode == float_round_nearest_even ) {
  4439 + z.high += lastBitMask>>1;
  4440 + if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
  4441 + z.high &= ~ lastBitMask;
  4442 + }
  4443 + }
  4444 + else if ( roundingMode != float_round_to_zero ) {
  4445 + if ( extractFloat128Sign( z )
  4446 + ^ ( roundingMode == float_round_up ) ) {
  4447 + z.high |= ( a.low != 0 );
  4448 + z.high += roundBitsMask;
  4449 + }
  4450 + }
  4451 + z.high &= ~ roundBitsMask;
  4452 + }
  4453 + if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
  4454 + STATUS(float_exception_flags) |= float_flag_inexact;
  4455 + }
  4456 + return z;
  4457 +
  4458 +}
  4459 +
  4460 +/*----------------------------------------------------------------------------
  4461 +| Returns the result of adding the absolute values of the quadruple-precision
  4462 +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  4463 +| before being returned. `zSign' is ignored if the result is a NaN.
  4464 +| The addition is performed according to the IEC/IEEE Standard for Binary
  4465 +| Floating-Point Arithmetic.
  4466 +*----------------------------------------------------------------------------*/
  4467 +
  4468 +static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
  4469 +{
  4470 + int32 aExp, bExp, zExp;
  4471 + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
  4472 + int32 expDiff;
  4473 +
  4474 + aSig1 = extractFloat128Frac1( a );
  4475 + aSig0 = extractFloat128Frac0( a );
  4476 + aExp = extractFloat128Exp( a );
  4477 + bSig1 = extractFloat128Frac1( b );
  4478 + bSig0 = extractFloat128Frac0( b );
  4479 + bExp = extractFloat128Exp( b );
  4480 + expDiff = aExp - bExp;
  4481 + if ( 0 < expDiff ) {
  4482 + if ( aExp == 0x7FFF ) {
  4483 + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
  4484 + return a;
  4485 + }
  4486 + if ( bExp == 0 ) {
  4487 + --expDiff;
  4488 + }
  4489 + else {
  4490 + bSig0 |= LIT64( 0x0001000000000000 );
  4491 + }
  4492 + shift128ExtraRightJamming(
  4493 + bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
  4494 + zExp = aExp;
  4495 + }
  4496 + else if ( expDiff < 0 ) {
  4497 + if ( bExp == 0x7FFF ) {
  4498 + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
  4499 + return packFloat128( zSign, 0x7FFF, 0, 0 );
  4500 + }
  4501 + if ( aExp == 0 ) {
  4502 + ++expDiff;
  4503 + }
  4504 + else {
  4505 + aSig0 |= LIT64( 0x0001000000000000 );
  4506 + }
  4507 + shift128ExtraRightJamming(
  4508 + aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
  4509 + zExp = bExp;
  4510 + }
  4511 + else {
  4512 + if ( aExp == 0x7FFF ) {
  4513 + if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
  4514 + return propagateFloat128NaN( a, b STATUS_VAR );
  4515 + }
  4516 + return a;
  4517 + }
  4518 + add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  4519 + if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
  4520 + zSig2 = 0;
  4521 + zSig0 |= LIT64( 0x0002000000000000 );
  4522 + zExp = aExp;
  4523 + goto shiftRight1;
  4524 + }
  4525 + aSig0 |= LIT64( 0x0001000000000000 );
  4526 + add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  4527 + --zExp;
  4528 + if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
  4529 + ++zExp;
  4530 + shiftRight1:
  4531 + shift128ExtraRightJamming(
  4532 + zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
  4533 + roundAndPack:
  4534 + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
  4535 +
  4536 +}
  4537 +
  4538 +/*----------------------------------------------------------------------------
  4539 +| Returns the result of subtracting the absolute values of the quadruple-
  4540 +| precision floating-point values `a' and `b'. If `zSign' is 1, the
  4541 +| difference is negated before being returned. `zSign' is ignored if the
  4542 +| result is a NaN. The subtraction is performed according to the IEC/IEEE
  4543 +| Standard for Binary Floating-Point Arithmetic.
  4544 +*----------------------------------------------------------------------------*/
  4545 +
  4546 +static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
  4547 +{
  4548 + int32 aExp, bExp, zExp;
  4549 + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
  4550 + int32 expDiff;
  4551 + float128 z;
  4552 +
  4553 + aSig1 = extractFloat128Frac1( a );
  4554 + aSig0 = extractFloat128Frac0( a );
  4555 + aExp = extractFloat128Exp( a );
  4556 + bSig1 = extractFloat128Frac1( b );
  4557 + bSig0 = extractFloat128Frac0( b );
  4558 + bExp = extractFloat128Exp( b );
  4559 + expDiff = aExp - bExp;
  4560 + shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
  4561 + shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
  4562 + if ( 0 < expDiff ) goto aExpBigger;
  4563 + if ( expDiff < 0 ) goto bExpBigger;
  4564 + if ( aExp == 0x7FFF ) {
  4565 + if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
  4566 + return propagateFloat128NaN( a, b STATUS_VAR );
  4567 + }
  4568 + float_raise( float_flag_invalid STATUS_VAR);
  4569 + z.low = float128_default_nan_low;
  4570 + z.high = float128_default_nan_high;
  4571 + return z;
  4572 + }
  4573 + if ( aExp == 0 ) {
  4574 + aExp = 1;
  4575 + bExp = 1;
  4576 + }
  4577 + if ( bSig0 < aSig0 ) goto aBigger;
  4578 + if ( aSig0 < bSig0 ) goto bBigger;
  4579 + if ( bSig1 < aSig1 ) goto aBigger;
  4580 + if ( aSig1 < bSig1 ) goto bBigger;
  4581 + return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
  4582 + bExpBigger:
  4583 + if ( bExp == 0x7FFF ) {
  4584 + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
  4585 + return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
  4586 + }
  4587 + if ( aExp == 0 ) {
  4588 + ++expDiff;
  4589 + }
  4590 + else {
  4591 + aSig0 |= LIT64( 0x4000000000000000 );
  4592 + }
  4593 + shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
  4594 + bSig0 |= LIT64( 0x4000000000000000 );
  4595 + bBigger:
  4596 + sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
  4597 + zExp = bExp;
  4598 + zSign ^= 1;
  4599 + goto normalizeRoundAndPack;
  4600 + aExpBigger:
  4601 + if ( aExp == 0x7FFF ) {
  4602 + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
  4603 + return a;
  4604 + }
  4605 + if ( bExp == 0 ) {
  4606 + --expDiff;
  4607 + }
  4608 + else {
  4609 + bSig0 |= LIT64( 0x4000000000000000 );
  4610 + }
  4611 + shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
  4612 + aSig0 |= LIT64( 0x4000000000000000 );
  4613 + aBigger:
  4614 + sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  4615 + zExp = aExp;
  4616 + normalizeRoundAndPack:
  4617 + --zExp;
  4618 + return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
  4619 +
  4620 +}
  4621 +
  4622 +/*----------------------------------------------------------------------------
  4623 +| Returns the result of adding the quadruple-precision floating-point values
  4624 +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  4625 +| for Binary Floating-Point Arithmetic.
  4626 +*----------------------------------------------------------------------------*/
  4627 +
  4628 +float128 float128_add( float128 a, float128 b STATUS_PARAM )
  4629 +{
  4630 + flag aSign, bSign;
  4631 +
  4632 + aSign = extractFloat128Sign( a );
  4633 + bSign = extractFloat128Sign( b );
  4634 + if ( aSign == bSign ) {
  4635 + return addFloat128Sigs( a, b, aSign STATUS_VAR );
  4636 + }
  4637 + else {
  4638 + return subFloat128Sigs( a, b, aSign STATUS_VAR );
  4639 + }
  4640 +
  4641 +}
  4642 +
  4643 +/*----------------------------------------------------------------------------
  4644 +| Returns the result of subtracting the quadruple-precision floating-point
  4645 +| values `a' and `b'. The operation is performed according to the IEC/IEEE
  4646 +| Standard for Binary Floating-Point Arithmetic.
  4647 +*----------------------------------------------------------------------------*/
  4648 +
  4649 +float128 float128_sub( float128 a, float128 b STATUS_PARAM )
  4650 +{
  4651 + flag aSign, bSign;
  4652 +
  4653 + aSign = extractFloat128Sign( a );
  4654 + bSign = extractFloat128Sign( b );
  4655 + if ( aSign == bSign ) {
  4656 + return subFloat128Sigs( a, b, aSign STATUS_VAR );
  4657 + }
  4658 + else {
  4659 + return addFloat128Sigs( a, b, aSign STATUS_VAR );
  4660 + }
  4661 +
  4662 +}
  4663 +
  4664 +/*----------------------------------------------------------------------------
  4665 +| Returns the result of multiplying the quadruple-precision floating-point
  4666 +| values `a' and `b'. The operation is performed according to the IEC/IEEE
  4667 +| Standard for Binary Floating-Point Arithmetic.
  4668 +*----------------------------------------------------------------------------*/
  4669 +
  4670 +float128 float128_mul( float128 a, float128 b STATUS_PARAM )
  4671 +{
  4672 + flag aSign, bSign, zSign;
  4673 + int32 aExp, bExp, zExp;
  4674 + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
  4675 + float128 z;
  4676 +
  4677 + aSig1 = extractFloat128Frac1( a );
  4678 + aSig0 = extractFloat128Frac0( a );
  4679 + aExp = extractFloat128Exp( a );
  4680 + aSign = extractFloat128Sign( a );
  4681 + bSig1 = extractFloat128Frac1( b );
  4682 + bSig0 = extractFloat128Frac0( b );
  4683 + bExp = extractFloat128Exp( b );
  4684 + bSign = extractFloat128Sign( b );
  4685 + zSign = aSign ^ bSign;
  4686 + if ( aExp == 0x7FFF ) {
  4687 + if ( ( aSig0 | aSig1 )
  4688 + || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
  4689 + return propagateFloat128NaN( a, b STATUS_VAR );
  4690 + }
  4691 + if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
  4692 + return packFloat128( zSign, 0x7FFF, 0, 0 );
  4693 + }
  4694 + if ( bExp == 0x7FFF ) {
  4695 + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
  4696 + if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
  4697 + invalid:
  4698 + float_raise( float_flag_invalid STATUS_VAR);
  4699 + z.low = float128_default_nan_low;
  4700 + z.high = float128_default_nan_high;
  4701 + return z;
  4702 + }
  4703 + return packFloat128( zSign, 0x7FFF, 0, 0 );
  4704 + }
  4705 + if ( aExp == 0 ) {
  4706 + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
  4707 + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  4708 + }
  4709 + if ( bExp == 0 ) {
  4710 + if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
  4711 + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  4712 + }
  4713 + zExp = aExp + bExp - 0x4000;
  4714 + aSig0 |= LIT64( 0x0001000000000000 );
  4715 + shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
  4716 + mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
  4717 + add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
  4718 + zSig2 |= ( zSig3 != 0 );
  4719 + if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
  4720 + shift128ExtraRightJamming(
  4721 + zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
  4722 + ++zExp;
  4723 + }
  4724 + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
  4725 +
  4726 +}
  4727 +
  4728 +/*----------------------------------------------------------------------------
  4729 +| Returns the result of dividing the quadruple-precision floating-point value
  4730 +| `a' by the corresponding value `b'. The operation is performed according to
  4731 +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4732 +*----------------------------------------------------------------------------*/
  4733 +
  4734 +float128 float128_div( float128 a, float128 b STATUS_PARAM )
  4735 +{
  4736 + flag aSign, bSign, zSign;
  4737 + int32 aExp, bExp, zExp;
  4738 + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
  4739 + bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
  4740 + float128 z;
  4741 +
  4742 + aSig1 = extractFloat128Frac1( a );
  4743 + aSig0 = extractFloat128Frac0( a );
  4744 + aExp = extractFloat128Exp( a );
  4745 + aSign = extractFloat128Sign( a );
  4746 + bSig1 = extractFloat128Frac1( b );
  4747 + bSig0 = extractFloat128Frac0( b );
  4748 + bExp = extractFloat128Exp( b );
  4749 + bSign = extractFloat128Sign( b );
  4750 + zSign = aSign ^ bSign;
  4751 + if ( aExp == 0x7FFF ) {
  4752 + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
  4753 + if ( bExp == 0x7FFF ) {
  4754 + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
  4755 + goto invalid;
  4756 + }
  4757 + return packFloat128( zSign, 0x7FFF, 0, 0 );
  4758 + }
  4759 + if ( bExp == 0x7FFF ) {
  4760 + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
  4761 + return packFloat128( zSign, 0, 0, 0 );
  4762 + }
  4763 + if ( bExp == 0 ) {
  4764 + if ( ( bSig0 | bSig1 ) == 0 ) {
  4765 + if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
  4766 + invalid:
  4767 + float_raise( float_flag_invalid STATUS_VAR);
  4768 + z.low = float128_default_nan_low;
  4769 + z.high = float128_default_nan_high;
  4770 + return z;
  4771 + }
  4772 + float_raise( float_flag_divbyzero STATUS_VAR);
  4773 + return packFloat128( zSign, 0x7FFF, 0, 0 );
  4774 + }
  4775 + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  4776 + }
  4777 + if ( aExp == 0 ) {
  4778 + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
  4779 + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  4780 + }
  4781 + zExp = aExp - bExp + 0x3FFD;
  4782 + shortShift128Left(
  4783 + aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
  4784 + shortShift128Left(
  4785 + bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
  4786 + if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
  4787 + shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
  4788 + ++zExp;
  4789 + }
  4790 + zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
  4791 + mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
  4792 + sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
  4793 + while ( (sbits64) rem0 < 0 ) {
  4794 + --zSig0;
  4795 + add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
  4796 + }
  4797 + zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
  4798 + if ( ( zSig1 & 0x3FFF ) <= 4 ) {
  4799 + mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
  4800 + sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
  4801 + while ( (sbits64) rem1 < 0 ) {
  4802 + --zSig1;
  4803 + add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
  4804 + }
  4805 + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
  4806 + }
  4807 + shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
  4808 + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
  4809 +
  4810 +}
  4811 +
  4812 +/*----------------------------------------------------------------------------
  4813 +| Returns the remainder of the quadruple-precision floating-point value `a'
  4814 +| with respect to the corresponding value `b'. The operation is performed
  4815 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4816 +*----------------------------------------------------------------------------*/
  4817 +
  4818 +float128 float128_rem( float128 a, float128 b STATUS_PARAM )
  4819 +{
  4820 + flag aSign, bSign, zSign;
  4821 + int32 aExp, bExp, expDiff;
  4822 + bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
  4823 + bits64 allZero, alternateASig0, alternateASig1, sigMean1;
  4824 + sbits64 sigMean0;
  4825 + float128 z;
  4826 +
  4827 + aSig1 = extractFloat128Frac1( a );
  4828 + aSig0 = extractFloat128Frac0( a );
  4829 + aExp = extractFloat128Exp( a );
  4830 + aSign = extractFloat128Sign( a );
  4831 + bSig1 = extractFloat128Frac1( b );
  4832 + bSig0 = extractFloat128Frac0( b );
  4833 + bExp = extractFloat128Exp( b );
  4834 + bSign = extractFloat128Sign( b );
  4835 + if ( aExp == 0x7FFF ) {
  4836 + if ( ( aSig0 | aSig1 )
  4837 + || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
  4838 + return propagateFloat128NaN( a, b STATUS_VAR );
  4839 + }
  4840 + goto invalid;
  4841 + }
  4842 + if ( bExp == 0x7FFF ) {
  4843 + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
  4844 + return a;
  4845 + }
  4846 + if ( bExp == 0 ) {
  4847 + if ( ( bSig0 | bSig1 ) == 0 ) {
  4848 + invalid:
  4849 + float_raise( float_flag_invalid STATUS_VAR);
  4850 + z.low = float128_default_nan_low;
  4851 + z.high = float128_default_nan_high;
  4852 + return z;
  4853 + }
  4854 + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  4855 + }
  4856 + if ( aExp == 0 ) {
  4857 + if ( ( aSig0 | aSig1 ) == 0 ) return a;
  4858 + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  4859 + }
  4860 + expDiff = aExp - bExp;
  4861 + if ( expDiff < -1 ) return a;
  4862 + shortShift128Left(
  4863 + aSig0 | LIT64( 0x0001000000000000 ),
  4864 + aSig1,
  4865 + 15 - ( expDiff < 0 ),
  4866 + &aSig0,
  4867 + &aSig1
  4868 + );
  4869 + shortShift128Left(
  4870 + bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
  4871 + q = le128( bSig0, bSig1, aSig0, aSig1 );
  4872 + if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
  4873 + expDiff -= 64;
  4874 + while ( 0 < expDiff ) {
  4875 + q = estimateDiv128To64( aSig0, aSig1, bSig0 );
  4876 + q = ( 4 < q ) ? q - 4 : 0;
  4877 + mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
  4878 + shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
  4879 + shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
  4880 + sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
  4881 + expDiff -= 61;
  4882 + }
  4883 + if ( -64 < expDiff ) {
  4884 + q = estimateDiv128To64( aSig0, aSig1, bSig0 );
  4885 + q = ( 4 < q ) ? q - 4 : 0;
  4886 + q >>= - expDiff;
  4887 + shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
  4888 + expDiff += 52;
  4889 + if ( expDiff < 0 ) {
  4890 + shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
  4891 + }
  4892 + else {
  4893 + shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
  4894 + }
  4895 + mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
  4896 + sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
  4897 + }
  4898 + else {
  4899 + shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
  4900 + shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
  4901 + }
  4902 + do {
  4903 + alternateASig0 = aSig0;
  4904 + alternateASig1 = aSig1;
  4905 + ++q;
  4906 + sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
  4907 + } while ( 0 <= (sbits64) aSig0 );
  4908 + add128(
  4909 + aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
  4910 + if ( ( sigMean0 < 0 )
  4911 + || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
  4912 + aSig0 = alternateASig0;
  4913 + aSig1 = alternateASig1;
  4914 + }
  4915 + zSign = ( (sbits64) aSig0 < 0 );
  4916 + if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
  4917 + return
  4918 + normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
  4919 +
  4920 +}
  4921 +
  4922 +/*----------------------------------------------------------------------------
  4923 +| Returns the square root of the quadruple-precision floating-point value `a'.
  4924 +| The operation is performed according to the IEC/IEEE Standard for Binary
  4925 +| Floating-Point Arithmetic.
  4926 +*----------------------------------------------------------------------------*/
  4927 +
  4928 +float128 float128_sqrt( float128 a STATUS_PARAM )
  4929 +{
  4930 + flag aSign;
  4931 + int32 aExp, zExp;
  4932 + bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
  4933 + bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
  4934 + float128 z;
  4935 +
  4936 + aSig1 = extractFloat128Frac1( a );
  4937 + aSig0 = extractFloat128Frac0( a );
  4938 + aExp = extractFloat128Exp( a );
  4939 + aSign = extractFloat128Sign( a );
  4940 + if ( aExp == 0x7FFF ) {
  4941 + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
  4942 + if ( ! aSign ) return a;
  4943 + goto invalid;
  4944 + }
  4945 + if ( aSign ) {
  4946 + if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
  4947 + invalid:
  4948 + float_raise( float_flag_invalid STATUS_VAR);
  4949 + z.low = float128_default_nan_low;
  4950 + z.high = float128_default_nan_high;
  4951 + return z;
  4952 + }
  4953 + if ( aExp == 0 ) {
  4954 + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
  4955 + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  4956 + }
  4957 + zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
  4958 + aSig0 |= LIT64( 0x0001000000000000 );
  4959 + zSig0 = estimateSqrt32( aExp, aSig0>>17 );
  4960 + shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
  4961 + zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
  4962 + doubleZSig0 = zSig0<<1;
  4963 + mul64To128( zSig0, zSig0, &term0, &term1 );
  4964 + sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
  4965 + while ( (sbits64) rem0 < 0 ) {
  4966 + --zSig0;
  4967 + doubleZSig0 -= 2;
  4968 + add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
  4969 + }
  4970 + zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
  4971 + if ( ( zSig1 & 0x1FFF ) <= 5 ) {
  4972 + if ( zSig1 == 0 ) zSig1 = 1;
  4973 + mul64To128( doubleZSig0, zSig1, &term1, &term2 );
  4974 + sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  4975 + mul64To128( zSig1, zSig1, &term2, &term3 );
  4976 + sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
  4977 + while ( (sbits64) rem1 < 0 ) {
  4978 + --zSig1;
  4979 + shortShift128Left( 0, zSig1, 1, &term2, &term3 );
  4980 + term3 |= 1;
  4981 + term2 |= doubleZSig0;
  4982 + add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
  4983 + }
  4984 + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
  4985 + }
  4986 + shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
  4987 + return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
  4988 +
  4989 +}
  4990 +
  4991 +/*----------------------------------------------------------------------------
  4992 +| Returns 1 if the quadruple-precision floating-point value `a' is equal to
  4993 +| the corresponding value `b', and 0 otherwise. The comparison is performed
  4994 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4995 +*----------------------------------------------------------------------------*/
  4996 +
  4997 +flag float128_eq( float128 a, float128 b STATUS_PARAM )
  4998 +{
  4999 +
  5000 + if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  5001 + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  5002 + || ( ( extractFloat128Exp( b ) == 0x7FFF )
  5003 + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  5004 + ) {
  5005 + if ( float128_is_signaling_nan( a )
  5006 + || float128_is_signaling_nan( b ) ) {
  5007 + float_raise( float_flag_invalid STATUS_VAR);
  5008 + }
  5009 + return 0;
  5010 + }
  5011 + return
  5012 + ( a.low == b.low )
  5013 + && ( ( a.high == b.high )
  5014 + || ( ( a.low == 0 )
  5015 + && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
  5016 + );
  5017 +
  5018 +}
  5019 +
  5020 +/*----------------------------------------------------------------------------
  5021 +| Returns 1 if the quadruple-precision floating-point value `a' is less than
  5022 +| or equal to the corresponding value `b', and 0 otherwise. The comparison
  5023 +| is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5024 +| Arithmetic.
  5025 +*----------------------------------------------------------------------------*/
  5026 +
  5027 +flag float128_le( float128 a, float128 b STATUS_PARAM )
  5028 +{
  5029 + flag aSign, bSign;
  5030 +
  5031 + if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  5032 + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  5033 + || ( ( extractFloat128Exp( b ) == 0x7FFF )
  5034 + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  5035 + ) {
  5036 + float_raise( float_flag_invalid STATUS_VAR);
  5037 + return 0;
  5038 + }
  5039 + aSign = extractFloat128Sign( a );
  5040 + bSign = extractFloat128Sign( b );
  5041 + if ( aSign != bSign ) {
  5042 + return
  5043 + aSign
  5044 + || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  5045 + == 0 );
  5046 + }
  5047 + return
  5048 + aSign ? le128( b.high, b.low, a.high, a.low )
  5049 + : le128( a.high, a.low, b.high, b.low );
  5050 +
  5051 +}
  5052 +
  5053 +/*----------------------------------------------------------------------------
  5054 +| Returns 1 if the quadruple-precision floating-point value `a' is less than
  5055 +| the corresponding value `b', and 0 otherwise. The comparison is performed
  5056 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5057 +*----------------------------------------------------------------------------*/
  5058 +
  5059 +flag float128_lt( float128 a, float128 b STATUS_PARAM )
  5060 +{
  5061 + flag aSign, bSign;
  5062 +
  5063 + if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  5064 + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  5065 + || ( ( extractFloat128Exp( b ) == 0x7FFF )
  5066 + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  5067 + ) {
  5068 + float_raise( float_flag_invalid STATUS_VAR);
  5069 + return 0;
  5070 + }
  5071 + aSign = extractFloat128Sign( a );
  5072 + bSign = extractFloat128Sign( b );
  5073 + if ( aSign != bSign ) {
  5074 + return
  5075 + aSign
  5076 + && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  5077 + != 0 );
  5078 + }
  5079 + return
  5080 + aSign ? lt128( b.high, b.low, a.high, a.low )
  5081 + : lt128( a.high, a.low, b.high, b.low );
  5082 +
  5083 +}
  5084 +
  5085 +/*----------------------------------------------------------------------------
  5086 +| Returns 1 if the quadruple-precision floating-point value `a' is equal to
  5087 +| the corresponding value `b', and 0 otherwise. The invalid exception is
  5088 +| raised if either operand is a NaN. Otherwise, the comparison is performed
  5089 +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5090 +*----------------------------------------------------------------------------*/
  5091 +
  5092 +flag float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
  5093 +{
  5094 +
  5095 + if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  5096 + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  5097 + || ( ( extractFloat128Exp( b ) == 0x7FFF )
  5098 + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  5099 + ) {
  5100 + float_raise( float_flag_invalid STATUS_VAR);
  5101 + return 0;
  5102 + }
  5103 + return
  5104 + ( a.low == b.low )
  5105 + && ( ( a.high == b.high )
  5106 + || ( ( a.low == 0 )
  5107 + && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
  5108 + );
  5109 +
  5110 +}
  5111 +
  5112 +/*----------------------------------------------------------------------------
  5113 +| Returns 1 if the quadruple-precision floating-point value `a' is less than
  5114 +| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5115 +| cause an exception. Otherwise, the comparison is performed according to the
  5116 +| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5117 +*----------------------------------------------------------------------------*/
  5118 +
  5119 +flag float128_le_quiet( float128 a, float128 b STATUS_PARAM )
  5120 +{
  5121 + flag aSign, bSign;
  5122 +
  5123 + if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  5124 + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  5125 + || ( ( extractFloat128Exp( b ) == 0x7FFF )
  5126 + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  5127 + ) {
  5128 + if ( float128_is_signaling_nan( a )
  5129 + || float128_is_signaling_nan( b ) ) {
  5130 + float_raise( float_flag_invalid STATUS_VAR);
  5131 + }
  5132 + return 0;
  5133 + }
  5134 + aSign = extractFloat128Sign( a );
  5135 + bSign = extractFloat128Sign( b );
  5136 + if ( aSign != bSign ) {
  5137 + return
  5138 + aSign
  5139 + || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  5140 + == 0 );
  5141 + }
  5142 + return
  5143 + aSign ? le128( b.high, b.low, a.high, a.low )
  5144 + : le128( a.high, a.low, b.high, b.low );
  5145 +
  5146 +}
  5147 +
  5148 +/*----------------------------------------------------------------------------
  5149 +| Returns 1 if the quadruple-precision floating-point value `a' is less than
  5150 +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  5151 +| exception. Otherwise, the comparison is performed according to the IEC/IEEE
  5152 +| Standard for Binary Floating-Point Arithmetic.
  5153 +*----------------------------------------------------------------------------*/
  5154 +
  5155 +flag float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
  5156 +{
  5157 + flag aSign, bSign;
  5158 +
  5159 + if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  5160 + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  5161 + || ( ( extractFloat128Exp( b ) == 0x7FFF )
  5162 + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  5163 + ) {
  5164 + if ( float128_is_signaling_nan( a )
  5165 + || float128_is_signaling_nan( b ) ) {
  5166 + float_raise( float_flag_invalid STATUS_VAR);
  5167 + }
  5168 + return 0;
  5169 + }
  5170 + aSign = extractFloat128Sign( a );
  5171 + bSign = extractFloat128Sign( b );
  5172 + if ( aSign != bSign ) {
  5173 + return
  5174 + aSign
  5175 + && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  5176 + != 0 );
  5177 + }
  5178 + return
  5179 + aSign ? lt128( b.high, b.low, a.high, a.low )
  5180 + : lt128( a.high, a.low, b.high, b.low );
  5181 +
  5182 +}
  5183 +
  5184 +#endif
  5185 +
... ...
fpu/softfloat.h 0 → 100644
  1 +/*============================================================================
  2 +
  3 +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  4 +Package, Release 2b.
  5 +
  6 +Written by John R. Hauser. This work was made possible in part by the
  7 +International Computer Science Institute, located at Suite 600, 1947 Center
  8 +Street, Berkeley, California 94704. Funding was partially provided by the
  9 +National Science Foundation under grant MIP-9311980. The original version
  10 +of this code was written as part of a project to build a fixed-point vector
  11 +processor in collaboration with the University of California at Berkeley,
  12 +overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  13 +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  14 +arithmetic/SoftFloat.html'.
  15 +
  16 +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
  17 +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  18 +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  19 +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  20 +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  21 +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  22 +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  23 +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  24 +
  25 +Derivative works are acceptable, even for commercial purposes, so long as
  26 +(1) the source code for the derivative work includes prominent notice that
  27 +the work is derivative, and (2) the source code includes prominent notice with
  28 +these four paragraphs for those parts of this code that are retained.
  29 +
  30 +=============================================================================*/
  31 +
  32 +#ifndef SOFTFLOAT_H
  33 +#define SOFTFLOAT_H
  34 +
  35 +#include <inttypes.h>
  36 +#include "config.h"
  37 +
  38 +/*----------------------------------------------------------------------------
  39 +| Each of the following `typedef's defines the most convenient type that holds
  40 +| integers of at least as many bits as specified. For example, `uint8' should
  41 +| be the most convenient type that can hold unsigned integers of as many as
  42 +| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most
  43 +| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed
  44 +| to the same as `int'.
  45 +*----------------------------------------------------------------------------*/
  46 +typedef char flag;
  47 +typedef uint8_t uint8;
  48 +typedef int8_t int8;
  49 +typedef int uint16;
  50 +typedef int int16;
  51 +typedef unsigned int uint32;
  52 +typedef signed int int32;
  53 +typedef uint64_t uint64;
  54 +typedef int64_t int64;
  55 +
  56 +/*----------------------------------------------------------------------------
  57 +| Each of the following `typedef's defines a type that holds integers
  58 +| of _exactly_ the number of bits specified. For instance, for most
  59 +| implementation of C, `bits16' and `sbits16' should be `typedef'ed to
  60 +| `unsigned short int' and `signed short int' (or `short int'), respectively.
  61 +*----------------------------------------------------------------------------*/
  62 +typedef uint8_t bits8;
  63 +typedef int8_t sbits8;
  64 +typedef uint16_t bits16;
  65 +typedef int16_t sbits16;
  66 +typedef uint32_t bits32;
  67 +typedef int32_t sbits32;
  68 +typedef uint64_t bits64;
  69 +typedef int64_t sbits64;
  70 +
  71 +#define LIT64( a ) a##LL
  72 +#define INLINE static inline
  73 +
  74 +/*----------------------------------------------------------------------------
  75 +| The macro `FLOATX80' must be defined to enable the extended double-precision
  76 +| floating-point format `floatx80'. If this macro is not defined, the
  77 +| `floatx80' type will not be defined, and none of the functions that either
  78 +| input or output the `floatx80' type will be defined. The same applies to
  79 +| the `FLOAT128' macro and the quadruple-precision format `float128'.
  80 +*----------------------------------------------------------------------------*/
  81 +#ifdef CONFIG_SOFTFLOAT
  82 +/* bit exact soft float support */
  83 +#define FLOATX80
  84 +#define FLOAT128
  85 +#else
  86 +/* native float support */
  87 +#if (defined(__i386__) || defined(__x86_64__)) && !defined(_BSD)
  88 +#define FLOATX80
  89 +#endif
  90 +#endif /* !CONFIG_SOFTFLOAT */
  91 +
  92 +#define STATUS_PARAM , float_status *status
  93 +#define STATUS(field) status->field
  94 +#define STATUS_VAR , status
  95 +
  96 +#ifdef CONFIG_SOFTFLOAT
  97 +/*----------------------------------------------------------------------------
  98 +| Software IEC/IEEE floating-point types.
  99 +*----------------------------------------------------------------------------*/
  100 +typedef uint32_t float32;
  101 +typedef uint64_t float64;
  102 +#ifdef FLOATX80
  103 +typedef struct {
  104 + uint64_t low;
  105 + uint16_t high;
  106 +} floatx80;
  107 +#endif
  108 +#ifdef FLOAT128
  109 +typedef struct {
  110 +#ifdef WORDS_BIGENDIAN
  111 + uint64_t high, low;
  112 +#else
  113 + uint64_t low, high;
  114 +#endif
  115 +} float128;
  116 +#endif
  117 +
  118 +/*----------------------------------------------------------------------------
  119 +| Software IEC/IEEE floating-point underflow tininess-detection mode.
  120 +*----------------------------------------------------------------------------*/
  121 +enum {
  122 + float_tininess_after_rounding = 0,
  123 + float_tininess_before_rounding = 1
  124 +};
  125 +
  126 +/*----------------------------------------------------------------------------
  127 +| Software IEC/IEEE floating-point rounding mode.
  128 +*----------------------------------------------------------------------------*/
  129 +enum {
  130 + float_round_nearest_even = 0,
  131 + float_round_down = 1,
  132 + float_round_up = 2,
  133 + float_round_to_zero = 3
  134 +};
  135 +
  136 +/*----------------------------------------------------------------------------
  137 +| Software IEC/IEEE floating-point exception flags.
  138 +*----------------------------------------------------------------------------*/
  139 +enum {
  140 + float_flag_invalid = 1,
  141 + float_flag_divbyzero = 4,
  142 + float_flag_overflow = 8,
  143 + float_flag_underflow = 16,
  144 + float_flag_inexact = 32
  145 +};
  146 +
  147 +typedef struct float_status {
  148 + signed char float_detect_tininess;
  149 + signed char float_rounding_mode;
  150 + signed char float_exception_flags;
  151 +#ifdef FLOATX80
  152 + signed char floatx80_rounding_precision;
  153 +#endif
  154 +} float_status;
  155 +
  156 +void set_float_rounding_mode(int val STATUS_PARAM);
  157 +#ifdef FLOATX80
  158 +void set_floatx80_rounding_precision(int val STATUS_PARAM);
  159 +#endif
  160 +
  161 +/*----------------------------------------------------------------------------
  162 +| Routine to raise any or all of the software IEC/IEEE floating-point
  163 +| exception flags.
  164 +*----------------------------------------------------------------------------*/
  165 +void float_raise( signed char STATUS_PARAM);
  166 +
  167 +/*----------------------------------------------------------------------------
  168 +| Software IEC/IEEE integer-to-floating-point conversion routines.
  169 +*----------------------------------------------------------------------------*/
  170 +float32 int32_to_float32( int STATUS_PARAM );
  171 +float64 int32_to_float64( int STATUS_PARAM );
  172 +#ifdef FLOATX80
  173 +floatx80 int32_to_floatx80( int STATUS_PARAM );
  174 +#endif
  175 +#ifdef FLOAT128
  176 +float128 int32_to_float128( int STATUS_PARAM );
  177 +#endif
  178 +float32 int64_to_float32( int64_t STATUS_PARAM );
  179 +float64 int64_to_float64( int64_t STATUS_PARAM );
  180 +#ifdef FLOATX80
  181 +floatx80 int64_to_floatx80( int64_t STATUS_PARAM );
  182 +#endif
  183 +#ifdef FLOAT128
  184 +float128 int64_to_float128( int64_t STATUS_PARAM );
  185 +#endif
  186 +
  187 +/*----------------------------------------------------------------------------
  188 +| Software IEC/IEEE single-precision conversion routines.
  189 +*----------------------------------------------------------------------------*/
  190 +int float32_to_int32( float32 STATUS_PARAM );
  191 +int float32_to_int32_round_to_zero( float32 STATUS_PARAM );
  192 +int64_t float32_to_int64( float32 STATUS_PARAM );
  193 +int64_t float32_to_int64_round_to_zero( float32 STATUS_PARAM );
  194 +float64 float32_to_float64( float32 STATUS_PARAM );
  195 +#ifdef FLOATX80
  196 +floatx80 float32_to_floatx80( float32 STATUS_PARAM );
  197 +#endif
  198 +#ifdef FLOAT128
  199 +float128 float32_to_float128( float32 STATUS_PARAM );
  200 +#endif
  201 +
  202 +/*----------------------------------------------------------------------------
  203 +| Software IEC/IEEE single-precision operations.
  204 +*----------------------------------------------------------------------------*/
  205 +float32 float32_round_to_int( float32 STATUS_PARAM );
  206 +float32 float32_add( float32, float32 STATUS_PARAM );
  207 +float32 float32_sub( float32, float32 STATUS_PARAM );
  208 +float32 float32_mul( float32, float32 STATUS_PARAM );
  209 +float32 float32_div( float32, float32 STATUS_PARAM );
  210 +float32 float32_rem( float32, float32 STATUS_PARAM );
  211 +float32 float32_sqrt( float32 STATUS_PARAM );
  212 +char float32_eq( float32, float32 STATUS_PARAM );
  213 +char float32_le( float32, float32 STATUS_PARAM );
  214 +char float32_lt( float32, float32 STATUS_PARAM );
  215 +char float32_eq_signaling( float32, float32 STATUS_PARAM );
  216 +char float32_le_quiet( float32, float32 STATUS_PARAM );
  217 +char float32_lt_quiet( float32, float32 STATUS_PARAM );
  218 +char float32_is_signaling_nan( float32 );
  219 +
  220 +/*----------------------------------------------------------------------------
  221 +| Software IEC/IEEE double-precision conversion routines.
  222 +*----------------------------------------------------------------------------*/
  223 +int float64_to_int32( float64 STATUS_PARAM );
  224 +int float64_to_int32_round_to_zero( float64 STATUS_PARAM );
  225 +int64_t float64_to_int64( float64 STATUS_PARAM );
  226 +int64_t float64_to_int64_round_to_zero( float64 STATUS_PARAM );
  227 +float32 float64_to_float32( float64 STATUS_PARAM );
  228 +#ifdef FLOATX80
  229 +floatx80 float64_to_floatx80( float64 STATUS_PARAM );
  230 +#endif
  231 +#ifdef FLOAT128
  232 +float128 float64_to_float128( float64 STATUS_PARAM );
  233 +#endif
  234 +
  235 +/*----------------------------------------------------------------------------
  236 +| Software IEC/IEEE double-precision operations.
  237 +*----------------------------------------------------------------------------*/
  238 +float64 float64_round_to_int( float64 STATUS_PARAM );
  239 +float64 float64_add( float64, float64 STATUS_PARAM );
  240 +float64 float64_sub( float64, float64 STATUS_PARAM );
  241 +float64 float64_mul( float64, float64 STATUS_PARAM );
  242 +float64 float64_div( float64, float64 STATUS_PARAM );
  243 +float64 float64_rem( float64, float64 STATUS_PARAM );
  244 +float64 float64_sqrt( float64 STATUS_PARAM );
  245 +char float64_eq( float64, float64 STATUS_PARAM );
  246 +char float64_le( float64, float64 STATUS_PARAM );
  247 +char float64_lt( float64, float64 STATUS_PARAM );
  248 +char float64_eq_signaling( float64, float64 STATUS_PARAM );
  249 +char float64_le_quiet( float64, float64 STATUS_PARAM );
  250 +char float64_lt_quiet( float64, float64 STATUS_PARAM );
  251 +char float64_is_signaling_nan( float64 );
  252 +
  253 +#ifdef FLOATX80
  254 +
  255 +/*----------------------------------------------------------------------------
  256 +| Software IEC/IEEE extended double-precision conversion routines.
  257 +*----------------------------------------------------------------------------*/
  258 +int floatx80_to_int32( floatx80 STATUS_PARAM );
  259 +int floatx80_to_int32_round_to_zero( floatx80 STATUS_PARAM );
  260 +int64_t floatx80_to_int64( floatx80 STATUS_PARAM );
  261 +int64_t floatx80_to_int64_round_to_zero( floatx80 STATUS_PARAM );
  262 +float32 floatx80_to_float32( floatx80 STATUS_PARAM );
  263 +float64 floatx80_to_float64( floatx80 STATUS_PARAM );
  264 +#ifdef FLOAT128
  265 +float128 floatx80_to_float128( floatx80 STATUS_PARAM );
  266 +#endif
  267 +
  268 +/*----------------------------------------------------------------------------
  269 +| Software IEC/IEEE extended double-precision operations.
  270 +*----------------------------------------------------------------------------*/
  271 +floatx80 floatx80_round_to_int( floatx80 STATUS_PARAM );
  272 +floatx80 floatx80_add( floatx80, floatx80 STATUS_PARAM );
  273 +floatx80 floatx80_sub( floatx80, floatx80 STATUS_PARAM );
  274 +floatx80 floatx80_mul( floatx80, floatx80 STATUS_PARAM );
  275 +floatx80 floatx80_div( floatx80, floatx80 STATUS_PARAM );
  276 +floatx80 floatx80_rem( floatx80, floatx80 STATUS_PARAM );
  277 +floatx80 floatx80_sqrt( floatx80 STATUS_PARAM );
  278 +char floatx80_eq( floatx80, floatx80 STATUS_PARAM );
  279 +char floatx80_le( floatx80, floatx80 STATUS_PARAM );
  280 +char floatx80_lt( floatx80, floatx80 STATUS_PARAM );
  281 +char floatx80_eq_signaling( floatx80, floatx80 STATUS_PARAM );
  282 +char floatx80_le_quiet( floatx80, floatx80 STATUS_PARAM );
  283 +char floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM );
  284 +char floatx80_is_signaling_nan( floatx80 );
  285 +
  286 +#endif
  287 +
  288 +#ifdef FLOAT128
  289 +
  290 +/*----------------------------------------------------------------------------
  291 +| Software IEC/IEEE quadruple-precision conversion routines.
  292 +*----------------------------------------------------------------------------*/
  293 +int float128_to_int32( float128 STATUS_PARAM );
  294 +int float128_to_int32_round_to_zero( float128 STATUS_PARAM );
  295 +int64_t float128_to_int64( float128 STATUS_PARAM );
  296 +int64_t float128_to_int64_round_to_zero( float128 STATUS_PARAM );
  297 +float32 float128_to_float32( float128 STATUS_PARAM );
  298 +float64 float128_to_float64( float128 STATUS_PARAM );
  299 +#ifdef FLOATX80
  300 +floatx80 float128_to_floatx80( float128 STATUS_PARAM );
  301 +#endif
  302 +
  303 +/*----------------------------------------------------------------------------
  304 +| Software IEC/IEEE quadruple-precision operations.
  305 +*----------------------------------------------------------------------------*/
  306 +float128 float128_round_to_int( float128 STATUS_PARAM );
  307 +float128 float128_add( float128, float128 STATUS_PARAM );
  308 +float128 float128_sub( float128, float128 STATUS_PARAM );
  309 +float128 float128_mul( float128, float128 STATUS_PARAM );
  310 +float128 float128_div( float128, float128 STATUS_PARAM );
  311 +float128 float128_rem( float128, float128 STATUS_PARAM );
  312 +float128 float128_sqrt( float128 STATUS_PARAM );
  313 +char float128_eq( float128, float128 STATUS_PARAM );
  314 +char float128_le( float128, float128 STATUS_PARAM );
  315 +char float128_lt( float128, float128 STATUS_PARAM );
  316 +char float128_eq_signaling( float128, float128 STATUS_PARAM );
  317 +char float128_le_quiet( float128, float128 STATUS_PARAM );
  318 +char float128_lt_quiet( float128, float128 STATUS_PARAM );
  319 +char float128_is_signaling_nan( float128 );
  320 +
  321 +#endif
  322 +
  323 +#else /* CONFIG_SOFTFLOAT */
  324 +
  325 +#include "softfloat-native.h"
  326 +
  327 +#endif /* !CONFIG_SOFTFLOAT */
  328 +
  329 +#endif /* !SOFTFLOAT_H */
... ...