Source file src/math/big/float.go
1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file implements multi-precision floating-point numbers. 6 // Like in the GNU MPFR library (https://www.mpfr.org/), operands 7 // can be of mixed precision. Unlike MPFR, the rounding mode is 8 // not specified with each operation, but with each operand. The 9 // rounding mode of the result operand determines the rounding 10 // mode of an operation. This is a from-scratch implementation. 11 12 package big 13 14 import ( 15 "fmt" 16 "math" 17 "math/bits" 18 ) 19 20 const debugFloat = false // enable for debugging 21 22 // A nonzero finite Float represents a multi-precision floating point number 23 // 24 // sign × mantissa × 2**exponent 25 // 26 // with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp. 27 // A Float may also be zero (+0, -0) or infinite (+Inf, -Inf). 28 // All Floats are ordered, and the ordering of two Floats x and y 29 // is defined by x.Cmp(y). 30 // 31 // Each Float value also has a precision, rounding mode, and accuracy. 32 // The precision is the maximum number of mantissa bits available to 33 // represent the value. The rounding mode specifies how a result should 34 // be rounded to fit into the mantissa bits, and accuracy describes the 35 // rounding error with respect to the exact result. 36 // 37 // Unless specified otherwise, all operations (including setters) that 38 // specify a *Float variable for the result (usually via the receiver 39 // with the exception of [Float.MantExp]), round the numeric result according 40 // to the precision and rounding mode of the result variable. 41 // 42 // If the provided result precision is 0 (see below), it is set to the 43 // precision of the argument with the largest precision value before any 44 // rounding takes place, and the rounding mode remains unchanged. Thus, 45 // uninitialized Floats provided as result arguments will have their 46 // precision set to a reasonable value determined by the operands, and 47 // their mode is the zero value for RoundingMode (ToNearestEven). 48 // 49 // By setting the desired precision to 24 or 53 and using matching rounding 50 // mode (typically [ToNearestEven]), Float operations produce the same results 51 // as the corresponding float32 or float64 IEEE 754 arithmetic for operands 52 // that correspond to normal (i.e., not denormal) float32 or float64 numbers. 53 // Exponent underflow and overflow lead to a 0 or an Infinity for different 54 // values than IEEE 754 because Float exponents have a much larger range. 55 // 56 // The zero (uninitialized) value for a Float is ready to use and represents 57 // the number +0.0 exactly, with precision 0 and rounding mode [ToNearestEven]. 58 // 59 // Operations always take pointer arguments (*Float) rather 60 // than Float values, and each unique Float value requires 61 // its own unique *Float pointer. To "copy" a Float value, 62 // an existing (or newly allocated) Float must be set to 63 // a new value using the [Float.Set] method; shallow copies 64 // of Floats are not supported and may lead to errors. 65 type Float struct { 66 prec uint32 67 mode RoundingMode 68 acc Accuracy 69 form form 70 neg bool 71 mant nat 72 exp int32 73 } 74 75 // An ErrNaN panic is raised by a [Float] operation that would lead to 76 // a NaN under IEEE 754 rules. An ErrNaN implements the error interface. 77 type ErrNaN struct { 78 msg string 79 } 80 81 func (err ErrNaN) Error() string { 82 return err.msg 83 } 84 85 // NewFloat allocates and returns a new [Float] set to x, 86 // with precision 53 and rounding mode [ToNearestEven]. 87 // NewFloat panics with [ErrNaN] if x is a NaN. 88 func NewFloat(x float64) *Float { 89 if math.IsNaN(x) { 90 panic(ErrNaN{"NewFloat(NaN)"}) 91 } 92 return new(Float).SetFloat64(x) 93 } 94 95 // Exponent and precision limits. 96 const ( 97 MaxExp = math.MaxInt32 // largest supported exponent 98 MinExp = math.MinInt32 // smallest supported exponent 99 MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited 100 ) 101 102 // Internal representation: The mantissa bits x.mant of a nonzero finite 103 // Float x are stored in a nat slice long enough to hold up to x.prec bits; 104 // the slice may (but doesn't have to) be shorter if the mantissa contains 105 // trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e., 106 // the msb is shifted all the way "to the left"). Thus, if the mantissa has 107 // trailing 0 bits or x.prec is not a multiple of the Word size _W, 108 // x.mant[0] has trailing zero bits. The msb of the mantissa corresponds 109 // to the value 0.5; the exponent x.exp shifts the binary point as needed. 110 // 111 // A zero or non-finite Float x ignores x.mant and x.exp. 112 // 113 // x form neg mant exp 114 // ---------------------------------------------------------- 115 // ±0 zero sign - - 116 // 0 < |x| < +Inf finite sign mantissa exponent 117 // ±Inf inf sign - - 118 119 // A form value describes the internal representation. 120 type form byte 121 122 // The form value order is relevant - do not change! 123 const ( 124 zero form = iota 125 finite 126 inf 127 ) 128 129 // RoundingMode determines how a [Float] value is rounded to the 130 // desired precision. Rounding may change the [Float] value; the 131 // rounding error is described by the [Float]'s [Accuracy]. 132 type RoundingMode byte 133 134 // These constants define supported rounding modes. 135 const ( 136 ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven 137 ToNearestAway // == IEEE 754-2008 roundTiesToAway 138 ToZero // == IEEE 754-2008 roundTowardZero 139 AwayFromZero // no IEEE 754-2008 equivalent 140 ToNegativeInf // == IEEE 754-2008 roundTowardNegative 141 ToPositiveInf // == IEEE 754-2008 roundTowardPositive 142 ) 143 144 //go:generate stringer -type=RoundingMode 145 146 // Accuracy describes the rounding error produced by the most recent 147 // operation that generated a [Float] value, relative to the exact value. 148 type Accuracy int8 149 150 // Constants describing the [Accuracy] of a [Float]. 151 const ( 152 Below Accuracy = -1 153 Exact Accuracy = 0 154 Above Accuracy = +1 155 ) 156 157 //go:generate stringer -type=Accuracy 158 159 // SetPrec sets z's precision to prec and returns the (possibly) rounded 160 // value of z. Rounding occurs according to z's rounding mode if the mantissa 161 // cannot be represented in prec bits without loss of precision. 162 // SetPrec(0) maps all finite values to ±0; infinite values remain unchanged. 163 // If prec > [MaxPrec], it is set to [MaxPrec]. 164 func (z *Float) SetPrec(prec uint) *Float { 165 z.acc = Exact // optimistically assume no rounding is needed 166 167 // special case 168 if prec == 0 { 169 z.prec = 0 170 if z.form == finite { 171 // truncate z to 0 172 z.acc = makeAcc(z.neg) 173 z.form = zero 174 } 175 return z 176 } 177 178 // general case 179 if prec > MaxPrec { 180 prec = MaxPrec 181 } 182 old := z.prec 183 z.prec = uint32(prec) 184 if z.prec < old { 185 z.round(0) 186 } 187 return z 188 } 189 190 func makeAcc(above bool) Accuracy { 191 if above { 192 return Above 193 } 194 return Below 195 } 196 197 // SetMode sets z's rounding mode to mode and returns an exact z. 198 // z remains unchanged otherwise. 199 // z.SetMode(z.Mode()) is a cheap way to set z's accuracy to [Exact]. 200 func (z *Float) SetMode(mode RoundingMode) *Float { 201 z.mode = mode 202 z.acc = Exact 203 return z 204 } 205 206 // Prec returns the mantissa precision of x in bits. 207 // The result may be 0 for |x| == 0 and |x| == Inf. 208 func (x *Float) Prec() uint { 209 return uint(x.prec) 210 } 211 212 // MinPrec returns the minimum precision required to represent x exactly 213 // (i.e., the smallest prec before x.SetPrec(prec) would start rounding x). 214 // The result is 0 for |x| == 0 and |x| == Inf. 215 func (x *Float) MinPrec() uint { 216 if x.form != finite { 217 return 0 218 } 219 return uint(len(x.mant))*_W - x.mant.trailingZeroBits() 220 } 221 222 // Mode returns the rounding mode of x. 223 func (x *Float) Mode() RoundingMode { 224 return x.mode 225 } 226 227 // Acc returns the accuracy of x produced by the most recent 228 // operation, unless explicitly documented otherwise by that 229 // operation. 230 func (x *Float) Acc() Accuracy { 231 return x.acc 232 } 233 234 // Sign returns: 235 // - -1 if x < 0; 236 // - 0 if x is ±0; 237 // - +1 if x > 0. 238 func (x *Float) Sign() int { 239 if debugFloat { 240 x.validate() 241 } 242 if x.form == zero { 243 return 0 244 } 245 if x.neg { 246 return -1 247 } 248 return 1 249 } 250 251 // MantExp breaks x into its mantissa and exponent components 252 // and returns the exponent. If a non-nil mant argument is 253 // provided its value is set to the mantissa of x, with the 254 // same precision and rounding mode as x. The components 255 // satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0. 256 // Calling MantExp with a nil argument is an efficient way to 257 // get the exponent of the receiver. 258 // 259 // Special cases are: 260 // 261 // ( ±0).MantExp(mant) = 0, with mant set to ±0 262 // (±Inf).MantExp(mant) = 0, with mant set to ±Inf 263 // 264 // x and mant may be the same in which case x is set to its 265 // mantissa value. 266 func (x *Float) MantExp(mant *Float) (exp int) { 267 if debugFloat { 268 x.validate() 269 } 270 if x.form == finite { 271 exp = int(x.exp) 272 } 273 if mant != nil { 274 mant.Copy(x) 275 if mant.form == finite { 276 mant.exp = 0 277 } 278 } 279 return 280 } 281 282 func (z *Float) setExpAndRound(exp int64, sbit uint) { 283 if exp < MinExp { 284 // underflow 285 z.acc = makeAcc(z.neg) 286 z.form = zero 287 return 288 } 289 290 if exp > MaxExp { 291 // overflow 292 z.acc = makeAcc(!z.neg) 293 z.form = inf 294 return 295 } 296 297 z.form = finite 298 z.exp = int32(exp) 299 z.round(sbit) 300 } 301 302 // SetMantExp sets z to mant × 2**exp and returns z. 303 // The result z has the same precision and rounding mode 304 // as mant. SetMantExp is an inverse of [Float.MantExp] but does 305 // not require 0.5 <= |mant| < 1.0. Specifically, for a 306 // given x of type *[Float], SetMantExp relates to [Float.MantExp] 307 // as follows: 308 // 309 // mant := new(Float) 310 // new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0 311 // 312 // Special cases are: 313 // 314 // z.SetMantExp( ±0, exp) = ±0 315 // z.SetMantExp(±Inf, exp) = ±Inf 316 // 317 // z and mant may be the same in which case z's exponent 318 // is set to exp. 319 func (z *Float) SetMantExp(mant *Float, exp int) *Float { 320 if debugFloat { 321 z.validate() 322 mant.validate() 323 } 324 z.Copy(mant) 325 326 if z.form == finite { 327 // 0 < |mant| < +Inf 328 z.setExpAndRound(int64(z.exp)+int64(exp), 0) 329 } 330 return z 331 } 332 333 // Signbit reports whether x is negative or negative zero. 334 func (x *Float) Signbit() bool { 335 return x.neg 336 } 337 338 // IsInf reports whether x is +Inf or -Inf. 339 func (x *Float) IsInf() bool { 340 return x.form == inf 341 } 342 343 // IsInt reports whether x is an integer. 344 // ±Inf values are not integers. 345 func (x *Float) IsInt() bool { 346 if debugFloat { 347 x.validate() 348 } 349 // special cases 350 if x.form != finite { 351 return x.form == zero 352 } 353 // x.form == finite 354 if x.exp <= 0 { 355 return false 356 } 357 // x.exp > 0 358 return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa 359 } 360 361 // debugging support 362 func (x *Float) validate() { 363 if !debugFloat { 364 // avoid performance bugs 365 panic("validate called but debugFloat is not set") 366 } 367 if msg := x.validate0(); msg != "" { 368 panic(msg) 369 } 370 } 371 372 func (x *Float) validate0() string { 373 if x.form != finite { 374 return "" 375 } 376 m := len(x.mant) 377 if m == 0 { 378 return "nonzero finite number with empty mantissa" 379 } 380 const msb = 1 << (_W - 1) 381 if x.mant[m-1]&msb == 0 { 382 return fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0)) 383 } 384 if x.prec == 0 { 385 return "zero precision finite number" 386 } 387 return "" 388 } 389 390 // round rounds z according to z.mode to z.prec bits and sets z.acc accordingly. 391 // sbit must be 0 or 1 and summarizes any "sticky bit" information one might 392 // have before calling round. z's mantissa must be normalized (with the msb set) 393 // or empty. 394 // 395 // CAUTION: The rounding modes [ToNegativeInf], [ToPositiveInf] are affected by the 396 // sign of z. For correct rounding, the sign of z must be set correctly before 397 // calling round. 398 func (z *Float) round(sbit uint) { 399 if debugFloat { 400 z.validate() 401 } 402 403 z.acc = Exact 404 if z.form != finite { 405 // ±0 or ±Inf => nothing left to do 406 return 407 } 408 // z.form == finite && len(z.mant) > 0 409 // m > 0 implies z.prec > 0 (checked by validate) 410 411 m := uint32(len(z.mant)) // present mantissa length in words 412 bits := m * _W // present mantissa bits; bits > 0 413 if bits <= z.prec { 414 // mantissa fits => nothing to do 415 return 416 } 417 // bits > z.prec 418 419 // Rounding is based on two bits: the rounding bit (rbit) and the 420 // sticky bit (sbit). The rbit is the bit immediately before the 421 // z.prec leading mantissa bits (the "0.5"). The sbit is set if any 422 // of the bits before the rbit are set (the "0.25", "0.125", etc.): 423 // 424 // rbit sbit => "fractional part" 425 // 426 // 0 0 == 0 427 // 0 1 > 0 , < 0.5 428 // 1 0 == 0.5 429 // 1 1 > 0.5, < 1.0 430 431 // bits > z.prec: mantissa too large => round 432 r := uint(bits - z.prec - 1) // rounding bit position; r >= 0 433 rbit := z.mant.bit(r) & 1 // rounding bit; be safe and ensure it's a single bit 434 // The sticky bit is only needed for rounding ToNearestEven 435 // or when the rounding bit is zero. Avoid computation otherwise. 436 if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) { 437 sbit = z.mant.sticky(r) 438 } 439 sbit &= 1 // be safe and ensure it's a single bit 440 441 // cut off extra words 442 n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision 443 if m > n { 444 copy(z.mant, z.mant[m-n:]) // move n last words to front 445 z.mant = z.mant[:n] 446 } 447 448 // determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word 449 ntz := n*_W - z.prec // 0 <= ntz < _W 450 lsb := Word(1) << ntz 451 452 // round if result is inexact 453 if rbit|sbit != 0 { 454 // Make rounding decision: The result mantissa is truncated ("rounded down") 455 // by default. Decide if we need to increment, or "round up", the (unsigned) 456 // mantissa. 457 inc := false 458 switch z.mode { 459 case ToNegativeInf: 460 inc = z.neg 461 case ToZero: 462 // nothing to do 463 case ToNearestEven: 464 inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0) 465 case ToNearestAway: 466 inc = rbit != 0 467 case AwayFromZero: 468 inc = true 469 case ToPositiveInf: 470 inc = !z.neg 471 default: 472 panic("unreachable") 473 } 474 475 // A positive result (!z.neg) is Above the exact result if we increment, 476 // and it's Below if we truncate (Exact results require no rounding). 477 // For a negative result (z.neg) it is exactly the opposite. 478 z.acc = makeAcc(inc != z.neg) 479 480 if inc { 481 // add 1 to mantissa 482 if addVW(z.mant, z.mant, lsb) != 0 { 483 // mantissa overflow => adjust exponent 484 if z.exp >= MaxExp { 485 // exponent overflow 486 z.form = inf 487 return 488 } 489 z.exp++ 490 // adjust mantissa: divide by 2 to compensate for exponent adjustment 491 shrVU(z.mant, z.mant, 1) 492 // set msb == carry == 1 from the mantissa overflow above 493 const msb = 1 << (_W - 1) 494 z.mant[n-1] |= msb 495 } 496 } 497 } 498 499 // zero out trailing bits in least-significant word 500 z.mant[0] &^= lsb - 1 501 502 if debugFloat { 503 z.validate() 504 } 505 } 506 507 func (z *Float) setBits64(neg bool, x uint64) *Float { 508 if z.prec == 0 { 509 z.prec = 64 510 } 511 z.acc = Exact 512 z.neg = neg 513 if x == 0 { 514 z.form = zero 515 return z 516 } 517 // x != 0 518 z.form = finite 519 s := bits.LeadingZeros64(x) 520 z.mant = z.mant.setUint64(x << uint(s)) 521 z.exp = int32(64 - s) // always fits 522 if z.prec < 64 { 523 z.round(0) 524 } 525 return z 526 } 527 528 // SetUint64 sets z to the (possibly rounded) value of x and returns z. 529 // If z's precision is 0, it is changed to 64 (and rounding will have 530 // no effect). 531 func (z *Float) SetUint64(x uint64) *Float { 532 return z.setBits64(false, x) 533 } 534 535 // SetInt64 sets z to the (possibly rounded) value of x and returns z. 536 // If z's precision is 0, it is changed to 64 (and rounding will have 537 // no effect). 538 func (z *Float) SetInt64(x int64) *Float { 539 u := x 540 if u < 0 { 541 u = -u 542 } 543 // We cannot simply call z.SetUint64(uint64(u)) and change 544 // the sign afterwards because the sign affects rounding. 545 return z.setBits64(x < 0, uint64(u)) 546 } 547 548 // SetFloat64 sets z to the (possibly rounded) value of x and returns z. 549 // If z's precision is 0, it is changed to 53 (and rounding will have 550 // no effect). SetFloat64 panics with [ErrNaN] if x is a NaN. 551 func (z *Float) SetFloat64(x float64) *Float { 552 if z.prec == 0 { 553 z.prec = 53 554 } 555 if math.IsNaN(x) { 556 panic(ErrNaN{"Float.SetFloat64(NaN)"}) 557 } 558 z.acc = Exact 559 z.neg = math.Signbit(x) // handle -0, -Inf correctly 560 if x == 0 { 561 z.form = zero 562 return z 563 } 564 if math.IsInf(x, 0) { 565 z.form = inf 566 return z 567 } 568 // normalized x != 0 569 z.form = finite 570 fmant, exp := math.Frexp(x) // get normalized mantissa 571 z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11) 572 z.exp = int32(exp) // always fits 573 if z.prec < 53 { 574 z.round(0) 575 } 576 return z 577 } 578 579 // fnorm normalizes mantissa m by shifting it to the left 580 // such that the msb of the most-significant word (msw) is 1. 581 // It returns the shift amount. It assumes that len(m) != 0. 582 func fnorm(m nat) int64 { 583 if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) { 584 panic("msw of mantissa is 0") 585 } 586 s := nlz(m[len(m)-1]) 587 if s > 0 { 588 c := shlVU(m, m, s) 589 if debugFloat && c != 0 { 590 panic("nlz or shlVU incorrect") 591 } 592 } 593 return int64(s) 594 } 595 596 // SetInt sets z to the (possibly rounded) value of x and returns z. 597 // If z's precision is 0, it is changed to the larger of x.BitLen() 598 // or 64 (and rounding will have no effect). 599 func (z *Float) SetInt(x *Int) *Float { 600 // TODO(gri) can be more efficient if z.prec > 0 601 // but small compared to the size of x, or if there 602 // are many trailing 0's. 603 bits := uint32(x.BitLen()) 604 if z.prec == 0 { 605 z.prec = umax32(bits, 64) 606 } 607 z.acc = Exact 608 z.neg = x.neg 609 if len(x.abs) == 0 { 610 z.form = zero 611 return z 612 } 613 // x != 0 614 z.mant = z.mant.set(x.abs) 615 fnorm(z.mant) 616 z.setExpAndRound(int64(bits), 0) 617 return z 618 } 619 620 // SetRat sets z to the (possibly rounded) value of x and returns z. 621 // If z's precision is 0, it is changed to the largest of a.BitLen(), 622 // b.BitLen(), or 64; with x = a/b. 623 func (z *Float) SetRat(x *Rat) *Float { 624 if x.IsInt() { 625 return z.SetInt(x.Num()) 626 } 627 var a, b Float 628 a.SetInt(x.Num()) 629 b.SetInt(x.Denom()) 630 if z.prec == 0 { 631 z.prec = umax32(a.prec, b.prec) 632 } 633 return z.Quo(&a, &b) 634 } 635 636 // SetInf sets z to the infinite Float -Inf if signbit is 637 // set, or +Inf if signbit is not set, and returns z. The 638 // precision of z is unchanged and the result is always 639 // [Exact]. 640 func (z *Float) SetInf(signbit bool) *Float { 641 z.acc = Exact 642 z.form = inf 643 z.neg = signbit 644 return z 645 } 646 647 // Set sets z to the (possibly rounded) value of x and returns z. 648 // If z's precision is 0, it is changed to the precision of x 649 // before setting z (and rounding will have no effect). 650 // Rounding is performed according to z's precision and rounding 651 // mode; and z's accuracy reports the result error relative to the 652 // exact (not rounded) result. 653 func (z *Float) Set(x *Float) *Float { 654 if debugFloat { 655 x.validate() 656 } 657 z.acc = Exact 658 if z != x { 659 z.form = x.form 660 z.neg = x.neg 661 if x.form == finite { 662 z.exp = x.exp 663 z.mant = z.mant.set(x.mant) 664 } 665 if z.prec == 0 { 666 z.prec = x.prec 667 } else if z.prec < x.prec { 668 z.round(0) 669 } 670 } 671 return z 672 } 673 674 // Copy sets z to x, with the same precision, rounding mode, and accuracy as x. 675 // Copy returns z. If x and z are identical, Copy is a no-op. 676 func (z *Float) Copy(x *Float) *Float { 677 if debugFloat { 678 x.validate() 679 } 680 if z != x { 681 z.prec = x.prec 682 z.mode = x.mode 683 z.acc = x.acc 684 z.form = x.form 685 z.neg = x.neg 686 if z.form == finite { 687 z.mant = z.mant.set(x.mant) 688 z.exp = x.exp 689 } 690 } 691 return z 692 } 693 694 // msb32 returns the 32 most significant bits of x. 695 func msb32(x nat) uint32 { 696 i := len(x) - 1 697 if i < 0 { 698 return 0 699 } 700 if debugFloat && x[i]&(1<<(_W-1)) == 0 { 701 panic("x not normalized") 702 } 703 switch _W { 704 case 32: 705 return uint32(x[i]) 706 case 64: 707 return uint32(x[i] >> 32) 708 } 709 panic("unreachable") 710 } 711 712 // msb64 returns the 64 most significant bits of x. 713 func msb64(x nat) uint64 { 714 i := len(x) - 1 715 if i < 0 { 716 return 0 717 } 718 if debugFloat && x[i]&(1<<(_W-1)) == 0 { 719 panic("x not normalized") 720 } 721 switch _W { 722 case 32: 723 v := uint64(x[i]) << 32 724 if i > 0 { 725 v |= uint64(x[i-1]) 726 } 727 return v 728 case 64: 729 return uint64(x[i]) 730 } 731 panic("unreachable") 732 } 733 734 // Uint64 returns the unsigned integer resulting from truncating x 735 // towards zero. If 0 <= x <= [math.MaxUint64], the result is [Exact] 736 // if x is an integer and [Below] otherwise. 737 // The result is (0, [Above]) for x < 0, and ([math.MaxUint64], [Below]) 738 // for x > [math.MaxUint64]. 739 func (x *Float) Uint64() (uint64, Accuracy) { 740 if debugFloat { 741 x.validate() 742 } 743 744 switch x.form { 745 case finite: 746 if x.neg { 747 return 0, Above 748 } 749 // 0 < x < +Inf 750 if x.exp <= 0 { 751 // 0 < x < 1 752 return 0, Below 753 } 754 // 1 <= x < Inf 755 if x.exp <= 64 { 756 // u = trunc(x) fits into a uint64 757 u := msb64(x.mant) >> (64 - uint32(x.exp)) 758 if x.MinPrec() <= 64 { 759 return u, Exact 760 } 761 return u, Below // x truncated 762 } 763 // x too large 764 return math.MaxUint64, Below 765 766 case zero: 767 return 0, Exact 768 769 case inf: 770 if x.neg { 771 return 0, Above 772 } 773 return math.MaxUint64, Below 774 } 775 776 panic("unreachable") 777 } 778 779 // Int64 returns the integer resulting from truncating x towards zero. 780 // If [math.MinInt64] <= x <= [math.MaxInt64], the result is [Exact] if x is 781 // an integer, and [Above] (x < 0) or [Below] (x > 0) otherwise. 782 // The result is ([math.MinInt64], [Above]) for x < [math.MinInt64], 783 // and ([math.MaxInt64], [Below]) for x > [math.MaxInt64]. 784 func (x *Float) Int64() (int64, Accuracy) { 785 if debugFloat { 786 x.validate() 787 } 788 789 switch x.form { 790 case finite: 791 // 0 < |x| < +Inf 792 acc := makeAcc(x.neg) 793 if x.exp <= 0 { 794 // 0 < |x| < 1 795 return 0, acc 796 } 797 // x.exp > 0 798 799 // 1 <= |x| < +Inf 800 if x.exp <= 63 { 801 // i = trunc(x) fits into an int64 (excluding math.MinInt64) 802 i := int64(msb64(x.mant) >> (64 - uint32(x.exp))) 803 if x.neg { 804 i = -i 805 } 806 if x.MinPrec() <= uint(x.exp) { 807 return i, Exact 808 } 809 return i, acc // x truncated 810 } 811 if x.neg { 812 // check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64)) 813 if x.exp == 64 && x.MinPrec() == 1 { 814 acc = Exact 815 } 816 return math.MinInt64, acc 817 } 818 // x too large 819 return math.MaxInt64, Below 820 821 case zero: 822 return 0, Exact 823 824 case inf: 825 if x.neg { 826 return math.MinInt64, Above 827 } 828 return math.MaxInt64, Below 829 } 830 831 panic("unreachable") 832 } 833 834 // Float32 returns the float32 value nearest to x. If x is too small to be 835 // represented by a float32 (|x| < [math.SmallestNonzeroFloat32]), the result 836 // is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x. 837 // If x is too large to be represented by a float32 (|x| > [math.MaxFloat32]), 838 // the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x. 839 func (x *Float) Float32() (float32, Accuracy) { 840 if debugFloat { 841 x.validate() 842 } 843 844 switch x.form { 845 case finite: 846 // 0 < |x| < +Inf 847 848 const ( 849 fbits = 32 // float size 850 mbits = 23 // mantissa size (excluding implicit msb) 851 ebits = fbits - mbits - 1 // 8 exponent size 852 bias = 1<<(ebits-1) - 1 // 127 exponent bias 853 dmin = 1 - bias - mbits // -149 smallest unbiased exponent (denormal) 854 emin = 1 - bias // -126 smallest unbiased exponent (normal) 855 emax = bias // 127 largest unbiased exponent (normal) 856 ) 857 858 // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa. 859 e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 860 861 // Compute precision p for float32 mantissa. 862 // If the exponent is too small, we have a denormal number before 863 // rounding and fewer than p mantissa bits of precision available 864 // (the exponent remains fixed but the mantissa gets shifted right). 865 p := mbits + 1 // precision of normal float 866 if e < emin { 867 // recompute precision 868 p = mbits + 1 - emin + int(e) 869 // If p == 0, the mantissa of x is shifted so much to the right 870 // that its msb falls immediately to the right of the float32 871 // mantissa space. In other words, if the smallest denormal is 872 // considered "1.0", for p == 0, the mantissa value m is >= 0.5. 873 // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. 874 // If m == 0.5, it is rounded down to even, i.e., 0.0. 875 // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. 876 if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { 877 // underflow to ±0 878 if x.neg { 879 var z float32 880 return -z, Above 881 } 882 return 0.0, Below 883 } 884 // otherwise, round up 885 // We handle p == 0 explicitly because it's easy and because 886 // Float.round doesn't support rounding to 0 bits of precision. 887 if p == 0 { 888 if x.neg { 889 return -math.SmallestNonzeroFloat32, Below 890 } 891 return math.SmallestNonzeroFloat32, Above 892 } 893 } 894 // p > 0 895 896 // round 897 var r Float 898 r.prec = uint32(p) 899 r.Set(x) 900 e = r.exp - 1 901 902 // Rounding may have caused r to overflow to ±Inf 903 // (rounding never causes underflows to 0). 904 // If the exponent is too large, also overflow to ±Inf. 905 if r.form == inf || e > emax { 906 // overflow 907 if x.neg { 908 return float32(math.Inf(-1)), Below 909 } 910 return float32(math.Inf(+1)), Above 911 } 912 // e <= emax 913 914 // Determine sign, biased exponent, and mantissa. 915 var sign, bexp, mant uint32 916 if x.neg { 917 sign = 1 << (fbits - 1) 918 } 919 920 // Rounding may have caused a denormal number to 921 // become normal. Check again. 922 if e < emin { 923 // denormal number: recompute precision 924 // Since rounding may have at best increased precision 925 // and we have eliminated p <= 0 early, we know p > 0. 926 // bexp == 0 for denormals 927 p = mbits + 1 - emin + int(e) 928 mant = msb32(r.mant) >> uint(fbits-p) 929 } else { 930 // normal number: emin <= e <= emax 931 bexp = uint32(e+bias) << mbits 932 mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit) 933 } 934 935 return math.Float32frombits(sign | bexp | mant), r.acc 936 937 case zero: 938 if x.neg { 939 var z float32 940 return -z, Exact 941 } 942 return 0.0, Exact 943 944 case inf: 945 if x.neg { 946 return float32(math.Inf(-1)), Exact 947 } 948 return float32(math.Inf(+1)), Exact 949 } 950 951 panic("unreachable") 952 } 953 954 // Float64 returns the float64 value nearest to x. If x is too small to be 955 // represented by a float64 (|x| < [math.SmallestNonzeroFloat64]), the result 956 // is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x. 957 // If x is too large to be represented by a float64 (|x| > [math.MaxFloat64]), 958 // the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x. 959 func (x *Float) Float64() (float64, Accuracy) { 960 if debugFloat { 961 x.validate() 962 } 963 964 switch x.form { 965 case finite: 966 // 0 < |x| < +Inf 967 968 const ( 969 fbits = 64 // float size 970 mbits = 52 // mantissa size (excluding implicit msb) 971 ebits = fbits - mbits - 1 // 11 exponent size 972 bias = 1<<(ebits-1) - 1 // 1023 exponent bias 973 dmin = 1 - bias - mbits // -1074 smallest unbiased exponent (denormal) 974 emin = 1 - bias // -1022 smallest unbiased exponent (normal) 975 emax = bias // 1023 largest unbiased exponent (normal) 976 ) 977 978 // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa. 979 e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 980 981 // Compute precision p for float64 mantissa. 982 // If the exponent is too small, we have a denormal number before 983 // rounding and fewer than p mantissa bits of precision available 984 // (the exponent remains fixed but the mantissa gets shifted right). 985 p := mbits + 1 // precision of normal float 986 if e < emin { 987 // recompute precision 988 p = mbits + 1 - emin + int(e) 989 // If p == 0, the mantissa of x is shifted so much to the right 990 // that its msb falls immediately to the right of the float64 991 // mantissa space. In other words, if the smallest denormal is 992 // considered "1.0", for p == 0, the mantissa value m is >= 0.5. 993 // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. 994 // If m == 0.5, it is rounded down to even, i.e., 0.0. 995 // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. 996 if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { 997 // underflow to ±0 998 if x.neg { 999 var z float64 1000 return -z, Above 1001 } 1002 return 0.0, Below 1003 } 1004 // otherwise, round up 1005 // We handle p == 0 explicitly because it's easy and because 1006 // Float.round doesn't support rounding to 0 bits of precision. 1007 if p == 0 { 1008 if x.neg { 1009 return -math.SmallestNonzeroFloat64, Below 1010 } 1011 return math.SmallestNonzeroFloat64, Above 1012 } 1013 } 1014 // p > 0 1015 1016 // round 1017 var r Float 1018 r.prec = uint32(p) 1019 r.Set(x) 1020 e = r.exp - 1 1021 1022 // Rounding may have caused r to overflow to ±Inf 1023 // (rounding never causes underflows to 0). 1024 // If the exponent is too large, also overflow to ±Inf. 1025 if r.form == inf || e > emax { 1026 // overflow 1027 if x.neg { 1028 return math.Inf(-1), Below 1029 } 1030 return math.Inf(+1), Above 1031 } 1032 // e <= emax 1033 1034 // Determine sign, biased exponent, and mantissa. 1035 var sign, bexp, mant uint64 1036 if x.neg { 1037 sign = 1 << (fbits - 1) 1038 } 1039 1040 // Rounding may have caused a denormal number to 1041 // become normal. Check again. 1042 if e < emin { 1043 // denormal number: recompute precision 1044 // Since rounding may have at best increased precision 1045 // and we have eliminated p <= 0 early, we know p > 0. 1046 // bexp == 0 for denormals 1047 p = mbits + 1 - emin + int(e) 1048 mant = msb64(r.mant) >> uint(fbits-p) 1049 } else { 1050 // normal number: emin <= e <= emax 1051 bexp = uint64(e+bias) << mbits 1052 mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit) 1053 } 1054 1055 return math.Float64frombits(sign | bexp | mant), r.acc 1056 1057 case zero: 1058 if x.neg { 1059 var z float64 1060 return -z, Exact 1061 } 1062 return 0.0, Exact 1063 1064 case inf: 1065 if x.neg { 1066 return math.Inf(-1), Exact 1067 } 1068 return math.Inf(+1), Exact 1069 } 1070 1071 panic("unreachable") 1072 } 1073 1074 // Int returns the result of truncating x towards zero; 1075 // or nil if x is an infinity. 1076 // The result is [Exact] if x.IsInt(); otherwise it is [Below] 1077 // for x > 0, and [Above] for x < 0. 1078 // If a non-nil *[Int] argument z is provided, [Int] stores 1079 // the result in z instead of allocating a new [Int]. 1080 func (x *Float) Int(z *Int) (*Int, Accuracy) { 1081 if debugFloat { 1082 x.validate() 1083 } 1084 1085 if z == nil && x.form <= finite { 1086 z = new(Int) 1087 } 1088 1089 switch x.form { 1090 case finite: 1091 // 0 < |x| < +Inf 1092 acc := makeAcc(x.neg) 1093 if x.exp <= 0 { 1094 // 0 < |x| < 1 1095 return z.SetInt64(0), acc 1096 } 1097 // x.exp > 0 1098 1099 // 1 <= |x| < +Inf 1100 // determine minimum required precision for x 1101 allBits := uint(len(x.mant)) * _W 1102 exp := uint(x.exp) 1103 if x.MinPrec() <= exp { 1104 acc = Exact 1105 } 1106 // shift mantissa as needed 1107 if z == nil { 1108 z = new(Int) 1109 } 1110 z.neg = x.neg 1111 switch { 1112 case exp > allBits: 1113 z.abs = z.abs.shl(x.mant, exp-allBits) 1114 default: 1115 z.abs = z.abs.set(x.mant) 1116 case exp < allBits: 1117 z.abs = z.abs.shr(x.mant, allBits-exp) 1118 } 1119 return z, acc 1120 1121 case zero: 1122 return z.SetInt64(0), Exact 1123 1124 case inf: 1125 return nil, makeAcc(x.neg) 1126 } 1127 1128 panic("unreachable") 1129 } 1130 1131 // Rat returns the rational number corresponding to x; 1132 // or nil if x is an infinity. 1133 // The result is [Exact] if x is not an Inf. 1134 // If a non-nil *[Rat] argument z is provided, [Rat] stores 1135 // the result in z instead of allocating a new [Rat]. 1136 func (x *Float) Rat(z *Rat) (*Rat, Accuracy) { 1137 if debugFloat { 1138 x.validate() 1139 } 1140 1141 if z == nil && x.form <= finite { 1142 z = new(Rat) 1143 } 1144 1145 switch x.form { 1146 case finite: 1147 // 0 < |x| < +Inf 1148 allBits := int32(len(x.mant)) * _W 1149 // build up numerator and denominator 1150 z.a.neg = x.neg 1151 switch { 1152 case x.exp > allBits: 1153 z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits)) 1154 z.b.abs = z.b.abs[:0] // == 1 (see Rat) 1155 // z already in normal form 1156 default: 1157 z.a.abs = z.a.abs.set(x.mant) 1158 z.b.abs = z.b.abs[:0] // == 1 (see Rat) 1159 // z already in normal form 1160 case x.exp < allBits: 1161 z.a.abs = z.a.abs.set(x.mant) 1162 t := z.b.abs.setUint64(1) 1163 z.b.abs = t.shl(t, uint(allBits-x.exp)) 1164 z.norm() 1165 } 1166 return z, Exact 1167 1168 case zero: 1169 return z.SetInt64(0), Exact 1170 1171 case inf: 1172 return nil, makeAcc(x.neg) 1173 } 1174 1175 panic("unreachable") 1176 } 1177 1178 // Abs sets z to the (possibly rounded) value |x| (the absolute value of x) 1179 // and returns z. 1180 func (z *Float) Abs(x *Float) *Float { 1181 z.Set(x) 1182 z.neg = false 1183 return z 1184 } 1185 1186 // Neg sets z to the (possibly rounded) value of x with its sign negated, 1187 // and returns z. 1188 func (z *Float) Neg(x *Float) *Float { 1189 z.Set(x) 1190 z.neg = !z.neg 1191 return z 1192 } 1193 1194 func validateBinaryOperands(x, y *Float) { 1195 if !debugFloat { 1196 // avoid performance bugs 1197 panic("validateBinaryOperands called but debugFloat is not set") 1198 } 1199 if len(x.mant) == 0 { 1200 panic("empty mantissa for x") 1201 } 1202 if len(y.mant) == 0 { 1203 panic("empty mantissa for y") 1204 } 1205 } 1206 1207 // z = x + y, ignoring signs of x and y for the addition 1208 // but using the sign of z for rounding the result. 1209 // x and y must have a non-empty mantissa and valid exponent. 1210 func (z *Float) uadd(x, y *Float) { 1211 // Note: This implementation requires 2 shifts most of the 1212 // time. It is also inefficient if exponents or precisions 1213 // differ by wide margins. The following article describes 1214 // an efficient (but much more complicated) implementation 1215 // compatible with the internal representation used here: 1216 // 1217 // Vincent Lefèvre: "The Generic Multiple-Precision Floating- 1218 // Point Addition With Exact Rounding (as in the MPFR Library)" 1219 // http://www.vinc17.net/research/papers/rnc6.pdf 1220 1221 if debugFloat { 1222 validateBinaryOperands(x, y) 1223 } 1224 1225 // compute exponents ex, ey for mantissa with "binary point" 1226 // on the right (mantissa.0) - use int64 to avoid overflow 1227 ex := int64(x.exp) - int64(len(x.mant))*_W 1228 ey := int64(y.exp) - int64(len(y.mant))*_W 1229 1230 al := alias(z.mant, x.mant) || alias(z.mant, y.mant) 1231 1232 // TODO(gri) having a combined add-and-shift primitive 1233 // could make this code significantly faster 1234 switch { 1235 case ex < ey: 1236 if al { 1237 t := nat(nil).shl(y.mant, uint(ey-ex)) 1238 z.mant = z.mant.add(x.mant, t) 1239 } else { 1240 z.mant = z.mant.shl(y.mant, uint(ey-ex)) 1241 z.mant = z.mant.add(x.mant, z.mant) 1242 } 1243 default: 1244 // ex == ey, no shift needed 1245 z.mant = z.mant.add(x.mant, y.mant) 1246 case ex > ey: 1247 if al { 1248 t := nat(nil).shl(x.mant, uint(ex-ey)) 1249 z.mant = z.mant.add(t, y.mant) 1250 } else { 1251 z.mant = z.mant.shl(x.mant, uint(ex-ey)) 1252 z.mant = z.mant.add(z.mant, y.mant) 1253 } 1254 ex = ey 1255 } 1256 // len(z.mant) > 0 1257 1258 z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0) 1259 } 1260 1261 // z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction 1262 // but using the sign of z for rounding the result. 1263 // x and y must have a non-empty mantissa and valid exponent. 1264 func (z *Float) usub(x, y *Float) { 1265 // This code is symmetric to uadd. 1266 // We have not factored the common code out because 1267 // eventually uadd (and usub) should be optimized 1268 // by special-casing, and the code will diverge. 1269 1270 if debugFloat { 1271 validateBinaryOperands(x, y) 1272 } 1273 1274 ex := int64(x.exp) - int64(len(x.mant))*_W 1275 ey := int64(y.exp) - int64(len(y.mant))*_W 1276 1277 al := alias(z.mant, x.mant) || alias(z.mant, y.mant) 1278 1279 switch { 1280 case ex < ey: 1281 if al { 1282 t := nat(nil).shl(y.mant, uint(ey-ex)) 1283 z.mant = t.sub(x.mant, t) 1284 } else { 1285 z.mant = z.mant.shl(y.mant, uint(ey-ex)) 1286 z.mant = z.mant.sub(x.mant, z.mant) 1287 } 1288 default: 1289 // ex == ey, no shift needed 1290 z.mant = z.mant.sub(x.mant, y.mant) 1291 case ex > ey: 1292 if al { 1293 t := nat(nil).shl(x.mant, uint(ex-ey)) 1294 z.mant = t.sub(t, y.mant) 1295 } else { 1296 z.mant = z.mant.shl(x.mant, uint(ex-ey)) 1297 z.mant = z.mant.sub(z.mant, y.mant) 1298 } 1299 ex = ey 1300 } 1301 1302 // operands may have canceled each other out 1303 if len(z.mant) == 0 { 1304 z.acc = Exact 1305 z.form = zero 1306 z.neg = false 1307 return 1308 } 1309 // len(z.mant) > 0 1310 1311 z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0) 1312 } 1313 1314 // z = x * y, ignoring signs of x and y for the multiplication 1315 // but using the sign of z for rounding the result. 1316 // x and y must have a non-empty mantissa and valid exponent. 1317 func (z *Float) umul(x, y *Float) { 1318 if debugFloat { 1319 validateBinaryOperands(x, y) 1320 } 1321 1322 // Note: This is doing too much work if the precision 1323 // of z is less than the sum of the precisions of x 1324 // and y which is often the case (e.g., if all floats 1325 // have the same precision). 1326 // TODO(gri) Optimize this for the common case. 1327 1328 e := int64(x.exp) + int64(y.exp) 1329 if x == y { 1330 z.mant = z.mant.sqr(x.mant) 1331 } else { 1332 z.mant = z.mant.mul(x.mant, y.mant) 1333 } 1334 z.setExpAndRound(e-fnorm(z.mant), 0) 1335 } 1336 1337 // z = x / y, ignoring signs of x and y for the division 1338 // but using the sign of z for rounding the result. 1339 // x and y must have a non-empty mantissa and valid exponent. 1340 func (z *Float) uquo(x, y *Float) { 1341 if debugFloat { 1342 validateBinaryOperands(x, y) 1343 } 1344 1345 // mantissa length in words for desired result precision + 1 1346 // (at least one extra bit so we get the rounding bit after 1347 // the division) 1348 n := int(z.prec/_W) + 1 1349 1350 // compute adjusted x.mant such that we get enough result precision 1351 xadj := x.mant 1352 if d := n - len(x.mant) + len(y.mant); d > 0 { 1353 // d extra words needed => add d "0 digits" to x 1354 xadj = make(nat, len(x.mant)+d) 1355 copy(xadj[d:], x.mant) 1356 } 1357 // TODO(gri): If we have too many digits (d < 0), we should be able 1358 // to shorten x for faster division. But we must be extra careful 1359 // with rounding in that case. 1360 1361 // Compute d before division since there may be aliasing of x.mant 1362 // (via xadj) or y.mant with z.mant. 1363 d := len(xadj) - len(y.mant) 1364 1365 // divide 1366 var r nat 1367 z.mant, r = z.mant.div(nil, xadj, y.mant) 1368 e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W 1369 1370 // The result is long enough to include (at least) the rounding bit. 1371 // If there's a non-zero remainder, the corresponding fractional part 1372 // (if it were computed), would have a non-zero sticky bit (if it were 1373 // zero, it couldn't have a non-zero remainder). 1374 var sbit uint 1375 if len(r) > 0 { 1376 sbit = 1 1377 } 1378 1379 z.setExpAndRound(e-fnorm(z.mant), sbit) 1380 } 1381 1382 // ucmp returns -1, 0, or +1, depending on whether 1383 // |x| < |y|, |x| == |y|, or |x| > |y|. 1384 // x and y must have a non-empty mantissa and valid exponent. 1385 func (x *Float) ucmp(y *Float) int { 1386 if debugFloat { 1387 validateBinaryOperands(x, y) 1388 } 1389 1390 switch { 1391 case x.exp < y.exp: 1392 return -1 1393 case x.exp > y.exp: 1394 return +1 1395 } 1396 // x.exp == y.exp 1397 1398 // compare mantissas 1399 i := len(x.mant) 1400 j := len(y.mant) 1401 for i > 0 || j > 0 { 1402 var xm, ym Word 1403 if i > 0 { 1404 i-- 1405 xm = x.mant[i] 1406 } 1407 if j > 0 { 1408 j-- 1409 ym = y.mant[j] 1410 } 1411 switch { 1412 case xm < ym: 1413 return -1 1414 case xm > ym: 1415 return +1 1416 } 1417 } 1418 1419 return 0 1420 } 1421 1422 // Handling of sign bit as defined by IEEE 754-2008, section 6.3: 1423 // 1424 // When neither the inputs nor result are NaN, the sign of a product or 1425 // quotient is the exclusive OR of the operands’ signs; the sign of a sum, 1426 // or of a difference x−y regarded as a sum x+(−y), differs from at most 1427 // one of the addends’ signs; and the sign of the result of conversions, 1428 // the quantize operation, the roundToIntegral operations, and the 1429 // roundToIntegralExact (see 5.3.1) is the sign of the first or only operand. 1430 // These rules shall apply even when operands or results are zero or infinite. 1431 // 1432 // When the sum of two operands with opposite signs (or the difference of 1433 // two operands with like signs) is exactly zero, the sign of that sum (or 1434 // difference) shall be +0 in all rounding-direction attributes except 1435 // roundTowardNegative; under that attribute, the sign of an exact zero 1436 // sum (or difference) shall be −0. However, x+x = x−(−x) retains the same 1437 // sign as x even when x is zero. 1438 // 1439 // See also: https://play.golang.org/p/RtH3UCt5IH 1440 1441 // Add sets z to the rounded sum x+y and returns z. If z's precision is 0, 1442 // it is changed to the larger of x's or y's precision before the operation. 1443 // Rounding is performed according to z's precision and rounding mode; and 1444 // z's accuracy reports the result error relative to the exact (not rounded) 1445 // result. Add panics with [ErrNaN] if x and y are infinities with opposite 1446 // signs. The value of z is undefined in that case. 1447 func (z *Float) Add(x, y *Float) *Float { 1448 if debugFloat { 1449 x.validate() 1450 y.validate() 1451 } 1452 1453 if z.prec == 0 { 1454 z.prec = umax32(x.prec, y.prec) 1455 } 1456 1457 if x.form == finite && y.form == finite { 1458 // x + y (common case) 1459 1460 // Below we set z.neg = x.neg, and when z aliases y this will 1461 // change the y operand's sign. This is fine, because if an 1462 // operand aliases the receiver it'll be overwritten, but we still 1463 // want the original x.neg and y.neg values when we evaluate 1464 // x.neg != y.neg, so we need to save y.neg before setting z.neg. 1465 yneg := y.neg 1466 1467 z.neg = x.neg 1468 if x.neg == yneg { 1469 // x + y == x + y 1470 // (-x) + (-y) == -(x + y) 1471 z.uadd(x, y) 1472 } else { 1473 // x + (-y) == x - y == -(y - x) 1474 // (-x) + y == y - x == -(x - y) 1475 if x.ucmp(y) > 0 { 1476 z.usub(x, y) 1477 } else { 1478 z.neg = !z.neg 1479 z.usub(y, x) 1480 } 1481 } 1482 if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact { 1483 z.neg = true 1484 } 1485 return z 1486 } 1487 1488 if x.form == inf && y.form == inf && x.neg != y.neg { 1489 // +Inf + -Inf 1490 // -Inf + +Inf 1491 // value of z is undefined but make sure it's valid 1492 z.acc = Exact 1493 z.form = zero 1494 z.neg = false 1495 panic(ErrNaN{"addition of infinities with opposite signs"}) 1496 } 1497 1498 if x.form == zero && y.form == zero { 1499 // ±0 + ±0 1500 z.acc = Exact 1501 z.form = zero 1502 z.neg = x.neg && y.neg // -0 + -0 == -0 1503 return z 1504 } 1505 1506 if x.form == inf || y.form == zero { 1507 // ±Inf + y 1508 // x + ±0 1509 return z.Set(x) 1510 } 1511 1512 // ±0 + y 1513 // x + ±Inf 1514 return z.Set(y) 1515 } 1516 1517 // Sub sets z to the rounded difference x-y and returns z. 1518 // Precision, rounding, and accuracy reporting are as for [Float.Add]. 1519 // Sub panics with [ErrNaN] if x and y are infinities with equal 1520 // signs. The value of z is undefined in that case. 1521 func (z *Float) Sub(x, y *Float) *Float { 1522 if debugFloat { 1523 x.validate() 1524 y.validate() 1525 } 1526 1527 if z.prec == 0 { 1528 z.prec = umax32(x.prec, y.prec) 1529 } 1530 1531 if x.form == finite && y.form == finite { 1532 // x - y (common case) 1533 yneg := y.neg 1534 z.neg = x.neg 1535 if x.neg != yneg { 1536 // x - (-y) == x + y 1537 // (-x) - y == -(x + y) 1538 z.uadd(x, y) 1539 } else { 1540 // x - y == x - y == -(y - x) 1541 // (-x) - (-y) == y - x == -(x - y) 1542 if x.ucmp(y) > 0 { 1543 z.usub(x, y) 1544 } else { 1545 z.neg = !z.neg 1546 z.usub(y, x) 1547 } 1548 } 1549 if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact { 1550 z.neg = true 1551 } 1552 return z 1553 } 1554 1555 if x.form == inf && y.form == inf && x.neg == y.neg { 1556 // +Inf - +Inf 1557 // -Inf - -Inf 1558 // value of z is undefined but make sure it's valid 1559 z.acc = Exact 1560 z.form = zero 1561 z.neg = false 1562 panic(ErrNaN{"subtraction of infinities with equal signs"}) 1563 } 1564 1565 if x.form == zero && y.form == zero { 1566 // ±0 - ±0 1567 z.acc = Exact 1568 z.form = zero 1569 z.neg = x.neg && !y.neg // -0 - +0 == -0 1570 return z 1571 } 1572 1573 if x.form == inf || y.form == zero { 1574 // ±Inf - y 1575 // x - ±0 1576 return z.Set(x) 1577 } 1578 1579 // ±0 - y 1580 // x - ±Inf 1581 return z.Neg(y) 1582 } 1583 1584 // Mul sets z to the rounded product x*y and returns z. 1585 // Precision, rounding, and accuracy reporting are as for [Float.Add]. 1586 // Mul panics with [ErrNaN] if one operand is zero and the other 1587 // operand an infinity. The value of z is undefined in that case. 1588 func (z *Float) Mul(x, y *Float) *Float { 1589 if debugFloat { 1590 x.validate() 1591 y.validate() 1592 } 1593 1594 if z.prec == 0 { 1595 z.prec = umax32(x.prec, y.prec) 1596 } 1597 1598 z.neg = x.neg != y.neg 1599 1600 if x.form == finite && y.form == finite { 1601 // x * y (common case) 1602 z.umul(x, y) 1603 return z 1604 } 1605 1606 z.acc = Exact 1607 if x.form == zero && y.form == inf || x.form == inf && y.form == zero { 1608 // ±0 * ±Inf 1609 // ±Inf * ±0 1610 // value of z is undefined but make sure it's valid 1611 z.form = zero 1612 z.neg = false 1613 panic(ErrNaN{"multiplication of zero with infinity"}) 1614 } 1615 1616 if x.form == inf || y.form == inf { 1617 // ±Inf * y 1618 // x * ±Inf 1619 z.form = inf 1620 return z 1621 } 1622 1623 // ±0 * y 1624 // x * ±0 1625 z.form = zero 1626 return z 1627 } 1628 1629 // Quo sets z to the rounded quotient x/y and returns z. 1630 // Precision, rounding, and accuracy reporting are as for [Float.Add]. 1631 // Quo panics with [ErrNaN] if both operands are zero or infinities. 1632 // The value of z is undefined in that case. 1633 func (z *Float) Quo(x, y *Float) *Float { 1634 if debugFloat { 1635 x.validate() 1636 y.validate() 1637 } 1638 1639 if z.prec == 0 { 1640 z.prec = umax32(x.prec, y.prec) 1641 } 1642 1643 z.neg = x.neg != y.neg 1644 1645 if x.form == finite && y.form == finite { 1646 // x / y (common case) 1647 z.uquo(x, y) 1648 return z 1649 } 1650 1651 z.acc = Exact 1652 if x.form == zero && y.form == zero || x.form == inf && y.form == inf { 1653 // ±0 / ±0 1654 // ±Inf / ±Inf 1655 // value of z is undefined but make sure it's valid 1656 z.form = zero 1657 z.neg = false 1658 panic(ErrNaN{"division of zero by zero or infinity by infinity"}) 1659 } 1660 1661 if x.form == zero || y.form == inf { 1662 // ±0 / y 1663 // x / ±Inf 1664 z.form = zero 1665 return z 1666 } 1667 1668 // x / ±0 1669 // ±Inf / y 1670 z.form = inf 1671 return z 1672 } 1673 1674 // Cmp compares x and y and returns: 1675 // - -1 if x < y; 1676 // - 0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf); 1677 // - +1 if x > y. 1678 func (x *Float) Cmp(y *Float) int { 1679 if debugFloat { 1680 x.validate() 1681 y.validate() 1682 } 1683 1684 mx := x.ord() 1685 my := y.ord() 1686 switch { 1687 case mx < my: 1688 return -1 1689 case mx > my: 1690 return +1 1691 } 1692 // mx == my 1693 1694 // only if |mx| == 1 we have to compare the mantissae 1695 switch mx { 1696 case -1: 1697 return y.ucmp(x) 1698 case +1: 1699 return x.ucmp(y) 1700 } 1701 1702 return 0 1703 } 1704 1705 // ord classifies x and returns: 1706 // 1707 // -2 if -Inf == x 1708 // -1 if -Inf < x < 0 1709 // 0 if x == 0 (signed or unsigned) 1710 // +1 if 0 < x < +Inf 1711 // +2 if x == +Inf 1712 func (x *Float) ord() int { 1713 var m int 1714 switch x.form { 1715 case finite: 1716 m = 1 1717 case zero: 1718 return 0 1719 case inf: 1720 m = 2 1721 } 1722 if x.neg { 1723 m = -m 1724 } 1725 return m 1726 } 1727 1728 func umax32(x, y uint32) uint32 { 1729 if x > y { 1730 return x 1731 } 1732 return y 1733 } 1734