sum_generic.go

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file provides the generic implementation of Sum and MAC. Other files
     6  // might provide optimized assembly implementations of some of this code.
     7  
     8  package poly1305
     9  
    10  import (
    11  	"encoding/binary"
    12  	"math/bits"
    13  )
    14  
    15  // Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
    16  // for a 64 bytes message is approximately
    17  //
    18  //     s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r  mod  2¹³⁰ - 5
    19  //
    20  // for some secret r and s. It can be computed sequentially like
    21  //
    22  //     for len(msg) > 0:
    23  //         h += read(msg, 16)
    24  //         h *= r
    25  //         h %= 2¹³⁰ - 5
    26  //     return h + s
    27  //
    28  // All the complexity is about doing performant constant-time math on numbers
    29  // larger than any available numeric type.
    30  
    31  func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
    32  	h := newMACGeneric(key)
    33  	h.Write(msg)
    34  	h.Sum(out)
    35  }
    36  
    37  func newMACGeneric(key *[32]byte) macGeneric {
    38  	m := macGeneric{}
    39  	initialize(key, &m.macState)
    40  	return m
    41  }
    42  
    43  // macState holds numbers in saturated 64-bit little-endian limbs. That is,
    44  // the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
    45  type macState struct {
    46  	// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
    47  	// can grow larger during and after rounds. It must, however, remain below
    48  	// 2 * (2¹³⁰ - 5).
    49  	h [3]uint64
    50  	// r and s are the private key components.
    51  	r [2]uint64
    52  	s [2]uint64
    53  }
    54  
    55  type macGeneric struct {
    56  	macState
    57  
    58  	buffer [TagSize]byte
    59  	offset int
    60  }
    61  
    62  // Write splits the incoming message into TagSize chunks, and passes them to
    63  // update. It buffers incomplete chunks.
    64  func (h *macGeneric) Write(p []byte) (int, error) {
    65  	nn := len(p)
    66  	if h.offset > 0 {
    67  		n := copy(h.buffer[h.offset:], p)
    68  		if h.offset+n < TagSize {
    69  			h.offset += n
    70  			return nn, nil
    71  		}
    72  		p = p[n:]
    73  		h.offset = 0
    74  		updateGeneric(&h.macState, h.buffer[:])
    75  	}
    76  	if n := len(p) - (len(p) % TagSize); n > 0 {
    77  		updateGeneric(&h.macState, p[:n])
    78  		p = p[n:]
    79  	}
    80  	if len(p) > 0 {
    81  		h.offset += copy(h.buffer[h.offset:], p)
    82  	}
    83  	return nn, nil
    84  }
    85  
    86  // Sum flushes the last incomplete chunk from the buffer, if any, and generates
    87  // the MAC output. It does not modify its state, in order to allow for multiple
    88  // calls to Sum, even if no Write is allowed after Sum.
    89  func (h *macGeneric) Sum(out *[TagSize]byte) {
    90  	state := h.macState
    91  	if h.offset > 0 {
    92  		updateGeneric(&state, h.buffer[:h.offset])
    93  	}
    94  	finalize(out, &state.h, &state.s)
    95  }
    96  
    97  // [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It
    98  // clears some bits of the secret coefficient to make it possible to implement
    99  // multiplication more efficiently.
   100  const (
   101  	rMask0 = 0x0FFFFFFC0FFFFFFF
   102  	rMask1 = 0x0FFFFFFC0FFFFFFC
   103  )
   104  
   105  // initialize loads the 256-bit key into the two 128-bit secret values r and s.
   106  func initialize(key *[32]byte, m *macState) {
   107  	m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
   108  	m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
   109  	m.s[0] = binary.LittleEndian.Uint64(key[16:24])
   110  	m.s[1] = binary.LittleEndian.Uint64(key[24:32])
   111  }
   112  
   113  // uint128 holds a 128-bit number as two 64-bit limbs, for use with the
   114  // bits.Mul64 and bits.Add64 intrinsics.
   115  type uint128 struct {
   116  	lo, hi uint64
   117  }
   118  
   119  func mul64(a, b uint64) uint128 {
   120  	hi, lo := bits.Mul64(a, b)
   121  	return uint128{lo, hi}
   122  }
   123  
   124  func add128(a, b uint128) uint128 {
   125  	lo, c := bits.Add64(a.lo, b.lo, 0)
   126  	hi, c := bits.Add64(a.hi, b.hi, c)
   127  	if c != 0 {
   128  		panic("poly1305: unexpected overflow")
   129  	}
   130  	return uint128{lo, hi}
   131  }
   132  
   133  func shiftRightBy2(a uint128) uint128 {
   134  	a.lo = a.lo>>2 | (a.hi&3)<<62
   135  	a.hi = a.hi >> 2
   136  	return a
   137  }
   138  
   139  // updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
   140  // 128 bits of message, it computes
   141  //
   142  //	h₊ = (h + m) * r  mod  2¹³⁰ - 5
   143  //
   144  // If the msg length is not a multiple of TagSize, it assumes the last
   145  // incomplete chunk is the final one.
   146  func updateGeneric(state *macState, msg []byte) {
   147  	h0, h1, h2 := state.h[0], state.h[1], state.h[2]
   148  	r0, r1 := state.r[0], state.r[1]
   149  
   150  	for len(msg) > 0 {
   151  		var c uint64
   152  
   153  		// For the first step, h + m, we use a chain of bits.Add64 intrinsics.
   154  		// The resulting value of h might exceed 2¹³⁰ - 5, but will be partially
   155  		// reduced at the end of the multiplication below.
   156  		//
   157  		// The spec requires us to set a bit just above the message size, not to
   158  		// hide leading zeroes. For full chunks, that's 1 << 128, so we can just
   159  		// add 1 to the most significant (2¹²⁸) limb, h2.
   160  		if len(msg) >= TagSize {
   161  			h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
   162  			h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
   163  			h2 += c + 1
   164  
   165  			msg = msg[TagSize:]
   166  		} else {
   167  			var buf [TagSize]byte
   168  			copy(buf[:], msg)
   169  			buf[len(msg)] = 1
   170  
   171  			h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
   172  			h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
   173  			h2 += c
   174  
   175  			msg = nil
   176  		}
   177  
   178  		// Multiplication of big number limbs is similar to elementary school
   179  		// columnar multiplication. Instead of digits, there are 64-bit limbs.
   180  		//
   181  		// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
   182  		//
   183  		//                        h2    h1    h0  x
   184  		//                              r1    r0  =
   185  		//                       ----------------
   186  		//                      h2r0  h1r0  h0r0     <-- individual 128-bit products
   187  		//            +   h2r1  h1r1  h0r1
   188  		//               ------------------------
   189  		//                 m3    m2    m1    m0      <-- result in 128-bit overlapping limbs
   190  		//               ------------------------
   191  		//         m3.hi m2.hi m1.hi m0.hi           <-- carry propagation
   192  		//     +         m3.lo m2.lo m1.lo m0.lo
   193  		//        -------------------------------
   194  		//           t4    t3    t2    t1    t0      <-- final result in 64-bit limbs
   195  		//
   196  		// The main difference from pen-and-paper multiplication is that we do
   197  		// carry propagation in a separate step, as if we wrote two digit sums
   198  		// at first (the 128-bit limbs), and then carried the tens all at once.
   199  
   200  		h0r0 := mul64(h0, r0)
   201  		h1r0 := mul64(h1, r0)
   202  		h2r0 := mul64(h2, r0)
   203  		h0r1 := mul64(h0, r1)
   204  		h1r1 := mul64(h1, r1)
   205  		h2r1 := mul64(h2, r1)
   206  
   207  		// Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their
   208  		// top 4 bits cleared by rMask{0,1}, we know that their product is not going
   209  		// to overflow 64 bits, so we can ignore the high part of the products.
   210  		//
   211  		// This also means that the product doesn't have a fifth limb (t4).
   212  		if h2r0.hi != 0 {
   213  			panic("poly1305: unexpected overflow")
   214  		}
   215  		if h2r1.hi != 0 {
   216  			panic("poly1305: unexpected overflow")
   217  		}
   218  
   219  		m0 := h0r0
   220  		m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again
   221  		m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.
   222  		m3 := h2r1
   223  
   224  		t0 := m0.lo
   225  		t1, c := bits.Add64(m1.lo, m0.hi, 0)
   226  		t2, c := bits.Add64(m2.lo, m1.hi, c)
   227  		t3, _ := bits.Add64(m3.lo, m2.hi, c)
   228  
   229  		// Now we have the result as 4 64-bit limbs, and we need to reduce it
   230  		// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
   231  		// a cheap partial reduction according to the reduction identity
   232  		//
   233  		//     c * 2¹³⁰ + n  =  c * 5 + n  mod  2¹³⁰ - 5
   234  		//
   235  		// because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is
   236  		// likely to be larger than 2¹³⁰ - 5, but still small enough to fit the
   237  		// assumptions we make about h in the rest of the code.
   238  		//
   239  		// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23
   240  
   241  		// We split the final result at the 2¹³⁰ mark into h and cc, the carry.
   242  		// Note that the carry bits are effectively shifted left by 2, in other
   243  		// words, cc = c * 4 for the c in the reduction identity.
   244  		h0, h1, h2 = t0, t1, t2&maskLow2Bits
   245  		cc := uint128{t2 & maskNotLow2Bits, t3}
   246  
   247  		// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.
   248  
   249  		h0, c = bits.Add64(h0, cc.lo, 0)
   250  		h1, c = bits.Add64(h1, cc.hi, c)
   251  		h2 += c
   252  
   253  		cc = shiftRightBy2(cc)
   254  
   255  		h0, c = bits.Add64(h0, cc.lo, 0)
   256  		h1, c = bits.Add64(h1, cc.hi, c)
   257  		h2 += c
   258  
   259  		// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
   260  		//
   261  		//     5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1
   262  	}
   263  
   264  	state.h[0], state.h[1], state.h[2] = h0, h1, h2
   265  }
   266  
   267  const (
   268  	maskLow2Bits    uint64 = 0x0000000000000003
   269  	maskNotLow2Bits uint64 = ^maskLow2Bits
   270  )
   271  
   272  // select64 returns x if v == 1 and y if v == 0, in constant time.
   273  func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y }
   274  
   275  // [p0, p1, p2] is 2¹³⁰ - 5 in little endian order.
   276  const (
   277  	p0 = 0xFFFFFFFFFFFFFFFB
   278  	p1 = 0xFFFFFFFFFFFFFFFF
   279  	p2 = 0x0000000000000003
   280  )
   281  
   282  // finalize completes the modular reduction of h and computes
   283  //
   284  //	out = h + s  mod  2¹²⁸
   285  func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
   286  	h0, h1, h2 := h[0], h[1], h[2]
   287  
   288  	// After the partial reduction in updateGeneric, h might be more than
   289  	// 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction
   290  	// in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the
   291  	// result if the subtraction underflows, and t otherwise.
   292  
   293  	hMinusP0, b := bits.Sub64(h0, p0, 0)
   294  	hMinusP1, b := bits.Sub64(h1, p1, b)
   295  	_, b = bits.Sub64(h2, p2, b)
   296  
   297  	// h = h if h < p else h - p
   298  	h0 = select64(b, h0, hMinusP0)
   299  	h1 = select64(b, h1, hMinusP1)
   300  
   301  	// Finally, we compute the last Poly1305 step
   302  	//
   303  	//     tag = h + s  mod  2¹²⁸
   304  	//
   305  	// by just doing a wide addition with the 128 low bits of h and discarding
   306  	// the overflow.
   307  	h0, c := bits.Add64(h0, s[0], 0)
   308  	h1, _ = bits.Add64(h1, s[1], c)
   309  
   310  	binary.LittleEndian.PutUint64(out[0:8], h0)
   311  	binary.LittleEndian.PutUint64(out[8:16], h1)
   312  }
   313
View as plain text