feat: Waku v2 bridge

Issue #12610
2023-11-12 13:29:38 +01:00
parent 56e7bd01ca
commit 6d31343205
6716 changed files with 1982502 additions and 5891 deletions
@@ -0,0 +1,246 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package blake2s implements the BLAKE2s hash algorithm defined by RFC 7693
+// and the extendable output function (XOF) BLAKE2Xs.
+//
+// BLAKE2s is optimized for 8- to 32-bit platforms and produces digests of any
+// size between 1 and 32 bytes.
+// For a detailed specification of BLAKE2s see https://blake2.net/blake2.pdf
+// and for BLAKE2Xs see https://blake2.net/blake2x.pdf
+//
+// If you aren't sure which function you need, use BLAKE2s (Sum256 or New256).
+// If you need a secret-key MAC (message authentication code), use the New256
+// function with a non-nil key.
+//
+// BLAKE2X is a construction to compute hash values larger than 32 bytes. It
+// can produce hash values between 0 and 65535 bytes.
+package blake2s // import "golang.org/x/crypto/blake2s"
+
+import (
+	"encoding/binary"
+	"errors"
+	"hash"
+)
+
+const (
+	// The blocksize of BLAKE2s in bytes.
+	BlockSize = 64
+
+	// The hash size of BLAKE2s-256 in bytes.
+	Size = 32
+
+	// The hash size of BLAKE2s-128 in bytes.
+	Size128 = 16
+)
+
+var errKeySize = errors.New("blake2s: invalid key size")
+
+var iv = [8]uint32{
+	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
+}
+
+// Sum256 returns the BLAKE2s-256 checksum of the data.
+func Sum256(data []byte) [Size]byte {
+	var sum [Size]byte
+	checkSum(&sum, Size, data)
+	return sum
+}
+
+// New256 returns a new hash.Hash computing the BLAKE2s-256 checksum. A non-nil
+// key turns the hash into a MAC. The key must between zero and 32 bytes long.
+// When the key is nil, the returned hash.Hash implements BinaryMarshaler
+// and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash.
+func New256(key []byte) (hash.Hash, error) { return newDigest(Size, key) }
+
+// New128 returns a new hash.Hash computing the BLAKE2s-128 checksum given a
+// non-empty key. Note that a 128-bit digest is too small to be secure as a
+// cryptographic hash and should only be used as a MAC, thus the key argument
+// is not optional.
+func New128(key []byte) (hash.Hash, error) {
+	if len(key) == 0 {
+		return nil, errors.New("blake2s: a key is required for a 128-bit hash")
+	}
+	return newDigest(Size128, key)
+}
+
+func newDigest(hashSize int, key []byte) (*digest, error) {
+	if len(key) > Size {
+		return nil, errKeySize
+	}
+	d := &digest{
+		size:   hashSize,
+		keyLen: len(key),
+	}
+	copy(d.key[:], key)
+	d.Reset()
+	return d, nil
+}
+
+func checkSum(sum *[Size]byte, hashSize int, data []byte) {
+	var (
+		h [8]uint32
+		c [2]uint32
+	)
+
+	h = iv
+	h[0] ^= uint32(hashSize) | (1 << 16) | (1 << 24)
+
+	if length := len(data); length > BlockSize {
+		n := length &^ (BlockSize - 1)
+		if length == n {
+			n -= BlockSize
+		}
+		hashBlocks(&h, &c, 0, data[:n])
+		data = data[n:]
+	}
+
+	var block [BlockSize]byte
+	offset := copy(block[:], data)
+	remaining := uint32(BlockSize - offset)
+
+	if c[0] < remaining {
+		c[1]--
+	}
+	c[0] -= remaining
+
+	hashBlocks(&h, &c, 0xFFFFFFFF, block[:])
+
+	for i, v := range h {
+		binary.LittleEndian.PutUint32(sum[4*i:], v)
+	}
+}
+
+type digest struct {
+	h      [8]uint32
+	c      [2]uint32
+	size   int
+	block  [BlockSize]byte
+	offset int
+
+	key    [BlockSize]byte
+	keyLen int
+}
+
+const (
+	magic         = "b2s"
+	marshaledSize = len(magic) + 8*4 + 2*4 + 1 + BlockSize + 1
+)
+
+func (d *digest) MarshalBinary() ([]byte, error) {
+	if d.keyLen != 0 {
+		return nil, errors.New("crypto/blake2s: cannot marshal MACs")
+	}
+	b := make([]byte, 0, marshaledSize)
+	b = append(b, magic...)
+	for i := 0; i < 8; i++ {
+		b = appendUint32(b, d.h[i])
+	}
+	b = appendUint32(b, d.c[0])
+	b = appendUint32(b, d.c[1])
+	// Maximum value for size is 32
+	b = append(b, byte(d.size))
+	b = append(b, d.block[:]...)
+	b = append(b, byte(d.offset))
+	return b, nil
+}
+
+func (d *digest) UnmarshalBinary(b []byte) error {
+	if len(b) < len(magic) || string(b[:len(magic)]) != magic {
+		return errors.New("crypto/blake2s: invalid hash state identifier")
+	}
+	if len(b) != marshaledSize {
+		return errors.New("crypto/blake2s: invalid hash state size")
+	}
+	b = b[len(magic):]
+	for i := 0; i < 8; i++ {
+		b, d.h[i] = consumeUint32(b)
+	}
+	b, d.c[0] = consumeUint32(b)
+	b, d.c[1] = consumeUint32(b)
+	d.size = int(b[0])
+	b = b[1:]
+	copy(d.block[:], b[:BlockSize])
+	b = b[BlockSize:]
+	d.offset = int(b[0])
+	return nil
+}
+
+func (d *digest) BlockSize() int { return BlockSize }
+
+func (d *digest) Size() int { return d.size }
+
+func (d *digest) Reset() {
+	d.h = iv
+	d.h[0] ^= uint32(d.size) | (uint32(d.keyLen) << 8) | (1 << 16) | (1 << 24)
+	d.offset, d.c[0], d.c[1] = 0, 0, 0
+	if d.keyLen > 0 {
+		d.block = d.key
+		d.offset = BlockSize
+	}
+}
+
+func (d *digest) Write(p []byte) (n int, err error) {
+	n = len(p)
+
+	if d.offset > 0 {
+		remaining := BlockSize - d.offset
+		if n <= remaining {
+			d.offset += copy(d.block[d.offset:], p)
+			return
+		}
+		copy(d.block[d.offset:], p[:remaining])
+		hashBlocks(&d.h, &d.c, 0, d.block[:])
+		d.offset = 0
+		p = p[remaining:]
+	}
+
+	if length := len(p); length > BlockSize {
+		nn := length &^ (BlockSize - 1)
+		if length == nn {
+			nn -= BlockSize
+		}
+		hashBlocks(&d.h, &d.c, 0, p[:nn])
+		p = p[nn:]
+	}
+
+	d.offset += copy(d.block[:], p)
+	return
+}
+
+func (d *digest) Sum(sum []byte) []byte {
+	var hash [Size]byte
+	d.finalize(&hash)
+	return append(sum, hash[:d.size]...)
+}
+
+func (d *digest) finalize(hash *[Size]byte) {
+	var block [BlockSize]byte
+	h := d.h
+	c := d.c
+
+	copy(block[:], d.block[:d.offset])
+	remaining := uint32(BlockSize - d.offset)
+	if c[0] < remaining {
+		c[1]--
+	}
+	c[0] -= remaining
+
+	hashBlocks(&h, &c, 0xFFFFFFFF, block[:])
+	for i, v := range h {
+		binary.LittleEndian.PutUint32(hash[4*i:], v)
+	}
+}
+
+func appendUint32(b []byte, x uint32) []byte {
+	var a [4]byte
+	binary.BigEndian.PutUint32(a[:], x)
+	return append(b, a[:]...)
+}
+
+func consumeUint32(b []byte) ([]byte, uint32) {
+	x := binary.BigEndian.Uint32(b)
+	return b[4:], x
+}
@@ -0,0 +1,33 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 && gc && !purego
+// +build 386,gc,!purego
+
+package blake2s
+
+import "golang.org/x/sys/cpu"
+
+var (
+	useSSE4  = false
+	useSSSE3 = cpu.X86.HasSSSE3
+	useSSE2  = cpu.X86.HasSSE2
+)
+
+//go:noescape
+func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+
+//go:noescape
+func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+
+func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) {
+	switch {
+	case useSSSE3:
+		hashBlocksSSSE3(h, c, flag, blocks)
+	case useSSE2:
+		hashBlocksSSE2(h, c, flag, blocks)
+	default:
+		hashBlocksGeneric(h, c, flag, blocks)
+	}
+}
@@ -0,0 +1,430 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 && gc && !purego
+// +build 386,gc,!purego
+
+#include "textflag.h"
+
+DATA iv0<>+0x00(SB)/4, $0x6a09e667
+DATA iv0<>+0x04(SB)/4, $0xbb67ae85
+DATA iv0<>+0x08(SB)/4, $0x3c6ef372
+DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
+GLOBL iv0<>(SB), (NOPTR+RODATA), $16
+
+DATA iv1<>+0x00(SB)/4, $0x510e527f
+DATA iv1<>+0x04(SB)/4, $0x9b05688c
+DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
+DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
+GLOBL iv1<>(SB), (NOPTR+RODATA), $16
+
+DATA rol16<>+0x00(SB)/8, $0x0504070601000302
+DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
+GLOBL rol16<>(SB), (NOPTR+RODATA), $16
+
+DATA rol8<>+0x00(SB)/8, $0x0407060500030201
+DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
+GLOBL rol8<>(SB), (NOPTR+RODATA), $16
+
+DATA counter<>+0x00(SB)/8, $0x40
+DATA counter<>+0x08(SB)/8, $0x0
+GLOBL counter<>(SB), (NOPTR+RODATA), $16
+
+#define ROTL_SSE2(n, t, v) \
+	MOVO  v, t;       \
+	PSLLL $n, t;      \
+	PSRLL $(32-n), v; \
+	PXOR  t, v
+
+#define ROTL_SSSE3(c, v) \
+	PSHUFB c, v
+
+#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
+	PADDL  m0, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSE2(16, t, v3); \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(20, t, v1); \
+	PADDL  m1, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSE2(24, t, v3); \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(25, t, v1); \
+	PSHUFL $0x39, v1, v1; \
+	PSHUFL $0x4E, v2, v2; \
+	PSHUFL $0x93, v3, v3; \
+	PADDL  m2, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSE2(16, t, v3); \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(20, t, v1); \
+	PADDL  m3, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSE2(24, t, v3); \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(25, t, v1); \
+	PSHUFL $0x39, v3, v3; \
+	PSHUFL $0x4E, v2, v2; \
+	PSHUFL $0x93, v1, v1
+
+#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
+	PADDL  m0, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSSE3(c16, v3);  \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(20, t, v1); \
+	PADDL  m1, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSSE3(c8, v3);   \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(25, t, v1); \
+	PSHUFL $0x39, v1, v1; \
+	PSHUFL $0x4E, v2, v2; \
+	PSHUFL $0x93, v3, v3; \
+	PADDL  m2, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSSE3(c16, v3);  \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(20, t, v1); \
+	PADDL  m3, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSSE3(c8, v3);   \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(25, t, v1); \
+	PSHUFL $0x39, v3, v3; \
+	PSHUFL $0x4E, v2, v2; \
+	PSHUFL $0x93, v1, v1
+
+#define PRECOMPUTE(dst, off, src, t) \
+	MOVL 0*4(src), t;          \
+	MOVL t, 0*4+off+0(dst);    \
+	MOVL t, 9*4+off+64(dst);   \
+	MOVL t, 5*4+off+128(dst);  \
+	MOVL t, 14*4+off+192(dst); \
+	MOVL t, 4*4+off+256(dst);  \
+	MOVL t, 2*4+off+320(dst);  \
+	MOVL t, 8*4+off+384(dst);  \
+	MOVL t, 12*4+off+448(dst); \
+	MOVL t, 3*4+off+512(dst);  \
+	MOVL t, 15*4+off+576(dst); \
+	MOVL 1*4(src), t;          \
+	MOVL t, 4*4+off+0(dst);    \
+	MOVL t, 8*4+off+64(dst);   \
+	MOVL t, 14*4+off+128(dst); \
+	MOVL t, 5*4+off+192(dst);  \
+	MOVL t, 12*4+off+256(dst); \
+	MOVL t, 11*4+off+320(dst); \
+	MOVL t, 1*4+off+384(dst);  \
+	MOVL t, 6*4+off+448(dst);  \
+	MOVL t, 10*4+off+512(dst); \
+	MOVL t, 3*4+off+576(dst);  \
+	MOVL 2*4(src), t;          \
+	MOVL t, 1*4+off+0(dst);    \
+	MOVL t, 13*4+off+64(dst);  \
+	MOVL t, 6*4+off+128(dst);  \
+	MOVL t, 8*4+off+192(dst);  \
+	MOVL t, 2*4+off+256(dst);  \
+	MOVL t, 0*4+off+320(dst);  \
+	MOVL t, 14*4+off+384(dst); \
+	MOVL t, 11*4+off+448(dst); \
+	MOVL t, 12*4+off+512(dst); \
+	MOVL t, 4*4+off+576(dst);  \
+	MOVL 3*4(src), t;          \
+	MOVL t, 5*4+off+0(dst);    \
+	MOVL t, 15*4+off+64(dst);  \
+	MOVL t, 9*4+off+128(dst);  \
+	MOVL t, 1*4+off+192(dst);  \
+	MOVL t, 11*4+off+256(dst); \
+	MOVL t, 7*4+off+320(dst);  \
+	MOVL t, 13*4+off+384(dst); \
+	MOVL t, 3*4+off+448(dst);  \
+	MOVL t, 6*4+off+512(dst);  \
+	MOVL t, 10*4+off+576(dst); \
+	MOVL 4*4(src), t;          \
+	MOVL t, 2*4+off+0(dst);    \
+	MOVL t, 1*4+off+64(dst);   \
+	MOVL t, 15*4+off+128(dst); \
+	MOVL t, 10*4+off+192(dst); \
+	MOVL t, 6*4+off+256(dst);  \
+	MOVL t, 8*4+off+320(dst);  \
+	MOVL t, 3*4+off+384(dst);  \
+	MOVL t, 13*4+off+448(dst); \
+	MOVL t, 14*4+off+512(dst); \
+	MOVL t, 5*4+off+576(dst);  \
+	MOVL 5*4(src), t;          \
+	MOVL t, 6*4+off+0(dst);    \
+	MOVL t, 11*4+off+64(dst);  \
+	MOVL t, 2*4+off+128(dst);  \
+	MOVL t, 9*4+off+192(dst);  \
+	MOVL t, 1*4+off+256(dst);  \
+	MOVL t, 13*4+off+320(dst); \
+	MOVL t, 4*4+off+384(dst);  \
+	MOVL t, 8*4+off+448(dst);  \
+	MOVL t, 15*4+off+512(dst); \
+	MOVL t, 7*4+off+576(dst);  \
+	MOVL 6*4(src), t;          \
+	MOVL t, 3*4+off+0(dst);    \
+	MOVL t, 7*4+off+64(dst);   \
+	MOVL t, 13*4+off+128(dst); \
+	MOVL t, 12*4+off+192(dst); \
+	MOVL t, 10*4+off+256(dst); \
+	MOVL t, 1*4+off+320(dst);  \
+	MOVL t, 9*4+off+384(dst);  \
+	MOVL t, 14*4+off+448(dst); \
+	MOVL t, 0*4+off+512(dst);  \
+	MOVL t, 6*4+off+576(dst);  \
+	MOVL 7*4(src), t;          \
+	MOVL t, 7*4+off+0(dst);    \
+	MOVL t, 14*4+off+64(dst);  \
+	MOVL t, 10*4+off+128(dst); \
+	MOVL t, 0*4+off+192(dst);  \
+	MOVL t, 5*4+off+256(dst);  \
+	MOVL t, 9*4+off+320(dst);  \
+	MOVL t, 12*4+off+384(dst); \
+	MOVL t, 1*4+off+448(dst);  \
+	MOVL t, 13*4+off+512(dst); \
+	MOVL t, 2*4+off+576(dst);  \
+	MOVL 8*4(src), t;          \
+	MOVL t, 8*4+off+0(dst);    \
+	MOVL t, 5*4+off+64(dst);   \
+	MOVL t, 4*4+off+128(dst);  \
+	MOVL t, 15*4+off+192(dst); \
+	MOVL t, 14*4+off+256(dst); \
+	MOVL t, 3*4+off+320(dst);  \
+	MOVL t, 11*4+off+384(dst); \
+	MOVL t, 10*4+off+448(dst); \
+	MOVL t, 7*4+off+512(dst);  \
+	MOVL t, 1*4+off+576(dst);  \
+	MOVL 9*4(src), t;          \
+	MOVL t, 12*4+off+0(dst);   \
+	MOVL t, 2*4+off+64(dst);   \
+	MOVL t, 11*4+off+128(dst); \
+	MOVL t, 4*4+off+192(dst);  \
+	MOVL t, 0*4+off+256(dst);  \
+	MOVL t, 15*4+off+320(dst); \
+	MOVL t, 10*4+off+384(dst); \
+	MOVL t, 7*4+off+448(dst);  \
+	MOVL t, 5*4+off+512(dst);  \
+	MOVL t, 9*4+off+576(dst);  \
+	MOVL 10*4(src), t;         \
+	MOVL t, 9*4+off+0(dst);    \
+	MOVL t, 4*4+off+64(dst);   \
+	MOVL t, 8*4+off+128(dst);  \
+	MOVL t, 13*4+off+192(dst); \
+	MOVL t, 3*4+off+256(dst);  \
+	MOVL t, 5*4+off+320(dst);  \
+	MOVL t, 7*4+off+384(dst);  \
+	MOVL t, 15*4+off+448(dst); \
+	MOVL t, 11*4+off+512(dst); \
+	MOVL t, 0*4+off+576(dst);  \
+	MOVL 11*4(src), t;         \
+	MOVL t, 13*4+off+0(dst);   \
+	MOVL t, 10*4+off+64(dst);  \
+	MOVL t, 0*4+off+128(dst);  \
+	MOVL t, 3*4+off+192(dst);  \
+	MOVL t, 9*4+off+256(dst);  \
+	MOVL t, 6*4+off+320(dst);  \
+	MOVL t, 15*4+off+384(dst); \
+	MOVL t, 4*4+off+448(dst);  \
+	MOVL t, 2*4+off+512(dst);  \
+	MOVL t, 12*4+off+576(dst); \
+	MOVL 12*4(src), t;         \
+	MOVL t, 10*4+off+0(dst);   \
+	MOVL t, 12*4+off+64(dst);  \
+	MOVL t, 1*4+off+128(dst);  \
+	MOVL t, 6*4+off+192(dst);  \
+	MOVL t, 13*4+off+256(dst); \
+	MOVL t, 4*4+off+320(dst);  \
+	MOVL t, 0*4+off+384(dst);  \
+	MOVL t, 2*4+off+448(dst);  \
+	MOVL t, 8*4+off+512(dst);  \
+	MOVL t, 14*4+off+576(dst); \
+	MOVL 13*4(src), t;         \
+	MOVL t, 14*4+off+0(dst);   \
+	MOVL t, 3*4+off+64(dst);   \
+	MOVL t, 7*4+off+128(dst);  \
+	MOVL t, 2*4+off+192(dst);  \
+	MOVL t, 15*4+off+256(dst); \
+	MOVL t, 12*4+off+320(dst); \
+	MOVL t, 6*4+off+384(dst);  \
+	MOVL t, 0*4+off+448(dst);  \
+	MOVL t, 9*4+off+512(dst);  \
+	MOVL t, 11*4+off+576(dst); \
+	MOVL 14*4(src), t;         \
+	MOVL t, 11*4+off+0(dst);   \
+	MOVL t, 0*4+off+64(dst);   \
+	MOVL t, 12*4+off+128(dst); \
+	MOVL t, 7*4+off+192(dst);  \
+	MOVL t, 8*4+off+256(dst);  \
+	MOVL t, 14*4+off+320(dst); \
+	MOVL t, 2*4+off+384(dst);  \
+	MOVL t, 5*4+off+448(dst);  \
+	MOVL t, 1*4+off+512(dst);  \
+	MOVL t, 13*4+off+576(dst); \
+	MOVL 15*4(src), t;         \
+	MOVL t, 15*4+off+0(dst);   \
+	MOVL t, 6*4+off+64(dst);   \
+	MOVL t, 3*4+off+128(dst);  \
+	MOVL t, 11*4+off+192(dst); \
+	MOVL t, 7*4+off+256(dst);  \
+	MOVL t, 10*4+off+320(dst); \
+	MOVL t, 5*4+off+384(dst);  \
+	MOVL t, 9*4+off+448(dst);  \
+	MOVL t, 4*4+off+512(dst);  \
+	MOVL t, 8*4+off+576(dst)
+
+// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment
+	MOVL h+0(FP), AX
+	MOVL c+4(FP), BX
+	MOVL flag+8(FP), CX
+	MOVL blocks_base+12(FP), SI
+	MOVL blocks_len+16(FP), DX
+
+	MOVL SP, DI
+	ADDL $15, DI
+	ANDL $~15, DI
+
+	MOVL CX, 8(DI)
+	MOVL 0(BX), CX
+	MOVL CX, 0(DI)
+	MOVL 4(BX), CX
+	MOVL CX, 4(DI)
+	XORL CX, CX
+	MOVL CX, 12(DI)
+
+	MOVOU 0(AX), X0
+	MOVOU 16(AX), X1
+	MOVOU counter<>(SB), X2
+
+loop:
+	MOVO  X0, X4
+	MOVO  X1, X5
+	MOVOU iv0<>(SB), X6
+	MOVOU iv1<>(SB), X7
+
+	MOVO  0(DI), X3
+	PADDQ X2, X3
+	PXOR  X3, X7
+	MOVO  X3, 0(DI)
+
+	PRECOMPUTE(DI, 16, SI, CX)
+	ROUND_SSE2(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3)
+	ROUND_SSE2(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3)
+
+	PXOR X4, X0
+	PXOR X5, X1
+	PXOR X6, X0
+	PXOR X7, X1
+
+	LEAL 64(SI), SI
+	SUBL $64, DX
+	JNE  loop
+
+	MOVL 0(DI), CX
+	MOVL CX, 0(BX)
+	MOVL 4(DI), CX
+	MOVL CX, 4(BX)
+
+	MOVOU X0, 0(AX)
+	MOVOU X1, 16(AX)
+
+	RET
+
+// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment
+	MOVL h+0(FP), AX
+	MOVL c+4(FP), BX
+	MOVL flag+8(FP), CX
+	MOVL blocks_base+12(FP), SI
+	MOVL blocks_len+16(FP), DX
+
+	MOVL SP, DI
+	ADDL $15, DI
+	ANDL $~15, DI
+
+	MOVL CX, 8(DI)
+	MOVL 0(BX), CX
+	MOVL CX, 0(DI)
+	MOVL 4(BX), CX
+	MOVL CX, 4(DI)
+	XORL CX, CX
+	MOVL CX, 12(DI)
+
+	MOVOU 0(AX), X0
+	MOVOU 16(AX), X1
+	MOVOU counter<>(SB), X2
+
+loop:
+	MOVO  X0, 656(DI)
+	MOVO  X1, 672(DI)
+	MOVO  X0, X4
+	MOVO  X1, X5
+	MOVOU iv0<>(SB), X6
+	MOVOU iv1<>(SB), X7
+
+	MOVO  0(DI), X3
+	PADDQ X2, X3
+	PXOR  X3, X7
+	MOVO  X3, 0(DI)
+
+	MOVOU rol16<>(SB), X0
+	MOVOU rol8<>(SB), X1
+
+	PRECOMPUTE(DI, 16, SI, CX)
+	ROUND_SSSE3(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3, X0, X1)
+	ROUND_SSSE3(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3, X0, X1)
+
+	MOVO 656(DI), X0
+	MOVO 672(DI), X1
+	PXOR X4, X0
+	PXOR X5, X1
+	PXOR X6, X0
+	PXOR X7, X1
+
+	LEAL 64(SI), SI
+	SUBL $64, DX
+	JNE  loop
+
+	MOVL 0(DI), CX
+	MOVL CX, 0(BX)
+	MOVL 4(DI), CX
+	MOVL CX, 4(BX)
+
+	MOVOU X0, 0(AX)
+	MOVOU X1, 16(AX)
+
+	RET
@@ -0,0 +1,38 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 && gc && !purego
+// +build amd64,gc,!purego
+
+package blake2s
+
+import "golang.org/x/sys/cpu"
+
+var (
+	useSSE4  = cpu.X86.HasSSE41
+	useSSSE3 = cpu.X86.HasSSSE3
+	useSSE2  = cpu.X86.HasSSE2
+)
+
+//go:noescape
+func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+
+//go:noescape
+func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+
+//go:noescape
+func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+
+func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) {
+	switch {
+	case useSSE4:
+		hashBlocksSSE4(h, c, flag, blocks)
+	case useSSSE3:
+		hashBlocksSSSE3(h, c, flag, blocks)
+	case useSSE2:
+		hashBlocksSSE2(h, c, flag, blocks)
+	default:
+		hashBlocksGeneric(h, c, flag, blocks)
+	}
+}
@@ -0,0 +1,433 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 && gc && !purego
+// +build amd64,gc,!purego
+
+#include "textflag.h"
+
+DATA iv0<>+0x00(SB)/4, $0x6a09e667
+DATA iv0<>+0x04(SB)/4, $0xbb67ae85
+DATA iv0<>+0x08(SB)/4, $0x3c6ef372
+DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
+GLOBL iv0<>(SB), (NOPTR+RODATA), $16
+
+DATA iv1<>+0x00(SB)/4, $0x510e527f
+DATA iv1<>+0x04(SB)/4, $0x9b05688c
+DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
+DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
+GLOBL iv1<>(SB), (NOPTR+RODATA), $16
+
+DATA rol16<>+0x00(SB)/8, $0x0504070601000302
+DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
+GLOBL rol16<>(SB), (NOPTR+RODATA), $16
+
+DATA rol8<>+0x00(SB)/8, $0x0407060500030201
+DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
+GLOBL rol8<>(SB), (NOPTR+RODATA), $16
+
+DATA counter<>+0x00(SB)/8, $0x40
+DATA counter<>+0x08(SB)/8, $0x0
+GLOBL counter<>(SB), (NOPTR+RODATA), $16
+
+#define ROTL_SSE2(n, t, v) \
+	MOVO  v, t;       \
+	PSLLL $n, t;      \
+	PSRLL $(32-n), v; \
+	PXOR  t, v
+
+#define ROTL_SSSE3(c, v) \
+	PSHUFB c, v
+
+#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
+	PADDL  m0, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSE2(16, t, v3); \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(20, t, v1); \
+	PADDL  m1, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSE2(24, t, v3); \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(25, t, v1); \
+	PSHUFL $0x39, v1, v1; \
+	PSHUFL $0x4E, v2, v2; \
+	PSHUFL $0x93, v3, v3; \
+	PADDL  m2, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSE2(16, t, v3); \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(20, t, v1); \
+	PADDL  m3, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSE2(24, t, v3); \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(25, t, v1); \
+	PSHUFL $0x39, v3, v3; \
+	PSHUFL $0x4E, v2, v2; \
+	PSHUFL $0x93, v1, v1
+
+#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
+	PADDL  m0, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSSE3(c16, v3);  \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(20, t, v1); \
+	PADDL  m1, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSSE3(c8, v3);   \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(25, t, v1); \
+	PSHUFL $0x39, v1, v1; \
+	PSHUFL $0x4E, v2, v2; \
+	PSHUFL $0x93, v3, v3; \
+	PADDL  m2, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSSE3(c16, v3);  \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(20, t, v1); \
+	PADDL  m3, v0;        \
+	PADDL  v1, v0;        \
+	PXOR   v0, v3;        \
+	ROTL_SSSE3(c8, v3);   \
+	PADDL  v3, v2;        \
+	PXOR   v2, v1;        \
+	ROTL_SSE2(25, t, v1); \
+	PSHUFL $0x39, v3, v3; \
+	PSHUFL $0x4E, v2, v2; \
+	PSHUFL $0x93, v1, v1
+
+
+#define LOAD_MSG_SSE4(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15) \
+	MOVL   i0*4(src), m0;      \
+	PINSRD $1, i1*4(src), m0;  \
+	PINSRD $2, i2*4(src), m0;  \
+	PINSRD $3, i3*4(src), m0;  \
+	MOVL   i4*4(src), m1;      \
+	PINSRD $1, i5*4(src), m1;  \
+	PINSRD $2, i6*4(src), m1;  \
+	PINSRD $3, i7*4(src), m1;  \
+	MOVL   i8*4(src), m2;      \
+	PINSRD $1, i9*4(src), m2;  \
+	PINSRD $2, i10*4(src), m2; \
+	PINSRD $3, i11*4(src), m2; \
+	MOVL   i12*4(src), m3;     \
+	PINSRD $1, i13*4(src), m3; \
+	PINSRD $2, i14*4(src), m3; \
+	PINSRD $3, i15*4(src), m3
+
+#define PRECOMPUTE_MSG(dst, off, src, R8, R9, R10, R11, R12, R13, R14, R15) \
+	MOVQ 0*4(src), R8;           \
+	MOVQ 2*4(src), R9;           \
+	MOVQ 4*4(src), R10;          \
+	MOVQ 6*4(src), R11;          \
+	MOVQ 8*4(src), R12;          \
+	MOVQ 10*4(src), R13;         \
+	MOVQ 12*4(src), R14;         \
+	MOVQ 14*4(src), R15;         \
+	                             \
+	MOVL R8, 0*4+off+0(dst);     \
+	MOVL R8, 9*4+off+64(dst);    \
+	MOVL R8, 5*4+off+128(dst);   \
+	MOVL R8, 14*4+off+192(dst);  \
+	MOVL R8, 4*4+off+256(dst);   \
+	MOVL R8, 2*4+off+320(dst);   \
+	MOVL R8, 8*4+off+384(dst);   \
+	MOVL R8, 12*4+off+448(dst);  \
+	MOVL R8, 3*4+off+512(dst);   \
+	MOVL R8, 15*4+off+576(dst);  \
+	SHRQ $32, R8;                \
+	MOVL R8, 4*4+off+0(dst);     \
+	MOVL R8, 8*4+off+64(dst);    \
+	MOVL R8, 14*4+off+128(dst);  \
+	MOVL R8, 5*4+off+192(dst);   \
+	MOVL R8, 12*4+off+256(dst);  \
+	MOVL R8, 11*4+off+320(dst);  \
+	MOVL R8, 1*4+off+384(dst);   \
+	MOVL R8, 6*4+off+448(dst);   \
+	MOVL R8, 10*4+off+512(dst);  \
+	MOVL R8, 3*4+off+576(dst);   \
+	                             \
+	MOVL R9, 1*4+off+0(dst);     \
+	MOVL R9, 13*4+off+64(dst);   \
+	MOVL R9, 6*4+off+128(dst);   \
+	MOVL R9, 8*4+off+192(dst);   \
+	MOVL R9, 2*4+off+256(dst);   \
+	MOVL R9, 0*4+off+320(dst);   \
+	MOVL R9, 14*4+off+384(dst);  \
+	MOVL R9, 11*4+off+448(dst);  \
+	MOVL R9, 12*4+off+512(dst);  \
+	MOVL R9, 4*4+off+576(dst);   \
+	SHRQ $32, R9;                \
+	MOVL R9, 5*4+off+0(dst);     \
+	MOVL R9, 15*4+off+64(dst);   \
+	MOVL R9, 9*4+off+128(dst);   \
+	MOVL R9, 1*4+off+192(dst);   \
+	MOVL R9, 11*4+off+256(dst);  \
+	MOVL R9, 7*4+off+320(dst);   \
+	MOVL R9, 13*4+off+384(dst);  \
+	MOVL R9, 3*4+off+448(dst);   \
+	MOVL R9, 6*4+off+512(dst);   \
+	MOVL R9, 10*4+off+576(dst);  \
+	                             \
+	MOVL R10, 2*4+off+0(dst);    \
+	MOVL R10, 1*4+off+64(dst);   \
+	MOVL R10, 15*4+off+128(dst); \
+	MOVL R10, 10*4+off+192(dst); \
+	MOVL R10, 6*4+off+256(dst);  \
+	MOVL R10, 8*4+off+320(dst);  \
+	MOVL R10, 3*4+off+384(dst);  \
+	MOVL R10, 13*4+off+448(dst); \
+	MOVL R10, 14*4+off+512(dst); \
+	MOVL R10, 5*4+off+576(dst);  \
+	SHRQ $32, R10;               \
+	MOVL R10, 6*4+off+0(dst);    \
+	MOVL R10, 11*4+off+64(dst);  \
+	MOVL R10, 2*4+off+128(dst);  \
+	MOVL R10, 9*4+off+192(dst);  \
+	MOVL R10, 1*4+off+256(dst);  \
+	MOVL R10, 13*4+off+320(dst); \
+	MOVL R10, 4*4+off+384(dst);  \
+	MOVL R10, 8*4+off+448(dst);  \
+	MOVL R10, 15*4+off+512(dst); \
+	MOVL R10, 7*4+off+576(dst);  \
+	                             \
+	MOVL R11, 3*4+off+0(dst);    \
+	MOVL R11, 7*4+off+64(dst);   \
+	MOVL R11, 13*4+off+128(dst); \
+	MOVL R11, 12*4+off+192(dst); \
+	MOVL R11, 10*4+off+256(dst); \
+	MOVL R11, 1*4+off+320(dst);  \
+	MOVL R11, 9*4+off+384(dst);  \
+	MOVL R11, 14*4+off+448(dst); \
+	MOVL R11, 0*4+off+512(dst);  \
+	MOVL R11, 6*4+off+576(dst);  \
+	SHRQ $32, R11;               \
+	MOVL R11, 7*4+off+0(dst);    \
+	MOVL R11, 14*4+off+64(dst);  \
+	MOVL R11, 10*4+off+128(dst); \
+	MOVL R11, 0*4+off+192(dst);  \
+	MOVL R11, 5*4+off+256(dst);  \
+	MOVL R11, 9*4+off+320(dst);  \
+	MOVL R11, 12*4+off+384(dst); \
+	MOVL R11, 1*4+off+448(dst);  \
+	MOVL R11, 13*4+off+512(dst); \
+	MOVL R11, 2*4+off+576(dst);  \
+	                             \
+	MOVL R12, 8*4+off+0(dst);    \
+	MOVL R12, 5*4+off+64(dst);   \
+	MOVL R12, 4*4+off+128(dst);  \
+	MOVL R12, 15*4+off+192(dst); \
+	MOVL R12, 14*4+off+256(dst); \
+	MOVL R12, 3*4+off+320(dst);  \
+	MOVL R12, 11*4+off+384(dst); \
+	MOVL R12, 10*4+off+448(dst); \
+	MOVL R12, 7*4+off+512(dst);  \
+	MOVL R12, 1*4+off+576(dst);  \
+	SHRQ $32, R12;               \
+	MOVL R12, 12*4+off+0(dst);   \
+	MOVL R12, 2*4+off+64(dst);   \
+	MOVL R12, 11*4+off+128(dst); \
+	MOVL R12, 4*4+off+192(dst);  \
+	MOVL R12, 0*4+off+256(dst);  \
+	MOVL R12, 15*4+off+320(dst); \
+	MOVL R12, 10*4+off+384(dst); \
+	MOVL R12, 7*4+off+448(dst);  \
+	MOVL R12, 5*4+off+512(dst);  \
+	MOVL R12, 9*4+off+576(dst);  \
+	                             \
+	MOVL R13, 9*4+off+0(dst);    \
+	MOVL R13, 4*4+off+64(dst);   \
+	MOVL R13, 8*4+off+128(dst);  \
+	MOVL R13, 13*4+off+192(dst); \
+	MOVL R13, 3*4+off+256(dst);  \
+	MOVL R13, 5*4+off+320(dst);  \
+	MOVL R13, 7*4+off+384(dst);  \
+	MOVL R13, 15*4+off+448(dst); \
+	MOVL R13, 11*4+off+512(dst); \
+	MOVL R13, 0*4+off+576(dst);  \
+	SHRQ $32, R13;               \
+	MOVL R13, 13*4+off+0(dst);   \
+	MOVL R13, 10*4+off+64(dst);  \
+	MOVL R13, 0*4+off+128(dst);  \
+	MOVL R13, 3*4+off+192(dst);  \
+	MOVL R13, 9*4+off+256(dst);  \
+	MOVL R13, 6*4+off+320(dst);  \
+	MOVL R13, 15*4+off+384(dst); \
+	MOVL R13, 4*4+off+448(dst);  \
+	MOVL R13, 2*4+off+512(dst);  \
+	MOVL R13, 12*4+off+576(dst); \
+	                             \
+	MOVL R14, 10*4+off+0(dst);   \
+	MOVL R14, 12*4+off+64(dst);  \
+	MOVL R14, 1*4+off+128(dst);  \
+	MOVL R14, 6*4+off+192(dst);  \
+	MOVL R14, 13*4+off+256(dst); \
+	MOVL R14, 4*4+off+320(dst);  \
+	MOVL R14, 0*4+off+384(dst);  \
+	MOVL R14, 2*4+off+448(dst);  \
+	MOVL R14, 8*4+off+512(dst);  \
+	MOVL R14, 14*4+off+576(dst); \
+	SHRQ $32, R14;               \
+	MOVL R14, 14*4+off+0(dst);   \
+	MOVL R14, 3*4+off+64(dst);   \
+	MOVL R14, 7*4+off+128(dst);  \
+	MOVL R14, 2*4+off+192(dst);  \
+	MOVL R14, 15*4+off+256(dst); \
+	MOVL R14, 12*4+off+320(dst); \
+	MOVL R14, 6*4+off+384(dst);  \
+	MOVL R14, 0*4+off+448(dst);  \
+	MOVL R14, 9*4+off+512(dst);  \
+	MOVL R14, 11*4+off+576(dst); \
+	                             \
+	MOVL R15, 11*4+off+0(dst);   \
+	MOVL R15, 0*4+off+64(dst);   \
+	MOVL R15, 12*4+off+128(dst); \
+	MOVL R15, 7*4+off+192(dst);  \
+	MOVL R15, 8*4+off+256(dst);  \
+	MOVL R15, 14*4+off+320(dst); \
+	MOVL R15, 2*4+off+384(dst);  \
+	MOVL R15, 5*4+off+448(dst);  \
+	MOVL R15, 1*4+off+512(dst);  \
+	MOVL R15, 13*4+off+576(dst); \
+	SHRQ $32, R15;               \
+	MOVL R15, 15*4+off+0(dst);   \
+	MOVL R15, 6*4+off+64(dst);   \
+	MOVL R15, 3*4+off+128(dst);  \
+	MOVL R15, 11*4+off+192(dst); \
+	MOVL R15, 7*4+off+256(dst);  \
+	MOVL R15, 10*4+off+320(dst); \
+	MOVL R15, 5*4+off+384(dst);  \
+	MOVL R15, 9*4+off+448(dst);  \
+	MOVL R15, 4*4+off+512(dst);  \
+	MOVL R15, 8*4+off+576(dst)
+
+#define BLAKE2s_SSE2() \
+	PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15);               \
+	ROUND_SSE2(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8);                 \
+	ROUND_SSE2(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8);     \
+	ROUND_SSE2(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8); \
+	ROUND_SSE2(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8); \
+	ROUND_SSE2(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8); \
+	ROUND_SSE2(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8); \
+	ROUND_SSE2(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8); \
+	ROUND_SSE2(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8); \
+	ROUND_SSE2(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8); \
+	ROUND_SSE2(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8)
+
+#define BLAKE2s_SSSE3() \
+	PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15);                          \
+	ROUND_SSSE3(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8, X13, X14);                 \
+	ROUND_SSSE3(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8, X13, X14);     \
+	ROUND_SSSE3(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8, X13, X14); \
+	ROUND_SSSE3(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8, X13, X14); \
+	ROUND_SSSE3(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8, X13, X14); \
+	ROUND_SSSE3(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8, X13, X14); \
+	ROUND_SSSE3(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8, X13, X14); \
+	ROUND_SSSE3(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8, X13, X14); \
+	ROUND_SSSE3(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8, X13, X14); \
+	ROUND_SSSE3(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8, X13, X14)
+
+#define BLAKE2s_SSE4() \
+	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15); \
+	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
+	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3); \
+	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
+	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4); \
+	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
+	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8); \
+	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
+	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13); \
+	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
+	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9); \
+	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
+	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11); \
+	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
+	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10); \
+	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
+	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5); \
+	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
+	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0); \
+	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14)
+
+#define HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_FUNC) \
+	MOVQ  h, AX;                   \
+	MOVQ  c, BX;                   \
+	MOVL  flag, CX;                \
+	MOVQ  blocks_base, SI;         \
+	MOVQ  blocks_len, DX;          \
+	                               \
+	MOVQ  SP, BP;                  \
+	ADDQ  $15, BP;                 \
+	ANDQ  $~15, BP;                \
+	                               \
+	MOVQ  0(BX), R9;               \
+	MOVQ  R9, 0(BP);               \
+	MOVQ  CX, 8(BP);               \
+	                               \
+	MOVOU 0(AX), X0;               \
+	MOVOU 16(AX), X1;              \
+	MOVOU iv0<>(SB), X2;           \
+	MOVOU iv1<>(SB), X3            \
+	                               \
+	MOVOU counter<>(SB), X12;      \
+	MOVOU rol16<>(SB), X13;        \
+	MOVOU rol8<>(SB), X14;         \
+	MOVO  0(BP), X15;              \
+	                               \
+	loop:                          \
+	MOVO  X0, X4;                  \
+	MOVO  X1, X5;                  \
+	MOVO  X2, X6;                  \
+	MOVO  X3, X7;                  \
+	                               \
+	PADDQ X12, X15;                \
+	PXOR  X15, X7;                 \
+	                               \
+	BLAKE2s_FUNC();                \
+	                               \
+	PXOR  X4, X0;                  \
+	PXOR  X5, X1;                  \
+	PXOR  X6, X0;                  \
+	PXOR  X7, X1;                  \
+	                               \
+	LEAQ  64(SI), SI;              \
+	SUBQ  $64, DX;                 \
+	JNE   loop;                    \
+	                               \
+	MOVO  X15, 0(BP);              \
+	MOVQ  0(BP), R9;               \
+	MOVQ  R9, 0(BX);               \
+	                               \
+	MOVOU X0, 0(AX);               \
+	MOVOU X1, 16(AX)
+
+// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+TEXT ·hashBlocksSSE2(SB), 0, $672-48 // frame = 656 + 16 byte alignment
+	HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE2)
+	RET
+
+// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+TEXT ·hashBlocksSSSE3(SB), 0, $672-48 // frame = 656 + 16 byte alignment
+	HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSSE3)
+	RET
+
+// func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+TEXT ·hashBlocksSSE4(SB), 0, $32-48 // frame = 16 + 16 byte alignment
+	HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE4)
+	RET
@@ -0,0 +1,178 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blake2s
+
+import (
+	"math/bits"
+)
+
+// the precomputed values for BLAKE2s
+// there are 10 16-byte arrays - one for each round
+// the entries are calculated from the sigma constants.
+var precomputed = [10][16]byte{
+	{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
+	{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
+	{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
+	{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
+	{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
+	{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
+	{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
+	{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
+	{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
+	{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
+}
+
+func hashBlocksGeneric(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) {
+	var m [16]uint32
+	c0, c1 := c[0], c[1]
+
+	for i := 0; i < len(blocks); {
+		c0 += BlockSize
+		if c0 < BlockSize {
+			c1++
+		}
+
+		v0, v1, v2, v3, v4, v5, v6, v7 := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
+		v8, v9, v10, v11, v12, v13, v14, v15 := iv[0], iv[1], iv[2], iv[3], iv[4], iv[5], iv[6], iv[7]
+		v12 ^= c0
+		v13 ^= c1
+		v14 ^= flag
+
+		for j := range m {
+			m[j] = uint32(blocks[i]) | uint32(blocks[i+1])<<8 | uint32(blocks[i+2])<<16 | uint32(blocks[i+3])<<24
+			i += 4
+		}
+
+		for k := range precomputed {
+			s := &(precomputed[k])
+
+			v0 += m[s[0]]
+			v0 += v4
+			v12 ^= v0
+			v12 = bits.RotateLeft32(v12, -16)
+			v8 += v12
+			v4 ^= v8
+			v4 = bits.RotateLeft32(v4, -12)
+			v1 += m[s[1]]
+			v1 += v5
+			v13 ^= v1
+			v13 = bits.RotateLeft32(v13, -16)
+			v9 += v13
+			v5 ^= v9
+			v5 = bits.RotateLeft32(v5, -12)
+			v2 += m[s[2]]
+			v2 += v6
+			v14 ^= v2
+			v14 = bits.RotateLeft32(v14, -16)
+			v10 += v14
+			v6 ^= v10
+			v6 = bits.RotateLeft32(v6, -12)
+			v3 += m[s[3]]
+			v3 += v7
+			v15 ^= v3
+			v15 = bits.RotateLeft32(v15, -16)
+			v11 += v15
+			v7 ^= v11
+			v7 = bits.RotateLeft32(v7, -12)
+
+			v0 += m[s[4]]
+			v0 += v4
+			v12 ^= v0
+			v12 = bits.RotateLeft32(v12, -8)
+			v8 += v12
+			v4 ^= v8
+			v4 = bits.RotateLeft32(v4, -7)
+			v1 += m[s[5]]
+			v1 += v5
+			v13 ^= v1
+			v13 = bits.RotateLeft32(v13, -8)
+			v9 += v13
+			v5 ^= v9
+			v5 = bits.RotateLeft32(v5, -7)
+			v2 += m[s[6]]
+			v2 += v6
+			v14 ^= v2
+			v14 = bits.RotateLeft32(v14, -8)
+			v10 += v14
+			v6 ^= v10
+			v6 = bits.RotateLeft32(v6, -7)
+			v3 += m[s[7]]
+			v3 += v7
+			v15 ^= v3
+			v15 = bits.RotateLeft32(v15, -8)
+			v11 += v15
+			v7 ^= v11
+			v7 = bits.RotateLeft32(v7, -7)
+
+			v0 += m[s[8]]
+			v0 += v5
+			v15 ^= v0
+			v15 = bits.RotateLeft32(v15, -16)
+			v10 += v15
+			v5 ^= v10
+			v5 = bits.RotateLeft32(v5, -12)
+			v1 += m[s[9]]
+			v1 += v6
+			v12 ^= v1
+			v12 = bits.RotateLeft32(v12, -16)
+			v11 += v12
+			v6 ^= v11
+			v6 = bits.RotateLeft32(v6, -12)
+			v2 += m[s[10]]
+			v2 += v7
+			v13 ^= v2
+			v13 = bits.RotateLeft32(v13, -16)
+			v8 += v13
+			v7 ^= v8
+			v7 = bits.RotateLeft32(v7, -12)
+			v3 += m[s[11]]
+			v3 += v4
+			v14 ^= v3
+			v14 = bits.RotateLeft32(v14, -16)
+			v9 += v14
+			v4 ^= v9
+			v4 = bits.RotateLeft32(v4, -12)
+
+			v0 += m[s[12]]
+			v0 += v5
+			v15 ^= v0
+			v15 = bits.RotateLeft32(v15, -8)
+			v10 += v15
+			v5 ^= v10
+			v5 = bits.RotateLeft32(v5, -7)
+			v1 += m[s[13]]
+			v1 += v6
+			v12 ^= v1
+			v12 = bits.RotateLeft32(v12, -8)
+			v11 += v12
+			v6 ^= v11
+			v6 = bits.RotateLeft32(v6, -7)
+			v2 += m[s[14]]
+			v2 += v7
+			v13 ^= v2
+			v13 = bits.RotateLeft32(v13, -8)
+			v8 += v13
+			v7 ^= v8
+			v7 = bits.RotateLeft32(v7, -7)
+			v3 += m[s[15]]
+			v3 += v4
+			v14 ^= v3
+			v14 = bits.RotateLeft32(v14, -8)
+			v9 += v14
+			v4 ^= v9
+			v4 = bits.RotateLeft32(v4, -7)
+		}
+
+		h[0] ^= v0 ^ v8
+		h[1] ^= v1 ^ v9
+		h[2] ^= v2 ^ v10
+		h[3] ^= v3 ^ v11
+		h[4] ^= v4 ^ v12
+		h[5] ^= v5 ^ v13
+		h[6] ^= v6 ^ v14
+		h[7] ^= v7 ^ v15
+	}
+	c[0], c[1] = c0, c1
+}
@@ -0,0 +1,18 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (!amd64 && !386) || !gc || purego
+// +build !amd64,!386 !gc purego
+
+package blake2s
+
+var (
+	useSSE4  = false
+	useSSSE3 = false
+	useSSE2  = false
+)
+
+func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) {
+	hashBlocksGeneric(h, c, flag, blocks)
+}
@@ -0,0 +1,178 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blake2s
+
+import (
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+// XOF defines the interface to hash functions that
+// support arbitrary-length output.
+type XOF interface {
+	// Write absorbs more data into the hash's state. It panics if called
+	// after Read.
+	io.Writer
+
+	// Read reads more output from the hash. It returns io.EOF if the limit
+	// has been reached.
+	io.Reader
+
+	// Clone returns a copy of the XOF in its current state.
+	Clone() XOF
+
+	// Reset resets the XOF to its initial state.
+	Reset()
+}
+
+// OutputLengthUnknown can be used as the size argument to NewXOF to indicate
+// the length of the output is not known in advance.
+const OutputLengthUnknown = 0
+
+// magicUnknownOutputLength is a magic value for the output size that indicates
+// an unknown number of output bytes.
+const magicUnknownOutputLength = 65535
+
+// maxOutputLength is the absolute maximum number of bytes to produce when the
+// number of output bytes is unknown.
+const maxOutputLength = (1 << 32) * 32
+
+// NewXOF creates a new variable-output-length hash. The hash either produce a
+// known number of bytes (1 <= size < 65535), or an unknown number of bytes
+// (size == OutputLengthUnknown). In the latter case, an absolute limit of
+// 128GiB applies.
+//
+// A non-nil key turns the hash into a MAC. The key must between
+// zero and 32 bytes long.
+func NewXOF(size uint16, key []byte) (XOF, error) {
+	if len(key) > Size {
+		return nil, errKeySize
+	}
+	if size == magicUnknownOutputLength {
+		// 2^16-1 indicates an unknown number of bytes and thus isn't a
+		// valid length.
+		return nil, errors.New("blake2s: XOF length too large")
+	}
+	if size == OutputLengthUnknown {
+		size = magicUnknownOutputLength
+	}
+	x := &xof{
+		d: digest{
+			size:   Size,
+			keyLen: len(key),
+		},
+		length: size,
+	}
+	copy(x.d.key[:], key)
+	x.Reset()
+	return x, nil
+}
+
+type xof struct {
+	d                digest
+	length           uint16
+	remaining        uint64
+	cfg, root, block [Size]byte
+	offset           int
+	nodeOffset       uint32
+	readMode         bool
+}
+
+func (x *xof) Write(p []byte) (n int, err error) {
+	if x.readMode {
+		panic("blake2s: write to XOF after read")
+	}
+	return x.d.Write(p)
+}
+
+func (x *xof) Clone() XOF {
+	clone := *x
+	return &clone
+}
+
+func (x *xof) Reset() {
+	x.cfg[0] = byte(Size)
+	binary.LittleEndian.PutUint32(x.cfg[4:], uint32(Size)) // leaf length
+	binary.LittleEndian.PutUint16(x.cfg[12:], x.length)    // XOF length
+	x.cfg[15] = byte(Size)                                 // inner hash size
+
+	x.d.Reset()
+	x.d.h[3] ^= uint32(x.length)
+
+	x.remaining = uint64(x.length)
+	if x.remaining == magicUnknownOutputLength {
+		x.remaining = maxOutputLength
+	}
+	x.offset, x.nodeOffset = 0, 0
+	x.readMode = false
+}
+
+func (x *xof) Read(p []byte) (n int, err error) {
+	if !x.readMode {
+		x.d.finalize(&x.root)
+		x.readMode = true
+	}
+
+	if x.remaining == 0 {
+		return 0, io.EOF
+	}
+
+	n = len(p)
+	if uint64(n) > x.remaining {
+		n = int(x.remaining)
+		p = p[:n]
+	}
+
+	if x.offset > 0 {
+		blockRemaining := Size - x.offset
+		if n < blockRemaining {
+			x.offset += copy(p, x.block[x.offset:])
+			x.remaining -= uint64(n)
+			return
+		}
+		copy(p, x.block[x.offset:])
+		p = p[blockRemaining:]
+		x.offset = 0
+		x.remaining -= uint64(blockRemaining)
+	}
+
+	for len(p) >= Size {
+		binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
+		x.nodeOffset++
+
+		x.d.initConfig(&x.cfg)
+		x.d.Write(x.root[:])
+		x.d.finalize(&x.block)
+
+		copy(p, x.block[:])
+		p = p[Size:]
+		x.remaining -= uint64(Size)
+	}
+
+	if todo := len(p); todo > 0 {
+		if x.remaining < uint64(Size) {
+			x.cfg[0] = byte(x.remaining)
+		}
+		binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
+		x.nodeOffset++
+
+		x.d.initConfig(&x.cfg)
+		x.d.Write(x.root[:])
+		x.d.finalize(&x.block)
+
+		x.offset = copy(p, x.block[:todo])
+		x.remaining -= uint64(todo)
+	}
+
+	return
+}
+
+func (d *digest) initConfig(cfg *[Size]byte) {
+	d.offset, d.c[0], d.c[1] = 0, 0, 0
+	for i := range d.h {
+		d.h[i] = iv[i] ^ binary.LittleEndian.Uint32(cfg[i*4:])
+	}
+}
@@ -0,0 +1,22 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build go1.9
+// +build go1.9
+
+package blake2s
+
+import (
+	"crypto"
+	"hash"
+)
+
+func init() {
+	newHash256 := func() hash.Hash {
+		h, _ := New256(nil)
+		return h
+	}
+
+	crypto.RegisterHash(crypto.BLAKE2s_256, newHash256)
+}
@@ -0,0 +1,98 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD and its
+// extended nonce variant XChaCha20-Poly1305, as specified in RFC 8439 and
+// draft-irtf-cfrg-xchacha-01.
+package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305"
+
+import (
+	"crypto/cipher"
+	"errors"
+)
+
+const (
+	// KeySize is the size of the key used by this AEAD, in bytes.
+	KeySize = 32
+
+	// NonceSize is the size of the nonce used with the standard variant of this
+	// AEAD, in bytes.
+	//
+	// Note that this is too short to be safely generated at random if the same
+	// key is reused more than 2³² times.
+	NonceSize = 12
+
+	// NonceSizeX is the size of the nonce used with the XChaCha20-Poly1305
+	// variant of this AEAD, in bytes.
+	NonceSizeX = 24
+
+	// Overhead is the size of the Poly1305 authentication tag, and the
+	// difference between a ciphertext length and its plaintext.
+	Overhead = 16
+)
+
+type chacha20poly1305 struct {
+	key [KeySize]byte
+}
+
+// New returns a ChaCha20-Poly1305 AEAD that uses the given 256-bit key.
+func New(key []byte) (cipher.AEAD, error) {
+	if len(key) != KeySize {
+		return nil, errors.New("chacha20poly1305: bad key length")
+	}
+	ret := new(chacha20poly1305)
+	copy(ret.key[:], key)
+	return ret, nil
+}
+
+func (c *chacha20poly1305) NonceSize() int {
+	return NonceSize
+}
+
+func (c *chacha20poly1305) Overhead() int {
+	return Overhead
+}
+
+func (c *chacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte {
+	if len(nonce) != NonceSize {
+		panic("chacha20poly1305: bad nonce length passed to Seal")
+	}
+
+	if uint64(len(plaintext)) > (1<<38)-64 {
+		panic("chacha20poly1305: plaintext too large")
+	}
+
+	return c.seal(dst, nonce, plaintext, additionalData)
+}
+
+var errOpen = errors.New("chacha20poly1305: message authentication failed")
+
+func (c *chacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
+	if len(nonce) != NonceSize {
+		panic("chacha20poly1305: bad nonce length passed to Open")
+	}
+	if len(ciphertext) < 16 {
+		return nil, errOpen
+	}
+	if uint64(len(ciphertext)) > (1<<38)-48 {
+		panic("chacha20poly1305: ciphertext too large")
+	}
+
+	return c.open(dst, nonce, ciphertext, additionalData)
+}
+
+// sliceForAppend takes a slice and a requested number of bytes. It returns a
+// slice with the contents of the given slice followed by that many bytes and a
+// second slice that aliases into it and contains only the extra bytes. If the
+// original slice has sufficient capacity then no allocation is performed.
+func sliceForAppend(in []byte, n int) (head, tail []byte) {
+	if total := len(in) + n; cap(in) >= total {
+		head = in[:total]
+	} else {
+		head = make([]byte, total)
+		copy(head, in)
+	}
+	tail = head[len(in):]
+	return
+}
@@ -0,0 +1,87 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+// +build gc,!purego
+
+package chacha20poly1305
+
+import (
+	"encoding/binary"
+
+	"golang.org/x/crypto/internal/alias"
+	"golang.org/x/sys/cpu"
+)
+
+//go:noescape
+func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool
+
+//go:noescape
+func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte)
+
+var (
+	useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
+)
+
+// setupState writes a ChaCha20 input matrix to state. See
+// https://tools.ietf.org/html/rfc7539#section-2.3.
+func setupState(state *[16]uint32, key *[32]byte, nonce []byte) {
+	state[0] = 0x61707865
+	state[1] = 0x3320646e
+	state[2] = 0x79622d32
+	state[3] = 0x6b206574
+
+	state[4] = binary.LittleEndian.Uint32(key[0:4])
+	state[5] = binary.LittleEndian.Uint32(key[4:8])
+	state[6] = binary.LittleEndian.Uint32(key[8:12])
+	state[7] = binary.LittleEndian.Uint32(key[12:16])
+	state[8] = binary.LittleEndian.Uint32(key[16:20])
+	state[9] = binary.LittleEndian.Uint32(key[20:24])
+	state[10] = binary.LittleEndian.Uint32(key[24:28])
+	state[11] = binary.LittleEndian.Uint32(key[28:32])
+
+	state[12] = 0
+	state[13] = binary.LittleEndian.Uint32(nonce[0:4])
+	state[14] = binary.LittleEndian.Uint32(nonce[4:8])
+	state[15] = binary.LittleEndian.Uint32(nonce[8:12])
+}
+
+func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte {
+	if !cpu.X86.HasSSSE3 {
+		return c.sealGeneric(dst, nonce, plaintext, additionalData)
+	}
+
+	var state [16]uint32
+	setupState(&state, &c.key, nonce)
+
+	ret, out := sliceForAppend(dst, len(plaintext)+16)
+	if alias.InexactOverlap(out, plaintext) {
+		panic("chacha20poly1305: invalid buffer overlap")
+	}
+	chacha20Poly1305Seal(out[:], state[:], plaintext, additionalData)
+	return ret
+}
+
+func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
+	if !cpu.X86.HasSSSE3 {
+		return c.openGeneric(dst, nonce, ciphertext, additionalData)
+	}
+
+	var state [16]uint32
+	setupState(&state, &c.key, nonce)
+
+	ciphertext = ciphertext[:len(ciphertext)-16]
+	ret, out := sliceForAppend(dst, len(ciphertext))
+	if alias.InexactOverlap(out, ciphertext) {
+		panic("chacha20poly1305: invalid buffer overlap")
+	}
+	if !chacha20Poly1305Open(out, state[:], ciphertext, additionalData) {
+		for i := range out {
+			out[i] = 0
+		}
+		return nil, errOpen
+	}
+
+	return ret, nil
+}
@@ -0,0 +1,81 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package chacha20poly1305
+
+import (
+	"encoding/binary"
+
+	"golang.org/x/crypto/chacha20"
+	"golang.org/x/crypto/internal/alias"
+	"golang.org/x/crypto/internal/poly1305"
+)
+
+func writeWithPadding(p *poly1305.MAC, b []byte) {
+	p.Write(b)
+	if rem := len(b) % 16; rem != 0 {
+		var buf [16]byte
+		padLen := 16 - rem
+		p.Write(buf[:padLen])
+	}
+}
+
+func writeUint64(p *poly1305.MAC, n int) {
+	var buf [8]byte
+	binary.LittleEndian.PutUint64(buf[:], uint64(n))
+	p.Write(buf[:])
+}
+
+func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte {
+	ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize)
+	ciphertext, tag := out[:len(plaintext)], out[len(plaintext):]
+	if alias.InexactOverlap(out, plaintext) {
+		panic("chacha20poly1305: invalid buffer overlap")
+	}
+
+	var polyKey [32]byte
+	s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce)
+	s.XORKeyStream(polyKey[:], polyKey[:])
+	s.SetCounter(1) // set the counter to 1, skipping 32 bytes
+	s.XORKeyStream(ciphertext, plaintext)
+
+	p := poly1305.New(&polyKey)
+	writeWithPadding(p, additionalData)
+	writeWithPadding(p, ciphertext)
+	writeUint64(p, len(additionalData))
+	writeUint64(p, len(plaintext))
+	p.Sum(tag[:0])
+
+	return ret
+}
+
+func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
+	tag := ciphertext[len(ciphertext)-16:]
+	ciphertext = ciphertext[:len(ciphertext)-16]
+
+	var polyKey [32]byte
+	s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce)
+	s.XORKeyStream(polyKey[:], polyKey[:])
+	s.SetCounter(1) // set the counter to 1, skipping 32 bytes
+
+	p := poly1305.New(&polyKey)
+	writeWithPadding(p, additionalData)
+	writeWithPadding(p, ciphertext)
+	writeUint64(p, len(additionalData))
+	writeUint64(p, len(ciphertext))
+
+	ret, out := sliceForAppend(dst, len(ciphertext))
+	if alias.InexactOverlap(out, ciphertext) {
+		panic("chacha20poly1305: invalid buffer overlap")
+	}
+	if !p.Verify(tag) {
+		for i := range out {
+			out[i] = 0
+		}
+		return nil, errOpen
+	}
+
+	s.XORKeyStream(out, ciphertext)
+	return ret, nil
+}
@@ -0,0 +1,16 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || !gc || purego
+// +build !amd64 !gc purego
+
+package chacha20poly1305
+
+func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte {
+	return c.sealGeneric(dst, nonce, plaintext, additionalData)
+}
+
+func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
+	return c.openGeneric(dst, nonce, ciphertext, additionalData)
+}
@@ -0,0 +1,86 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package chacha20poly1305
+
+import (
+	"crypto/cipher"
+	"errors"
+
+	"golang.org/x/crypto/chacha20"
+)
+
+type xchacha20poly1305 struct {
+	key [KeySize]byte
+}
+
+// NewX returns a XChaCha20-Poly1305 AEAD that uses the given 256-bit key.
+//
+// XChaCha20-Poly1305 is a ChaCha20-Poly1305 variant that takes a longer nonce,
+// suitable to be generated randomly without risk of collisions. It should be
+// preferred when nonce uniqueness cannot be trivially ensured, or whenever
+// nonces are randomly generated.
+func NewX(key []byte) (cipher.AEAD, error) {
+	if len(key) != KeySize {
+		return nil, errors.New("chacha20poly1305: bad key length")
+	}
+	ret := new(xchacha20poly1305)
+	copy(ret.key[:], key)
+	return ret, nil
+}
+
+func (*xchacha20poly1305) NonceSize() int {
+	return NonceSizeX
+}
+
+func (*xchacha20poly1305) Overhead() int {
+	return Overhead
+}
+
+func (x *xchacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte {
+	if len(nonce) != NonceSizeX {
+		panic("chacha20poly1305: bad nonce length passed to Seal")
+	}
+
+	// XChaCha20-Poly1305 technically supports a 64-bit counter, so there is no
+	// size limit. However, since we reuse the ChaCha20-Poly1305 implementation,
+	// the second half of the counter is not available. This is unlikely to be
+	// an issue because the cipher.AEAD API requires the entire message to be in
+	// memory, and the counter overflows at 256 GB.
+	if uint64(len(plaintext)) > (1<<38)-64 {
+		panic("chacha20poly1305: plaintext too large")
+	}
+
+	c := new(chacha20poly1305)
+	hKey, _ := chacha20.HChaCha20(x.key[:], nonce[0:16])
+	copy(c.key[:], hKey)
+
+	// The first 4 bytes of the final nonce are unused counter space.
+	cNonce := make([]byte, NonceSize)
+	copy(cNonce[4:12], nonce[16:24])
+
+	return c.seal(dst, cNonce[:], plaintext, additionalData)
+}
+
+func (x *xchacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
+	if len(nonce) != NonceSizeX {
+		panic("chacha20poly1305: bad nonce length passed to Open")
+	}
+	if len(ciphertext) < 16 {
+		return nil, errOpen
+	}
+	if uint64(len(ciphertext)) > (1<<38)-48 {
+		panic("chacha20poly1305: ciphertext too large")
+	}
+
+	c := new(chacha20poly1305)
+	hKey, _ := chacha20.HChaCha20(x.key[:], nonce[0:16])
+	copy(c.key[:], hKey)
+
+	// The first 4 bytes of the final nonce are unused counter space.
+	cNonce := make([]byte, NonceSize)
+	copy(cNonce[4:12], nonce[16:24])
+
+	return c.open(dst, cNonce[:], ciphertext, additionalData)
+}
@@ -0,0 +1,824 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cryptobyte
+
+import (
+	encoding_asn1 "encoding/asn1"
+	"fmt"
+	"math/big"
+	"reflect"
+	"time"
+
+	"golang.org/x/crypto/cryptobyte/asn1"
+)
+
+// This file contains ASN.1-related methods for String and Builder.
+
+// Builder
+
+// AddASN1Int64 appends a DER-encoded ASN.1 INTEGER.
+func (b *Builder) AddASN1Int64(v int64) {
+	b.addASN1Signed(asn1.INTEGER, v)
+}
+
+// AddASN1Int64WithTag appends a DER-encoded ASN.1 INTEGER with the
+// given tag.
+func (b *Builder) AddASN1Int64WithTag(v int64, tag asn1.Tag) {
+	b.addASN1Signed(tag, v)
+}
+
+// AddASN1Enum appends a DER-encoded ASN.1 ENUMERATION.
+func (b *Builder) AddASN1Enum(v int64) {
+	b.addASN1Signed(asn1.ENUM, v)
+}
+
+func (b *Builder) addASN1Signed(tag asn1.Tag, v int64) {
+	b.AddASN1(tag, func(c *Builder) {
+		length := 1
+		for i := v; i >= 0x80 || i < -0x80; i >>= 8 {
+			length++
+		}
+
+		for ; length > 0; length-- {
+			i := v >> uint((length-1)*8) & 0xff
+			c.AddUint8(uint8(i))
+		}
+	})
+}
+
+// AddASN1Uint64 appends a DER-encoded ASN.1 INTEGER.
+func (b *Builder) AddASN1Uint64(v uint64) {
+	b.AddASN1(asn1.INTEGER, func(c *Builder) {
+		length := 1
+		for i := v; i >= 0x80; i >>= 8 {
+			length++
+		}
+
+		for ; length > 0; length-- {
+			i := v >> uint((length-1)*8) & 0xff
+			c.AddUint8(uint8(i))
+		}
+	})
+}
+
+// AddASN1BigInt appends a DER-encoded ASN.1 INTEGER.
+func (b *Builder) AddASN1BigInt(n *big.Int) {
+	if b.err != nil {
+		return
+	}
+
+	b.AddASN1(asn1.INTEGER, func(c *Builder) {
+		if n.Sign() < 0 {
+			// A negative number has to be converted to two's-complement form. So we
+			// invert and subtract 1. If the most-significant-bit isn't set then
+			// we'll need to pad the beginning with 0xff in order to keep the number
+			// negative.
+			nMinus1 := new(big.Int).Neg(n)
+			nMinus1.Sub(nMinus1, bigOne)
+			bytes := nMinus1.Bytes()
+			for i := range bytes {
+				bytes[i] ^= 0xff
+			}
+			if len(bytes) == 0 || bytes[0]&0x80 == 0 {
+				c.add(0xff)
+			}
+			c.add(bytes...)
+		} else if n.Sign() == 0 {
+			c.add(0)
+		} else {
+			bytes := n.Bytes()
+			if bytes[0]&0x80 != 0 {
+				c.add(0)
+			}
+			c.add(bytes...)
+		}
+	})
+}
+
+// AddASN1OctetString appends a DER-encoded ASN.1 OCTET STRING.
+func (b *Builder) AddASN1OctetString(bytes []byte) {
+	b.AddASN1(asn1.OCTET_STRING, func(c *Builder) {
+		c.AddBytes(bytes)
+	})
+}
+
+const generalizedTimeFormatStr = "20060102150405Z0700"
+
+// AddASN1GeneralizedTime appends a DER-encoded ASN.1 GENERALIZEDTIME.
+func (b *Builder) AddASN1GeneralizedTime(t time.Time) {
+	if t.Year() < 0 || t.Year() > 9999 {
+		b.err = fmt.Errorf("cryptobyte: cannot represent %v as a GeneralizedTime", t)
+		return
+	}
+	b.AddASN1(asn1.GeneralizedTime, func(c *Builder) {
+		c.AddBytes([]byte(t.Format(generalizedTimeFormatStr)))
+	})
+}
+
+// AddASN1UTCTime appends a DER-encoded ASN.1 UTCTime.
+func (b *Builder) AddASN1UTCTime(t time.Time) {
+	b.AddASN1(asn1.UTCTime, func(c *Builder) {
+		// As utilized by the X.509 profile, UTCTime can only
+		// represent the years 1950 through 2049.
+		if t.Year() < 1950 || t.Year() >= 2050 {
+			b.err = fmt.Errorf("cryptobyte: cannot represent %v as a UTCTime", t)
+			return
+		}
+		c.AddBytes([]byte(t.Format(defaultUTCTimeFormatStr)))
+	})
+}
+
+// AddASN1BitString appends a DER-encoded ASN.1 BIT STRING. This does not
+// support BIT STRINGs that are not a whole number of bytes.
+func (b *Builder) AddASN1BitString(data []byte) {
+	b.AddASN1(asn1.BIT_STRING, func(b *Builder) {
+		b.AddUint8(0)
+		b.AddBytes(data)
+	})
+}
+
+func (b *Builder) addBase128Int(n int64) {
+	var length int
+	if n == 0 {
+		length = 1
+	} else {
+		for i := n; i > 0; i >>= 7 {
+			length++
+		}
+	}
+
+	for i := length - 1; i >= 0; i-- {
+		o := byte(n >> uint(i*7))
+		o &= 0x7f
+		if i != 0 {
+			o |= 0x80
+		}
+
+		b.add(o)
+	}
+}
+
+func isValidOID(oid encoding_asn1.ObjectIdentifier) bool {
+	if len(oid) < 2 {
+		return false
+	}
+
+	if oid[0] > 2 || (oid[0] <= 1 && oid[1] >= 40) {
+		return false
+	}
+
+	for _, v := range oid {
+		if v < 0 {
+			return false
+		}
+	}
+
+	return true
+}
+
+func (b *Builder) AddASN1ObjectIdentifier(oid encoding_asn1.ObjectIdentifier) {
+	b.AddASN1(asn1.OBJECT_IDENTIFIER, func(b *Builder) {
+		if !isValidOID(oid) {
+			b.err = fmt.Errorf("cryptobyte: invalid OID: %v", oid)
+			return
+		}
+
+		b.addBase128Int(int64(oid[0])*40 + int64(oid[1]))
+		for _, v := range oid[2:] {
+			b.addBase128Int(int64(v))
+		}
+	})
+}
+
+func (b *Builder) AddASN1Boolean(v bool) {
+	b.AddASN1(asn1.BOOLEAN, func(b *Builder) {
+		if v {
+			b.AddUint8(0xff)
+		} else {
+			b.AddUint8(0)
+		}
+	})
+}
+
+func (b *Builder) AddASN1NULL() {
+	b.add(uint8(asn1.NULL), 0)
+}
+
+// MarshalASN1 calls encoding_asn1.Marshal on its input and appends the result if
+// successful or records an error if one occurred.
+func (b *Builder) MarshalASN1(v interface{}) {
+	// NOTE(martinkr): This is somewhat of a hack to allow propagation of
+	// encoding_asn1.Marshal errors into Builder.err. N.B. if you call MarshalASN1 with a
+	// value embedded into a struct, its tag information is lost.
+	if b.err != nil {
+		return
+	}
+	bytes, err := encoding_asn1.Marshal(v)
+	if err != nil {
+		b.err = err
+		return
+	}
+	b.AddBytes(bytes)
+}
+
+// AddASN1 appends an ASN.1 object. The object is prefixed with the given tag.
+// Tags greater than 30 are not supported and result in an error (i.e.
+// low-tag-number form only). The child builder passed to the
+// BuilderContinuation can be used to build the content of the ASN.1 object.
+func (b *Builder) AddASN1(tag asn1.Tag, f BuilderContinuation) {
+	if b.err != nil {
+		return
+	}
+	// Identifiers with the low five bits set indicate high-tag-number format
+	// (two or more octets), which we don't support.
+	if tag&0x1f == 0x1f {
+		b.err = fmt.Errorf("cryptobyte: high-tag number identifier octects not supported: 0x%x", tag)
+		return
+	}
+	b.AddUint8(uint8(tag))
+	b.addLengthPrefixed(1, true, f)
+}
+
+// String
+
+// ReadASN1Boolean decodes an ASN.1 BOOLEAN and converts it to a boolean
+// representation into out and advances. It reports whether the read
+// was successful.
+func (s *String) ReadASN1Boolean(out *bool) bool {
+	var bytes String
+	if !s.ReadASN1(&bytes, asn1.BOOLEAN) || len(bytes) != 1 {
+		return false
+	}
+
+	switch bytes[0] {
+	case 0:
+		*out = false
+	case 0xff:
+		*out = true
+	default:
+		return false
+	}
+
+	return true
+}
+
+// ReadASN1Integer decodes an ASN.1 INTEGER into out and advances. If out does
+// not point to an integer, to a big.Int, or to a []byte it panics. Only
+// positive and zero values can be decoded into []byte, and they are returned as
+// big-endian binary values that share memory with s. Positive values will have
+// no leading zeroes, and zero will be returned as a single zero byte.
+// ReadASN1Integer reports whether the read was successful.
+func (s *String) ReadASN1Integer(out interface{}) bool {
+	switch out := out.(type) {
+	case *int, *int8, *int16, *int32, *int64:
+		var i int64
+		if !s.readASN1Int64(&i) || reflect.ValueOf(out).Elem().OverflowInt(i) {
+			return false
+		}
+		reflect.ValueOf(out).Elem().SetInt(i)
+		return true
+	case *uint, *uint8, *uint16, *uint32, *uint64:
+		var u uint64
+		if !s.readASN1Uint64(&u) || reflect.ValueOf(out).Elem().OverflowUint(u) {
+			return false
+		}
+		reflect.ValueOf(out).Elem().SetUint(u)
+		return true
+	case *big.Int:
+		return s.readASN1BigInt(out)
+	case *[]byte:
+		return s.readASN1Bytes(out)
+	default:
+		panic("out does not point to an integer type")
+	}
+}
+
+func checkASN1Integer(bytes []byte) bool {
+	if len(bytes) == 0 {
+		// An INTEGER is encoded with at least one octet.
+		return false
+	}
+	if len(bytes) == 1 {
+		return true
+	}
+	if bytes[0] == 0 && bytes[1]&0x80 == 0 || bytes[0] == 0xff && bytes[1]&0x80 == 0x80 {
+		// Value is not minimally encoded.
+		return false
+	}
+	return true
+}
+
+var bigOne = big.NewInt(1)
+
+func (s *String) readASN1BigInt(out *big.Int) bool {
+	var bytes String
+	if !s.ReadASN1(&bytes, asn1.INTEGER) || !checkASN1Integer(bytes) {
+		return false
+	}
+	if bytes[0]&0x80 == 0x80 {
+		// Negative number.
+		neg := make([]byte, len(bytes))
+		for i, b := range bytes {
+			neg[i] = ^b
+		}
+		out.SetBytes(neg)
+		out.Add(out, bigOne)
+		out.Neg(out)
+	} else {
+		out.SetBytes(bytes)
+	}
+	return true
+}
+
+func (s *String) readASN1Bytes(out *[]byte) bool {
+	var bytes String
+	if !s.ReadASN1(&bytes, asn1.INTEGER) || !checkASN1Integer(bytes) {
+		return false
+	}
+	if bytes[0]&0x80 == 0x80 {
+		return false
+	}
+	for len(bytes) > 1 && bytes[0] == 0 {
+		bytes = bytes[1:]
+	}
+	*out = bytes
+	return true
+}
+
+func (s *String) readASN1Int64(out *int64) bool {
+	var bytes String
+	if !s.ReadASN1(&bytes, asn1.INTEGER) || !checkASN1Integer(bytes) || !asn1Signed(out, bytes) {
+		return false
+	}
+	return true
+}
+
+func asn1Signed(out *int64, n []byte) bool {
+	length := len(n)
+	if length > 8 {
+		return false
+	}
+	for i := 0; i < length; i++ {
+		*out <<= 8
+		*out |= int64(n[i])
+	}
+	// Shift up and down in order to sign extend the result.
+	*out <<= 64 - uint8(length)*8
+	*out >>= 64 - uint8(length)*8
+	return true
+}
+
+func (s *String) readASN1Uint64(out *uint64) bool {
+	var bytes String
+	if !s.ReadASN1(&bytes, asn1.INTEGER) || !checkASN1Integer(bytes) || !asn1Unsigned(out, bytes) {
+		return false
+	}
+	return true
+}
+
+func asn1Unsigned(out *uint64, n []byte) bool {
+	length := len(n)
+	if length > 9 || length == 9 && n[0] != 0 {
+		// Too large for uint64.
+		return false
+	}
+	if n[0]&0x80 != 0 {
+		// Negative number.
+		return false
+	}
+	for i := 0; i < length; i++ {
+		*out <<= 8
+		*out |= uint64(n[i])
+	}
+	return true
+}
+
+// ReadASN1Int64WithTag decodes an ASN.1 INTEGER with the given tag into out
+// and advances. It reports whether the read was successful and resulted in a
+// value that can be represented in an int64.
+func (s *String) ReadASN1Int64WithTag(out *int64, tag asn1.Tag) bool {
+	var bytes String
+	return s.ReadASN1(&bytes, tag) && checkASN1Integer(bytes) && asn1Signed(out, bytes)
+}
+
+// ReadASN1Enum decodes an ASN.1 ENUMERATION into out and advances. It reports
+// whether the read was successful.
+func (s *String) ReadASN1Enum(out *int) bool {
+	var bytes String
+	var i int64
+	if !s.ReadASN1(&bytes, asn1.ENUM) || !checkASN1Integer(bytes) || !asn1Signed(&i, bytes) {
+		return false
+	}
+	if int64(int(i)) != i {
+		return false
+	}
+	*out = int(i)
+	return true
+}
+
+func (s *String) readBase128Int(out *int) bool {
+	ret := 0
+	for i := 0; len(*s) > 0; i++ {
+		if i == 5 {
+			return false
+		}
+		// Avoid overflowing int on a 32-bit platform.
+		// We don't want different behavior based on the architecture.
+		if ret >= 1<<(31-7) {
+			return false
+		}
+		ret <<= 7
+		b := s.read(1)[0]
+
+		// ITU-T X.690, section 8.19.2:
+		// The subidentifier shall be encoded in the fewest possible octets,
+		// that is, the leading octet of the subidentifier shall not have the value 0x80.
+		if i == 0 && b == 0x80 {
+			return false
+		}
+
+		ret |= int(b & 0x7f)
+		if b&0x80 == 0 {
+			*out = ret
+			return true
+		}
+	}
+	return false // truncated
+}
+
+// ReadASN1ObjectIdentifier decodes an ASN.1 OBJECT IDENTIFIER into out and
+// advances. It reports whether the read was successful.
+func (s *String) ReadASN1ObjectIdentifier(out *encoding_asn1.ObjectIdentifier) bool {
+	var bytes String
+	if !s.ReadASN1(&bytes, asn1.OBJECT_IDENTIFIER) || len(bytes) == 0 {
+		return false
+	}
+
+	// In the worst case, we get two elements from the first byte (which is
+	// encoded differently) and then every varint is a single byte long.
+	components := make([]int, len(bytes)+1)
+
+	// The first varint is 40*value1 + value2:
+	// According to this packing, value1 can take the values 0, 1 and 2 only.
+	// When value1 = 0 or value1 = 1, then value2 is <= 39. When value1 = 2,
+	// then there are no restrictions on value2.
+	var v int
+	if !bytes.readBase128Int(&v) {
+		return false
+	}
+	if v < 80 {
+		components[0] = v / 40
+		components[1] = v % 40
+	} else {
+		components[0] = 2
+		components[1] = v - 80
+	}
+
+	i := 2
+	for ; len(bytes) > 0; i++ {
+		if !bytes.readBase128Int(&v) {
+			return false
+		}
+		components[i] = v
+	}
+	*out = components[:i]
+	return true
+}
+
+// ReadASN1GeneralizedTime decodes an ASN.1 GENERALIZEDTIME into out and
+// advances. It reports whether the read was successful.
+func (s *String) ReadASN1GeneralizedTime(out *time.Time) bool {
+	var bytes String
+	if !s.ReadASN1(&bytes, asn1.GeneralizedTime) {
+		return false
+	}
+	t := string(bytes)
+	res, err := time.Parse(generalizedTimeFormatStr, t)
+	if err != nil {
+		return false
+	}
+	if serialized := res.Format(generalizedTimeFormatStr); serialized != t {
+		return false
+	}
+	*out = res
+	return true
+}
+
+const defaultUTCTimeFormatStr = "060102150405Z0700"
+
+// ReadASN1UTCTime decodes an ASN.1 UTCTime into out and advances.
+// It reports whether the read was successful.
+func (s *String) ReadASN1UTCTime(out *time.Time) bool {
+	var bytes String
+	if !s.ReadASN1(&bytes, asn1.UTCTime) {
+		return false
+	}
+	t := string(bytes)
+
+	formatStr := defaultUTCTimeFormatStr
+	var err error
+	res, err := time.Parse(formatStr, t)
+	if err != nil {
+		// Fallback to minute precision if we can't parse second
+		// precision. If we are following X.509 or X.690 we shouldn't
+		// support this, but we do.
+		formatStr = "0601021504Z0700"
+		res, err = time.Parse(formatStr, t)
+	}
+	if err != nil {
+		return false
+	}
+
+	if serialized := res.Format(formatStr); serialized != t {
+		return false
+	}
+
+	if res.Year() >= 2050 {
+		// UTCTime interprets the low order digits 50-99 as 1950-99.
+		// This only applies to its use in the X.509 profile.
+		// See https://tools.ietf.org/html/rfc5280#section-4.1.2.5.1
+		res = res.AddDate(-100, 0, 0)
+	}
+	*out = res
+	return true
+}
+
+// ReadASN1BitString decodes an ASN.1 BIT STRING into out and advances.
+// It reports whether the read was successful.
+func (s *String) ReadASN1BitString(out *encoding_asn1.BitString) bool {
+	var bytes String
+	if !s.ReadASN1(&bytes, asn1.BIT_STRING) || len(bytes) == 0 ||
+		len(bytes)*8/8 != len(bytes) {
+		return false
+	}
+
+	paddingBits := bytes[0]
+	bytes = bytes[1:]
+	if paddingBits > 7 ||
+		len(bytes) == 0 && paddingBits != 0 ||
+		len(bytes) > 0 && bytes[len(bytes)-1]&(1<<paddingBits-1) != 0 {
+		return false
+	}
+
+	out.BitLength = len(bytes)*8 - int(paddingBits)
+	out.Bytes = bytes
+	return true
+}
+
+// ReadASN1BitStringAsBytes decodes an ASN.1 BIT STRING into out and advances. It is
+// an error if the BIT STRING is not a whole number of bytes. It reports
+// whether the read was successful.
+func (s *String) ReadASN1BitStringAsBytes(out *[]byte) bool {
+	var bytes String
+	if !s.ReadASN1(&bytes, asn1.BIT_STRING) || len(bytes) == 0 {
+		return false
+	}
+
+	paddingBits := bytes[0]
+	if paddingBits != 0 {
+		return false
+	}
+	*out = bytes[1:]
+	return true
+}
+
+// ReadASN1Bytes reads the contents of a DER-encoded ASN.1 element (not including
+// tag and length bytes) into out, and advances. The element must match the
+// given tag. It reports whether the read was successful.
+func (s *String) ReadASN1Bytes(out *[]byte, tag asn1.Tag) bool {
+	return s.ReadASN1((*String)(out), tag)
+}
+
+// ReadASN1 reads the contents of a DER-encoded ASN.1 element (not including
+// tag and length bytes) into out, and advances. The element must match the
+// given tag. It reports whether the read was successful.
+//
+// Tags greater than 30 are not supported (i.e. low-tag-number format only).
+func (s *String) ReadASN1(out *String, tag asn1.Tag) bool {
+	var t asn1.Tag
+	if !s.ReadAnyASN1(out, &t) || t != tag {
+		return false
+	}
+	return true
+}
+
+// ReadASN1Element reads the contents of a DER-encoded ASN.1 element (including
+// tag and length bytes) into out, and advances. The element must match the
+// given tag. It reports whether the read was successful.
+//
+// Tags greater than 30 are not supported (i.e. low-tag-number format only).
+func (s *String) ReadASN1Element(out *String, tag asn1.Tag) bool {
+	var t asn1.Tag
+	if !s.ReadAnyASN1Element(out, &t) || t != tag {
+		return false
+	}
+	return true
+}
+
+// ReadAnyASN1 reads the contents of a DER-encoded ASN.1 element (not including
+// tag and length bytes) into out, sets outTag to its tag, and advances.
+// It reports whether the read was successful.
+//
+// Tags greater than 30 are not supported (i.e. low-tag-number format only).
+func (s *String) ReadAnyASN1(out *String, outTag *asn1.Tag) bool {
+	return s.readASN1(out, outTag, true /* skip header */)
+}
+
+// ReadAnyASN1Element reads the contents of a DER-encoded ASN.1 element
+// (including tag and length bytes) into out, sets outTag to is tag, and
+// advances. It reports whether the read was successful.
+//
+// Tags greater than 30 are not supported (i.e. low-tag-number format only).
+func (s *String) ReadAnyASN1Element(out *String, outTag *asn1.Tag) bool {
+	return s.readASN1(out, outTag, false /* include header */)
+}
+
+// PeekASN1Tag reports whether the next ASN.1 value on the string starts with
+// the given tag.
+func (s String) PeekASN1Tag(tag asn1.Tag) bool {
+	if len(s) == 0 {
+		return false
+	}
+	return asn1.Tag(s[0]) == tag
+}
+
+// SkipASN1 reads and discards an ASN.1 element with the given tag. It
+// reports whether the operation was successful.
+func (s *String) SkipASN1(tag asn1.Tag) bool {
+	var unused String
+	return s.ReadASN1(&unused, tag)
+}
+
+// ReadOptionalASN1 attempts to read the contents of a DER-encoded ASN.1
+// element (not including tag and length bytes) tagged with the given tag into
+// out. It stores whether an element with the tag was found in outPresent,
+// unless outPresent is nil. It reports whether the read was successful.
+func (s *String) ReadOptionalASN1(out *String, outPresent *bool, tag asn1.Tag) bool {
+	present := s.PeekASN1Tag(tag)
+	if outPresent != nil {
+		*outPresent = present
+	}
+	if present && !s.ReadASN1(out, tag) {
+		return false
+	}
+	return true
+}
+
+// SkipOptionalASN1 advances s over an ASN.1 element with the given tag, or
+// else leaves s unchanged. It reports whether the operation was successful.
+func (s *String) SkipOptionalASN1(tag asn1.Tag) bool {
+	if !s.PeekASN1Tag(tag) {
+		return true
+	}
+	var unused String
+	return s.ReadASN1(&unused, tag)
+}
+
+// ReadOptionalASN1Integer attempts to read an optional ASN.1 INTEGER explicitly
+// tagged with tag into out and advances. If no element with a matching tag is
+// present, it writes defaultValue into out instead. Otherwise, it behaves like
+// ReadASN1Integer.
+func (s *String) ReadOptionalASN1Integer(out interface{}, tag asn1.Tag, defaultValue interface{}) bool {
+	var present bool
+	var i String
+	if !s.ReadOptionalASN1(&i, &present, tag) {
+		return false
+	}
+	if !present {
+		switch out.(type) {
+		case *int, *int8, *int16, *int32, *int64,
+			*uint, *uint8, *uint16, *uint32, *uint64, *[]byte:
+			reflect.ValueOf(out).Elem().Set(reflect.ValueOf(defaultValue))
+		case *big.Int:
+			if defaultValue, ok := defaultValue.(*big.Int); ok {
+				out.(*big.Int).Set(defaultValue)
+			} else {
+				panic("out points to big.Int, but defaultValue does not")
+			}
+		default:
+			panic("invalid integer type")
+		}
+		return true
+	}
+	if !i.ReadASN1Integer(out) || !i.Empty() {
+		return false
+	}
+	return true
+}
+
+// ReadOptionalASN1OctetString attempts to read an optional ASN.1 OCTET STRING
+// explicitly tagged with tag into out and advances. If no element with a
+// matching tag is present, it sets "out" to nil instead. It reports
+// whether the read was successful.
+func (s *String) ReadOptionalASN1OctetString(out *[]byte, outPresent *bool, tag asn1.Tag) bool {
+	var present bool
+	var child String
+	if !s.ReadOptionalASN1(&child, &present, tag) {
+		return false
+	}
+	if outPresent != nil {
+		*outPresent = present
+	}
+	if present {
+		var oct String
+		if !child.ReadASN1(&oct, asn1.OCTET_STRING) || !child.Empty() {
+			return false
+		}
+		*out = oct
+	} else {
+		*out = nil
+	}
+	return true
+}
+
+// ReadOptionalASN1Boolean sets *out to the value of the next ASN.1 BOOLEAN or,
+// if the next bytes are not an ASN.1 BOOLEAN, to the value of defaultValue.
+// It reports whether the operation was successful.
+func (s *String) ReadOptionalASN1Boolean(out *bool, defaultValue bool) bool {
+	var present bool
+	var child String
+	if !s.ReadOptionalASN1(&child, &present, asn1.BOOLEAN) {
+		return false
+	}
+
+	if !present {
+		*out = defaultValue
+		return true
+	}
+
+	return s.ReadASN1Boolean(out)
+}
+
+func (s *String) readASN1(out *String, outTag *asn1.Tag, skipHeader bool) bool {
+	if len(*s) < 2 {
+		return false
+	}
+	tag, lenByte := (*s)[0], (*s)[1]
+
+	if tag&0x1f == 0x1f {
+		// ITU-T X.690 section 8.1.2
+		//
+		// An identifier octet with a tag part of 0x1f indicates a high-tag-number
+		// form identifier with two or more octets. We only support tags less than
+		// 31 (i.e. low-tag-number form, single octet identifier).
+		return false
+	}
+
+	if outTag != nil {
+		*outTag = asn1.Tag(tag)
+	}
+
+	// ITU-T X.690 section 8.1.3
+	//
+	// Bit 8 of the first length byte indicates whether the length is short- or
+	// long-form.
+	var length, headerLen uint32 // length includes headerLen
+	if lenByte&0x80 == 0 {
+		// Short-form length (section 8.1.3.4), encoded in bits 1-7.
+		length = uint32(lenByte) + 2
+		headerLen = 2
+	} else {
+		// Long-form length (section 8.1.3.5). Bits 1-7 encode the number of octets
+		// used to encode the length.
+		lenLen := lenByte & 0x7f
+		var len32 uint32
+
+		if lenLen == 0 || lenLen > 4 || len(*s) < int(2+lenLen) {
+			return false
+		}
+
+		lenBytes := String((*s)[2 : 2+lenLen])
+		if !lenBytes.readUnsigned(&len32, int(lenLen)) {
+			return false
+		}
+
+		// ITU-T X.690 section 10.1 (DER length forms) requires encoding the length
+		// with the minimum number of octets.
+		if len32 < 128 {
+			// Length should have used short-form encoding.
+			return false
+		}
+		if len32>>((lenLen-1)*8) == 0 {
+			// Leading octet is 0. Length should have been at least one byte shorter.
+			return false
+		}
+
+		headerLen = 2 + uint32(lenLen)
+		if headerLen+len32 < len32 {
+			// Overflow.
+			return false
+		}
+		length = headerLen + len32
+	}
+
+	if int(length) < 0 || !s.ReadBytes((*[]byte)(out), int(length)) {
+		return false
+	}
+	if skipHeader && !out.Skip(int(headerLen)) {
+		panic("cryptobyte: internal error")
+	}
+
+	return true
+}
@@ -0,0 +1,46 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package asn1 contains supporting types for parsing and building ASN.1
+// messages with the cryptobyte package.
+package asn1 // import "golang.org/x/crypto/cryptobyte/asn1"
+
+// Tag represents an ASN.1 identifier octet, consisting of a tag number
+// (indicating a type) and class (such as context-specific or constructed).
+//
+// Methods in the cryptobyte package only support the low-tag-number form, i.e.
+// a single identifier octet with bits 7-8 encoding the class and bits 1-6
+// encoding the tag number.
+type Tag uint8
+
+const (
+	classConstructed     = 0x20
+	classContextSpecific = 0x80
+)
+
+// Constructed returns t with the constructed class bit set.
+func (t Tag) Constructed() Tag { return t | classConstructed }
+
+// ContextSpecific returns t with the context-specific class bit set.
+func (t Tag) ContextSpecific() Tag { return t | classContextSpecific }
+
+// The following is a list of standard tag and class combinations.
+const (
+	BOOLEAN           = Tag(1)
+	INTEGER           = Tag(2)
+	BIT_STRING        = Tag(3)
+	OCTET_STRING      = Tag(4)
+	NULL              = Tag(5)
+	OBJECT_IDENTIFIER = Tag(6)
+	ENUM              = Tag(10)
+	UTF8String        = Tag(12)
+	SEQUENCE          = Tag(16 | classConstructed)
+	SET               = Tag(17 | classConstructed)
+	PrintableString   = Tag(19)
+	T61String         = Tag(20)
+	IA5String         = Tag(22)
+	UTCTime           = Tag(23)
+	GeneralizedTime   = Tag(24)
+	GeneralString     = Tag(27)
+)
@@ -0,0 +1,345 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cryptobyte
+
+import (
+	"errors"
+	"fmt"
+)
+
+// A Builder builds byte strings from fixed-length and length-prefixed values.
+// Builders either allocate space as needed, or are ‘fixed’, which means that
+// they write into a given buffer and produce an error if it's exhausted.
+//
+// The zero value is a usable Builder that allocates space as needed.
+//
+// Simple values are marshaled and appended to a Builder using methods on the
+// Builder. Length-prefixed values are marshaled by providing a
+// BuilderContinuation, which is a function that writes the inner contents of
+// the value to a given Builder. See the documentation for BuilderContinuation
+// for details.
+type Builder struct {
+	err            error
+	result         []byte
+	fixedSize      bool
+	child          *Builder
+	offset         int
+	pendingLenLen  int
+	pendingIsASN1  bool
+	inContinuation *bool
+}
+
+// NewBuilder creates a Builder that appends its output to the given buffer.
+// Like append(), the slice will be reallocated if its capacity is exceeded.
+// Use Bytes to get the final buffer.
+func NewBuilder(buffer []byte) *Builder {
+	return &Builder{
+		result: buffer,
+	}
+}
+
+// NewFixedBuilder creates a Builder that appends its output into the given
+// buffer. This builder does not reallocate the output buffer. Writes that
+// would exceed the buffer's capacity are treated as an error.
+func NewFixedBuilder(buffer []byte) *Builder {
+	return &Builder{
+		result:    buffer,
+		fixedSize: true,
+	}
+}
+
+// SetError sets the value to be returned as the error from Bytes. Writes
+// performed after calling SetError are ignored.
+func (b *Builder) SetError(err error) {
+	b.err = err
+}
+
+// Bytes returns the bytes written by the builder or an error if one has
+// occurred during building.
+func (b *Builder) Bytes() ([]byte, error) {
+	if b.err != nil {
+		return nil, b.err
+	}
+	return b.result[b.offset:], nil
+}
+
+// BytesOrPanic returns the bytes written by the builder or panics if an error
+// has occurred during building.
+func (b *Builder) BytesOrPanic() []byte {
+	if b.err != nil {
+		panic(b.err)
+	}
+	return b.result[b.offset:]
+}
+
+// AddUint8 appends an 8-bit value to the byte string.
+func (b *Builder) AddUint8(v uint8) {
+	b.add(byte(v))
+}
+
+// AddUint16 appends a big-endian, 16-bit value to the byte string.
+func (b *Builder) AddUint16(v uint16) {
+	b.add(byte(v>>8), byte(v))
+}
+
+// AddUint24 appends a big-endian, 24-bit value to the byte string. The highest
+// byte of the 32-bit input value is silently truncated.
+func (b *Builder) AddUint24(v uint32) {
+	b.add(byte(v>>16), byte(v>>8), byte(v))
+}
+
+// AddUint32 appends a big-endian, 32-bit value to the byte string.
+func (b *Builder) AddUint32(v uint32) {
+	b.add(byte(v>>24), byte(v>>16), byte(v>>8), byte(v))
+}
+
+// AddUint64 appends a big-endian, 64-bit value to the byte string.
+func (b *Builder) AddUint64(v uint64) {
+	b.add(byte(v>>56), byte(v>>48), byte(v>>40), byte(v>>32), byte(v>>24), byte(v>>16), byte(v>>8), byte(v))
+}
+
+// AddBytes appends a sequence of bytes to the byte string.
+func (b *Builder) AddBytes(v []byte) {
+	b.add(v...)
+}
+
+// BuilderContinuation is a continuation-passing interface for building
+// length-prefixed byte sequences. Builder methods for length-prefixed
+// sequences (AddUint8LengthPrefixed etc) will invoke the BuilderContinuation
+// supplied to them. The child builder passed to the continuation can be used
+// to build the content of the length-prefixed sequence. For example:
+//
+//	parent := cryptobyte.NewBuilder()
+//	parent.AddUint8LengthPrefixed(func (child *Builder) {
+//	  child.AddUint8(42)
+//	  child.AddUint8LengthPrefixed(func (grandchild *Builder) {
+//	    grandchild.AddUint8(5)
+//	  })
+//	})
+//
+// It is an error to write more bytes to the child than allowed by the reserved
+// length prefix. After the continuation returns, the child must be considered
+// invalid, i.e. users must not store any copies or references of the child
+// that outlive the continuation.
+//
+// If the continuation panics with a value of type BuildError then the inner
+// error will be returned as the error from Bytes. If the child panics
+// otherwise then Bytes will repanic with the same value.
+type BuilderContinuation func(child *Builder)
+
+// BuildError wraps an error. If a BuilderContinuation panics with this value,
+// the panic will be recovered and the inner error will be returned from
+// Builder.Bytes.
+type BuildError struct {
+	Err error
+}
+
+// AddUint8LengthPrefixed adds a 8-bit length-prefixed byte sequence.
+func (b *Builder) AddUint8LengthPrefixed(f BuilderContinuation) {
+	b.addLengthPrefixed(1, false, f)
+}
+
+// AddUint16LengthPrefixed adds a big-endian, 16-bit length-prefixed byte sequence.
+func (b *Builder) AddUint16LengthPrefixed(f BuilderContinuation) {
+	b.addLengthPrefixed(2, false, f)
+}
+
+// AddUint24LengthPrefixed adds a big-endian, 24-bit length-prefixed byte sequence.
+func (b *Builder) AddUint24LengthPrefixed(f BuilderContinuation) {
+	b.addLengthPrefixed(3, false, f)
+}
+
+// AddUint32LengthPrefixed adds a big-endian, 32-bit length-prefixed byte sequence.
+func (b *Builder) AddUint32LengthPrefixed(f BuilderContinuation) {
+	b.addLengthPrefixed(4, false, f)
+}
+
+func (b *Builder) callContinuation(f BuilderContinuation, arg *Builder) {
+	if !*b.inContinuation {
+		*b.inContinuation = true
+
+		defer func() {
+			*b.inContinuation = false
+
+			r := recover()
+			if r == nil {
+				return
+			}
+
+			if buildError, ok := r.(BuildError); ok {
+				b.err = buildError.Err
+			} else {
+				panic(r)
+			}
+		}()
+	}
+
+	f(arg)
+}
+
+func (b *Builder) addLengthPrefixed(lenLen int, isASN1 bool, f BuilderContinuation) {
+	// Subsequent writes can be ignored if the builder has encountered an error.
+	if b.err != nil {
+		return
+	}
+
+	offset := len(b.result)
+	b.add(make([]byte, lenLen)...)
+
+	if b.inContinuation == nil {
+		b.inContinuation = new(bool)
+	}
+
+	b.child = &Builder{
+		result:         b.result,
+		fixedSize:      b.fixedSize,
+		offset:         offset,
+		pendingLenLen:  lenLen,
+		pendingIsASN1:  isASN1,
+		inContinuation: b.inContinuation,
+	}
+
+	b.callContinuation(f, b.child)
+	b.flushChild()
+	if b.child != nil {
+		panic("cryptobyte: internal error")
+	}
+}
+
+func (b *Builder) flushChild() {
+	if b.child == nil {
+		return
+	}
+	b.child.flushChild()
+	child := b.child
+	b.child = nil
+
+	if child.err != nil {
+		b.err = child.err
+		return
+	}
+
+	length := len(child.result) - child.pendingLenLen - child.offset
+
+	if length < 0 {
+		panic("cryptobyte: internal error") // result unexpectedly shrunk
+	}
+
+	if child.pendingIsASN1 {
+		// For ASN.1, we reserved a single byte for the length. If that turned out
+		// to be incorrect, we have to move the contents along in order to make
+		// space.
+		if child.pendingLenLen != 1 {
+			panic("cryptobyte: internal error")
+		}
+		var lenLen, lenByte uint8
+		if int64(length) > 0xfffffffe {
+			b.err = errors.New("pending ASN.1 child too long")
+			return
+		} else if length > 0xffffff {
+			lenLen = 5
+			lenByte = 0x80 | 4
+		} else if length > 0xffff {
+			lenLen = 4
+			lenByte = 0x80 | 3
+		} else if length > 0xff {
+			lenLen = 3
+			lenByte = 0x80 | 2
+		} else if length > 0x7f {
+			lenLen = 2
+			lenByte = 0x80 | 1
+		} else {
+			lenLen = 1
+			lenByte = uint8(length)
+			length = 0
+		}
+
+		// Insert the initial length byte, make space for successive length bytes,
+		// and adjust the offset.
+		child.result[child.offset] = lenByte
+		extraBytes := int(lenLen - 1)
+		if extraBytes != 0 {
+			child.add(make([]byte, extraBytes)...)
+			childStart := child.offset + child.pendingLenLen
+			copy(child.result[childStart+extraBytes:], child.result[childStart:])
+		}
+		child.offset++
+		child.pendingLenLen = extraBytes
+	}
+
+	l := length
+	for i := child.pendingLenLen - 1; i >= 0; i-- {
+		child.result[child.offset+i] = uint8(l)
+		l >>= 8
+	}
+	if l != 0 {
+		b.err = fmt.Errorf("cryptobyte: pending child length %d exceeds %d-byte length prefix", length, child.pendingLenLen)
+		return
+	}
+
+	if b.fixedSize && &b.result[0] != &child.result[0] {
+		panic("cryptobyte: BuilderContinuation reallocated a fixed-size buffer")
+	}
+
+	b.result = child.result
+}
+
+func (b *Builder) add(bytes ...byte) {
+	if b.err != nil {
+		return
+	}
+	if b.child != nil {
+		panic("cryptobyte: attempted write while child is pending")
+	}
+	if len(b.result)+len(bytes) < len(bytes) {
+		b.err = errors.New("cryptobyte: length overflow")
+	}
+	if b.fixedSize && len(b.result)+len(bytes) > cap(b.result) {
+		b.err = errors.New("cryptobyte: Builder is exceeding its fixed-size buffer")
+		return
+	}
+	b.result = append(b.result, bytes...)
+}
+
+// Unwrite rolls back non-negative n bytes written directly to the Builder.
+// An attempt by a child builder passed to a continuation to unwrite bytes
+// from its parent will panic.
+func (b *Builder) Unwrite(n int) {
+	if b.err != nil {
+		return
+	}
+	if b.child != nil {
+		panic("cryptobyte: attempted unwrite while child is pending")
+	}
+	length := len(b.result) - b.pendingLenLen - b.offset
+	if length < 0 {
+		panic("cryptobyte: internal error")
+	}
+	if n < 0 {
+		panic("cryptobyte: attempted to unwrite negative number of bytes")
+	}
+	if n > length {
+		panic("cryptobyte: attempted to unwrite more than was written")
+	}
+	b.result = b.result[:len(b.result)-n]
+}
+
+// A MarshalingValue marshals itself into a Builder.
+type MarshalingValue interface {
+	// Marshal is called by Builder.AddValue. It receives a pointer to a builder
+	// to marshal itself into. It may return an error that occurred during
+	// marshaling, such as unset or invalid values.
+	Marshal(b *Builder) error
+}
+
+// AddValue calls Marshal on v, passing a pointer to the builder to append to.
+// If Marshal returns an error, it is set on the Builder so that subsequent
+// appends don't have an effect.
+func (b *Builder) AddValue(v MarshalingValue) {
+	err := v.Marshal(b)
+	if err != nil {
+		b.err = err
+	}
+}
@@ -0,0 +1,172 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package cryptobyte contains types that help with parsing and constructing
+// length-prefixed, binary messages, including ASN.1 DER. (The asn1 subpackage
+// contains useful ASN.1 constants.)
+//
+// The String type is for parsing. It wraps a []byte slice and provides helper
+// functions for consuming structures, value by value.
+//
+// The Builder type is for constructing messages. It providers helper functions
+// for appending values and also for appending length-prefixed submessages –
+// without having to worry about calculating the length prefix ahead of time.
+//
+// See the documentation and examples for the Builder and String types to get
+// started.
+package cryptobyte // import "golang.org/x/crypto/cryptobyte"
+
+// String represents a string of bytes. It provides methods for parsing
+// fixed-length and length-prefixed values from it.
+type String []byte
+
+// read advances a String by n bytes and returns them. If less than n bytes
+// remain, it returns nil.
+func (s *String) read(n int) []byte {
+	if len(*s) < n || n < 0 {
+		return nil
+	}
+	v := (*s)[:n]
+	*s = (*s)[n:]
+	return v
+}
+
+// Skip advances the String by n byte and reports whether it was successful.
+func (s *String) Skip(n int) bool {
+	return s.read(n) != nil
+}
+
+// ReadUint8 decodes an 8-bit value into out and advances over it.
+// It reports whether the read was successful.
+func (s *String) ReadUint8(out *uint8) bool {
+	v := s.read(1)
+	if v == nil {
+		return false
+	}
+	*out = uint8(v[0])
+	return true
+}
+
+// ReadUint16 decodes a big-endian, 16-bit value into out and advances over it.
+// It reports whether the read was successful.
+func (s *String) ReadUint16(out *uint16) bool {
+	v := s.read(2)
+	if v == nil {
+		return false
+	}
+	*out = uint16(v[0])<<8 | uint16(v[1])
+	return true
+}
+
+// ReadUint24 decodes a big-endian, 24-bit value into out and advances over it.
+// It reports whether the read was successful.
+func (s *String) ReadUint24(out *uint32) bool {
+	v := s.read(3)
+	if v == nil {
+		return false
+	}
+	*out = uint32(v[0])<<16 | uint32(v[1])<<8 | uint32(v[2])
+	return true
+}
+
+// ReadUint32 decodes a big-endian, 32-bit value into out and advances over it.
+// It reports whether the read was successful.
+func (s *String) ReadUint32(out *uint32) bool {
+	v := s.read(4)
+	if v == nil {
+		return false
+	}
+	*out = uint32(v[0])<<24 | uint32(v[1])<<16 | uint32(v[2])<<8 | uint32(v[3])
+	return true
+}
+
+// ReadUint64 decodes a big-endian, 64-bit value into out and advances over it.
+// It reports whether the read was successful.
+func (s *String) ReadUint64(out *uint64) bool {
+	v := s.read(8)
+	if v == nil {
+		return false
+	}
+	*out = uint64(v[0])<<56 | uint64(v[1])<<48 | uint64(v[2])<<40 | uint64(v[3])<<32 | uint64(v[4])<<24 | uint64(v[5])<<16 | uint64(v[6])<<8 | uint64(v[7])
+	return true
+}
+
+func (s *String) readUnsigned(out *uint32, length int) bool {
+	v := s.read(length)
+	if v == nil {
+		return false
+	}
+	var result uint32
+	for i := 0; i < length; i++ {
+		result <<= 8
+		result |= uint32(v[i])
+	}
+	*out = result
+	return true
+}
+
+func (s *String) readLengthPrefixed(lenLen int, outChild *String) bool {
+	lenBytes := s.read(lenLen)
+	if lenBytes == nil {
+		return false
+	}
+	var length uint32
+	for _, b := range lenBytes {
+		length = length << 8
+		length = length | uint32(b)
+	}
+	v := s.read(int(length))
+	if v == nil {
+		return false
+	}
+	*outChild = v
+	return true
+}
+
+// ReadUint8LengthPrefixed reads the content of an 8-bit length-prefixed value
+// into out and advances over it. It reports whether the read was successful.
+func (s *String) ReadUint8LengthPrefixed(out *String) bool {
+	return s.readLengthPrefixed(1, out)
+}
+
+// ReadUint16LengthPrefixed reads the content of a big-endian, 16-bit
+// length-prefixed value into out and advances over it. It reports whether the
+// read was successful.
+func (s *String) ReadUint16LengthPrefixed(out *String) bool {
+	return s.readLengthPrefixed(2, out)
+}
+
+// ReadUint24LengthPrefixed reads the content of a big-endian, 24-bit
+// length-prefixed value into out and advances over it. It reports whether
+// the read was successful.
+func (s *String) ReadUint24LengthPrefixed(out *String) bool {
+	return s.readLengthPrefixed(3, out)
+}
+
+// ReadBytes reads n bytes into out and advances over them. It reports
+// whether the read was successful.
+func (s *String) ReadBytes(out *[]byte, n int) bool {
+	v := s.read(n)
+	if v == nil {
+		return false
+	}
+	*out = v
+	return true
+}
+
+// CopyBytes copies len(out) bytes into out and advances over them. It reports
+// whether the copy operation was successful
+func (s *String) CopyBytes(out []byte) bool {
+	n := len(out)
+	v := s.read(n)
+	if v == nil {
+		return false
+	}
+	return copy(out, v) == n
+}
+
+// Empty reports whether the string does not contain any bytes.
+func (s String) Empty() bool {
+	return len(s) == 0
+}
@@ -0,0 +1,124 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package ripemd160 implements the RIPEMD-160 hash algorithm.
+//
+// Deprecated: RIPEMD-160 is a legacy hash and should not be used for new
+// applications. Also, this package does not and will not provide an optimized
+// implementation. Instead, use a modern hash like SHA-256 (from crypto/sha256).
+package ripemd160 // import "golang.org/x/crypto/ripemd160"
+
+// RIPEMD-160 is designed by Hans Dobbertin, Antoon Bosselaers, and Bart
+// Preneel with specifications available at:
+// http://homes.esat.kuleuven.be/~cosicart/pdf/AB-9601/AB-9601.pdf.
+
+import (
+	"crypto"
+	"hash"
+)
+
+func init() {
+	crypto.RegisterHash(crypto.RIPEMD160, New)
+}
+
+// The size of the checksum in bytes.
+const Size = 20
+
+// The block size of the hash algorithm in bytes.
+const BlockSize = 64
+
+const (
+	_s0 = 0x67452301
+	_s1 = 0xefcdab89
+	_s2 = 0x98badcfe
+	_s3 = 0x10325476
+	_s4 = 0xc3d2e1f0
+)
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+	s  [5]uint32       // running context
+	x  [BlockSize]byte // temporary buffer
+	nx int             // index into x
+	tc uint64          // total count of bytes processed
+}
+
+func (d *digest) Reset() {
+	d.s[0], d.s[1], d.s[2], d.s[3], d.s[4] = _s0, _s1, _s2, _s3, _s4
+	d.nx = 0
+	d.tc = 0
+}
+
+// New returns a new hash.Hash computing the checksum.
+func New() hash.Hash {
+	result := new(digest)
+	result.Reset()
+	return result
+}
+
+func (d *digest) Size() int { return Size }
+
+func (d *digest) BlockSize() int { return BlockSize }
+
+func (d *digest) Write(p []byte) (nn int, err error) {
+	nn = len(p)
+	d.tc += uint64(nn)
+	if d.nx > 0 {
+		n := len(p)
+		if n > BlockSize-d.nx {
+			n = BlockSize - d.nx
+		}
+		for i := 0; i < n; i++ {
+			d.x[d.nx+i] = p[i]
+		}
+		d.nx += n
+		if d.nx == BlockSize {
+			_Block(d, d.x[0:])
+			d.nx = 0
+		}
+		p = p[n:]
+	}
+	n := _Block(d, p)
+	p = p[n:]
+	if len(p) > 0 {
+		d.nx = copy(d.x[:], p)
+	}
+	return
+}
+
+func (d0 *digest) Sum(in []byte) []byte {
+	// Make a copy of d0 so that caller can keep writing and summing.
+	d := *d0
+
+	// Padding.  Add a 1 bit and 0 bits until 56 bytes mod 64.
+	tc := d.tc
+	var tmp [64]byte
+	tmp[0] = 0x80
+	if tc%64 < 56 {
+		d.Write(tmp[0 : 56-tc%64])
+	} else {
+		d.Write(tmp[0 : 64+56-tc%64])
+	}
+
+	// Length in bits.
+	tc <<= 3
+	for i := uint(0); i < 8; i++ {
+		tmp[i] = byte(tc >> (8 * i))
+	}
+	d.Write(tmp[0:8])
+
+	if d.nx != 0 {
+		panic("d.nx != 0")
+	}
+
+	var digest [Size]byte
+	for i, s := range d.s {
+		digest[i*4] = byte(s)
+		digest[i*4+1] = byte(s >> 8)
+		digest[i*4+2] = byte(s >> 16)
+		digest[i*4+3] = byte(s >> 24)
+	}
+
+	return append(in, digest[:]...)
+}
@@ -0,0 +1,165 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// RIPEMD-160 block step.
+// In its own file so that a faster assembly or C version
+// can be substituted easily.
+
+package ripemd160
+
+import (
+	"math/bits"
+)
+
+// work buffer indices and roll amounts for one line
+var _n = [80]uint{
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	7, 4, 13, 1, 10, 6, 15, 3, 12, 0, 9, 5, 2, 14, 11, 8,
+	3, 10, 14, 4, 9, 15, 8, 1, 2, 7, 0, 6, 13, 11, 5, 12,
+	1, 9, 11, 10, 0, 8, 12, 4, 13, 3, 7, 15, 14, 5, 6, 2,
+	4, 0, 5, 9, 7, 12, 2, 10, 14, 1, 3, 8, 11, 6, 15, 13,
+}
+
+var _r = [80]uint{
+	11, 14, 15, 12, 5, 8, 7, 9, 11, 13, 14, 15, 6, 7, 9, 8,
+	7, 6, 8, 13, 11, 9, 7, 15, 7, 12, 15, 9, 11, 7, 13, 12,
+	11, 13, 6, 7, 14, 9, 13, 15, 14, 8, 13, 6, 5, 12, 7, 5,
+	11, 12, 14, 15, 14, 15, 9, 8, 9, 14, 5, 6, 8, 6, 5, 12,
+	9, 15, 5, 11, 6, 8, 13, 12, 5, 12, 13, 14, 11, 8, 5, 6,
+}
+
+// same for the other parallel one
+var n_ = [80]uint{
+	5, 14, 7, 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12,
+	6, 11, 3, 7, 0, 13, 5, 10, 14, 15, 8, 12, 4, 9, 1, 2,
+	15, 5, 1, 3, 7, 14, 6, 9, 11, 8, 12, 2, 10, 0, 4, 13,
+	8, 6, 4, 1, 3, 11, 15, 0, 5, 12, 2, 13, 9, 7, 10, 14,
+	12, 15, 10, 4, 1, 5, 8, 7, 6, 2, 13, 14, 0, 3, 9, 11,
+}
+
+var r_ = [80]uint{
+	8, 9, 9, 11, 13, 15, 15, 5, 7, 7, 8, 11, 14, 14, 12, 6,
+	9, 13, 15, 7, 12, 8, 9, 11, 7, 7, 12, 7, 6, 15, 13, 11,
+	9, 7, 15, 11, 8, 6, 6, 14, 12, 13, 5, 14, 13, 13, 7, 5,
+	15, 5, 8, 11, 14, 14, 6, 14, 6, 9, 12, 9, 12, 5, 15, 8,
+	8, 5, 12, 9, 12, 5, 14, 6, 8, 13, 6, 5, 15, 13, 11, 11,
+}
+
+func _Block(md *digest, p []byte) int {
+	n := 0
+	var x [16]uint32
+	var alpha, beta uint32
+	for len(p) >= BlockSize {
+		a, b, c, d, e := md.s[0], md.s[1], md.s[2], md.s[3], md.s[4]
+		aa, bb, cc, dd, ee := a, b, c, d, e
+		j := 0
+		for i := 0; i < 16; i++ {
+			x[i] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
+			j += 4
+		}
+
+		// round 1
+		i := 0
+		for i < 16 {
+			alpha = a + (b ^ c ^ d) + x[_n[i]]
+			s := int(_r[i])
+			alpha = bits.RotateLeft32(alpha, s) + e
+			beta = bits.RotateLeft32(c, 10)
+			a, b, c, d, e = e, alpha, b, beta, d
+
+			// parallel line
+			alpha = aa + (bb ^ (cc | ^dd)) + x[n_[i]] + 0x50a28be6
+			s = int(r_[i])
+			alpha = bits.RotateLeft32(alpha, s) + ee
+			beta = bits.RotateLeft32(cc, 10)
+			aa, bb, cc, dd, ee = ee, alpha, bb, beta, dd
+
+			i++
+		}
+
+		// round 2
+		for i < 32 {
+			alpha = a + (b&c | ^b&d) + x[_n[i]] + 0x5a827999
+			s := int(_r[i])
+			alpha = bits.RotateLeft32(alpha, s) + e
+			beta = bits.RotateLeft32(c, 10)
+			a, b, c, d, e = e, alpha, b, beta, d
+
+			// parallel line
+			alpha = aa + (bb&dd | cc&^dd) + x[n_[i]] + 0x5c4dd124
+			s = int(r_[i])
+			alpha = bits.RotateLeft32(alpha, s) + ee
+			beta = bits.RotateLeft32(cc, 10)
+			aa, bb, cc, dd, ee = ee, alpha, bb, beta, dd
+
+			i++
+		}
+
+		// round 3
+		for i < 48 {
+			alpha = a + (b | ^c ^ d) + x[_n[i]] + 0x6ed9eba1
+			s := int(_r[i])
+			alpha = bits.RotateLeft32(alpha, s) + e
+			beta = bits.RotateLeft32(c, 10)
+			a, b, c, d, e = e, alpha, b, beta, d
+
+			// parallel line
+			alpha = aa + (bb | ^cc ^ dd) + x[n_[i]] + 0x6d703ef3
+			s = int(r_[i])
+			alpha = bits.RotateLeft32(alpha, s) + ee
+			beta = bits.RotateLeft32(cc, 10)
+			aa, bb, cc, dd, ee = ee, alpha, bb, beta, dd
+
+			i++
+		}
+
+		// round 4
+		for i < 64 {
+			alpha = a + (b&d | c&^d) + x[_n[i]] + 0x8f1bbcdc
+			s := int(_r[i])
+			alpha = bits.RotateLeft32(alpha, s) + e
+			beta = bits.RotateLeft32(c, 10)
+			a, b, c, d, e = e, alpha, b, beta, d
+
+			// parallel line
+			alpha = aa + (bb&cc | ^bb&dd) + x[n_[i]] + 0x7a6d76e9
+			s = int(r_[i])
+			alpha = bits.RotateLeft32(alpha, s) + ee
+			beta = bits.RotateLeft32(cc, 10)
+			aa, bb, cc, dd, ee = ee, alpha, bb, beta, dd
+
+			i++
+		}
+
+		// round 5
+		for i < 80 {
+			alpha = a + (b ^ (c | ^d)) + x[_n[i]] + 0xa953fd4e
+			s := int(_r[i])
+			alpha = bits.RotateLeft32(alpha, s) + e
+			beta = bits.RotateLeft32(c, 10)
+			a, b, c, d, e = e, alpha, b, beta, d
+
+			// parallel line
+			alpha = aa + (bb ^ cc ^ dd) + x[n_[i]]
+			s = int(r_[i])
+			alpha = bits.RotateLeft32(alpha, s) + ee
+			beta = bits.RotateLeft32(cc, 10)
+			aa, bb, cc, dd, ee = ee, alpha, bb, beta, dd
+
+			i++
+		}
+
+		// combine results
+		dd += c + md.s[1]
+		md.s[1] = md.s[2] + d + ee
+		md.s[2] = md.s[3] + e + aa
+		md.s[3] = md.s[4] + a + bb
+		md.s[4] = md.s[0] + b + cc
+		md.s[0] = dd
+
+		p = p[BlockSize:]
+		n += BlockSize
+	}
+	return n
+}
@@ -0,0 +1,62 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package sha3 implements the SHA-3 fixed-output-length hash functions and
+// the SHAKE variable-output-length hash functions defined by FIPS-202.
+//
+// Both types of hash function use the "sponge" construction and the Keccak
+// permutation. For a detailed specification see http://keccak.noekeon.org/
+//
+// # Guidance
+//
+// If you aren't sure what function you need, use SHAKE256 with at least 64
+// bytes of output. The SHAKE instances are faster than the SHA3 instances;
+// the latter have to allocate memory to conform to the hash.Hash interface.
+//
+// If you need a secret-key MAC (message authentication code), prepend the
+// secret key to the input, hash with SHAKE256 and read at least 32 bytes of
+// output.
+//
+// # Security strengths
+//
+// The SHA3-x (x equals 224, 256, 384, or 512) functions have a security
+// strength against preimage attacks of x bits. Since they only produce "x"
+// bits of output, their collision-resistance is only "x/2" bits.
+//
+// The SHAKE-256 and -128 functions have a generic security strength of 256 and
+// 128 bits against all attacks, provided that at least 2x bits of their output
+// is used.  Requesting more than 64 or 32 bytes of output, respectively, does
+// not increase the collision-resistance of the SHAKE functions.
+//
+// # The sponge construction
+//
+// A sponge builds a pseudo-random function from a public pseudo-random
+// permutation, by applying the permutation to a state of "rate + capacity"
+// bytes, but hiding "capacity" of the bytes.
+//
+// A sponge starts out with a zero state. To hash an input using a sponge, up
+// to "rate" bytes of the input are XORed into the sponge's state. The sponge
+// is then "full" and the permutation is applied to "empty" it. This process is
+// repeated until all the input has been "absorbed". The input is then padded.
+// The digest is "squeezed" from the sponge in the same way, except that output
+// is copied out instead of input being XORed in.
+//
+// A sponge is parameterized by its generic security strength, which is equal
+// to half its capacity; capacity + rate is equal to the permutation's width.
+// Since the KeccakF-1600 permutation is 1600 bits (200 bytes) wide, this means
+// that the security strength of a sponge instance is equal to (1600 - bitrate) / 2.
+//
+// # Recommendations
+//
+// The SHAKE functions are recommended for most new uses. They can produce
+// output of arbitrary length. SHAKE256, with an output length of at least
+// 64 bytes, provides 256-bit security against all attacks.  The Keccak team
+// recommends it for most applications upgrading from SHA2-512. (NIST chose a
+// much stronger, but much slower, sponge instance for SHA3-512.)
+//
+// The SHA-3 functions are "drop-in" replacements for the SHA-2 functions.
+// They produce output of the same length, with the same security strengths
+// against all attacks. This means, in particular, that SHA3-256 only has
+// 128-bit collision resistance, because its output length is 32 bytes.
+package sha3 // import "golang.org/x/crypto/sha3"
@@ -0,0 +1,97 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha3
+
+// This file provides functions for creating instances of the SHA-3
+// and SHAKE hash functions, as well as utility functions for hashing
+// bytes.
+
+import (
+	"hash"
+)
+
+// New224 creates a new SHA3-224 hash.
+// Its generic security strength is 224 bits against preimage attacks,
+// and 112 bits against collision attacks.
+func New224() hash.Hash {
+	if h := new224Asm(); h != nil {
+		return h
+	}
+	return &state{rate: 144, outputLen: 28, dsbyte: 0x06}
+}
+
+// New256 creates a new SHA3-256 hash.
+// Its generic security strength is 256 bits against preimage attacks,
+// and 128 bits against collision attacks.
+func New256() hash.Hash {
+	if h := new256Asm(); h != nil {
+		return h
+	}
+	return &state{rate: 136, outputLen: 32, dsbyte: 0x06}
+}
+
+// New384 creates a new SHA3-384 hash.
+// Its generic security strength is 384 bits against preimage attacks,
+// and 192 bits against collision attacks.
+func New384() hash.Hash {
+	if h := new384Asm(); h != nil {
+		return h
+	}
+	return &state{rate: 104, outputLen: 48, dsbyte: 0x06}
+}
+
+// New512 creates a new SHA3-512 hash.
+// Its generic security strength is 512 bits against preimage attacks,
+// and 256 bits against collision attacks.
+func New512() hash.Hash {
+	if h := new512Asm(); h != nil {
+		return h
+	}
+	return &state{rate: 72, outputLen: 64, dsbyte: 0x06}
+}
+
+// NewLegacyKeccak256 creates a new Keccak-256 hash.
+//
+// Only use this function if you require compatibility with an existing cryptosystem
+// that uses non-standard padding. All other users should use New256 instead.
+func NewLegacyKeccak256() hash.Hash { return &state{rate: 136, outputLen: 32, dsbyte: 0x01} }
+
+// NewLegacyKeccak512 creates a new Keccak-512 hash.
+//
+// Only use this function if you require compatibility with an existing cryptosystem
+// that uses non-standard padding. All other users should use New512 instead.
+func NewLegacyKeccak512() hash.Hash { return &state{rate: 72, outputLen: 64, dsbyte: 0x01} }
+
+// Sum224 returns the SHA3-224 digest of the data.
+func Sum224(data []byte) (digest [28]byte) {
+	h := New224()
+	h.Write(data)
+	h.Sum(digest[:0])
+	return
+}
+
+// Sum256 returns the SHA3-256 digest of the data.
+func Sum256(data []byte) (digest [32]byte) {
+	h := New256()
+	h.Write(data)
+	h.Sum(digest[:0])
+	return
+}
+
+// Sum384 returns the SHA3-384 digest of the data.
+func Sum384(data []byte) (digest [48]byte) {
+	h := New384()
+	h.Write(data)
+	h.Sum(digest[:0])
+	return
+}
+
+// Sum512 returns the SHA3-512 digest of the data.
+func Sum512(data []byte) (digest [64]byte) {
+	h := New512()
+	h.Write(data)
+	h.Sum(digest[:0])
+	return
+}
@@ -0,0 +1,28 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !gc || purego || !s390x
+// +build !gc purego !s390x
+
+package sha3
+
+import (
+	"hash"
+)
+
+// new224Asm returns an assembly implementation of SHA3-224 if available,
+// otherwise it returns nil.
+func new224Asm() hash.Hash { return nil }
+
+// new256Asm returns an assembly implementation of SHA3-256 if available,
+// otherwise it returns nil.
+func new256Asm() hash.Hash { return nil }
+
+// new384Asm returns an assembly implementation of SHA3-384 if available,
+// otherwise it returns nil.
+func new384Asm() hash.Hash { return nil }
+
+// new512Asm returns an assembly implementation of SHA3-512 if available,
+// otherwise it returns nil.
+func new512Asm() hash.Hash { return nil }
@@ -0,0 +1,415 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || purego || !gc
+// +build !amd64 purego !gc
+
+package sha3
+
+import "math/bits"
+
+// rc stores the round constants for use in the ι step.
+var rc = [24]uint64{
+	0x0000000000000001,
+	0x0000000000008082,
+	0x800000000000808A,
+	0x8000000080008000,
+	0x000000000000808B,
+	0x0000000080000001,
+	0x8000000080008081,
+	0x8000000000008009,
+	0x000000000000008A,
+	0x0000000000000088,
+	0x0000000080008009,
+	0x000000008000000A,
+	0x000000008000808B,
+	0x800000000000008B,
+	0x8000000000008089,
+	0x8000000000008003,
+	0x8000000000008002,
+	0x8000000000000080,
+	0x000000000000800A,
+	0x800000008000000A,
+	0x8000000080008081,
+	0x8000000000008080,
+	0x0000000080000001,
+	0x8000000080008008,
+}
+
+// keccakF1600 applies the Keccak permutation to a 1600b-wide
+// state represented as a slice of 25 uint64s.
+func keccakF1600(a *[25]uint64) {
+	// Implementation translated from Keccak-inplace.c
+	// in the keccak reference code.
+	var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64
+
+	for i := 0; i < 24; i += 4 {
+		// Combines the 5 steps in each round into 2 steps.
+		// Unrolls 4 rounds per loop and spreads some steps across rounds.
+
+		// Round 1
+		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
+		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
+		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
+		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
+		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
+		d0 = bc4 ^ (bc1<<1 | bc1>>63)
+		d1 = bc0 ^ (bc2<<1 | bc2>>63)
+		d2 = bc1 ^ (bc3<<1 | bc3>>63)
+		d3 = bc2 ^ (bc4<<1 | bc4>>63)
+		d4 = bc3 ^ (bc0<<1 | bc0>>63)
+
+		bc0 = a[0] ^ d0
+		t = a[6] ^ d1
+		bc1 = bits.RotateLeft64(t, 44)
+		t = a[12] ^ d2
+		bc2 = bits.RotateLeft64(t, 43)
+		t = a[18] ^ d3
+		bc3 = bits.RotateLeft64(t, 21)
+		t = a[24] ^ d4
+		bc4 = bits.RotateLeft64(t, 14)
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i]
+		a[6] = bc1 ^ (bc3 &^ bc2)
+		a[12] = bc2 ^ (bc4 &^ bc3)
+		a[18] = bc3 ^ (bc0 &^ bc4)
+		a[24] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[10] ^ d0
+		bc2 = bits.RotateLeft64(t, 3)
+		t = a[16] ^ d1
+		bc3 = bits.RotateLeft64(t, 45)
+		t = a[22] ^ d2
+		bc4 = bits.RotateLeft64(t, 61)
+		t = a[3] ^ d3
+		bc0 = bits.RotateLeft64(t, 28)
+		t = a[9] ^ d4
+		bc1 = bits.RotateLeft64(t, 20)
+		a[10] = bc0 ^ (bc2 &^ bc1)
+		a[16] = bc1 ^ (bc3 &^ bc2)
+		a[22] = bc2 ^ (bc4 &^ bc3)
+		a[3] = bc3 ^ (bc0 &^ bc4)
+		a[9] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[20] ^ d0
+		bc4 = bits.RotateLeft64(t, 18)
+		t = a[1] ^ d1
+		bc0 = bits.RotateLeft64(t, 1)
+		t = a[7] ^ d2
+		bc1 = bits.RotateLeft64(t, 6)
+		t = a[13] ^ d3
+		bc2 = bits.RotateLeft64(t, 25)
+		t = a[19] ^ d4
+		bc3 = bits.RotateLeft64(t, 8)
+		a[20] = bc0 ^ (bc2 &^ bc1)
+		a[1] = bc1 ^ (bc3 &^ bc2)
+		a[7] = bc2 ^ (bc4 &^ bc3)
+		a[13] = bc3 ^ (bc0 &^ bc4)
+		a[19] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[5] ^ d0
+		bc1 = bits.RotateLeft64(t, 36)
+		t = a[11] ^ d1
+		bc2 = bits.RotateLeft64(t, 10)
+		t = a[17] ^ d2
+		bc3 = bits.RotateLeft64(t, 15)
+		t = a[23] ^ d3
+		bc4 = bits.RotateLeft64(t, 56)
+		t = a[4] ^ d4
+		bc0 = bits.RotateLeft64(t, 27)
+		a[5] = bc0 ^ (bc2 &^ bc1)
+		a[11] = bc1 ^ (bc3 &^ bc2)
+		a[17] = bc2 ^ (bc4 &^ bc3)
+		a[23] = bc3 ^ (bc0 &^ bc4)
+		a[4] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[15] ^ d0
+		bc3 = bits.RotateLeft64(t, 41)
+		t = a[21] ^ d1
+		bc4 = bits.RotateLeft64(t, 2)
+		t = a[2] ^ d2
+		bc0 = bits.RotateLeft64(t, 62)
+		t = a[8] ^ d3
+		bc1 = bits.RotateLeft64(t, 55)
+		t = a[14] ^ d4
+		bc2 = bits.RotateLeft64(t, 39)
+		a[15] = bc0 ^ (bc2 &^ bc1)
+		a[21] = bc1 ^ (bc3 &^ bc2)
+		a[2] = bc2 ^ (bc4 &^ bc3)
+		a[8] = bc3 ^ (bc0 &^ bc4)
+		a[14] = bc4 ^ (bc1 &^ bc0)
+
+		// Round 2
+		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
+		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
+		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
+		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
+		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
+		d0 = bc4 ^ (bc1<<1 | bc1>>63)
+		d1 = bc0 ^ (bc2<<1 | bc2>>63)
+		d2 = bc1 ^ (bc3<<1 | bc3>>63)
+		d3 = bc2 ^ (bc4<<1 | bc4>>63)
+		d4 = bc3 ^ (bc0<<1 | bc0>>63)
+
+		bc0 = a[0] ^ d0
+		t = a[16] ^ d1
+		bc1 = bits.RotateLeft64(t, 44)
+		t = a[7] ^ d2
+		bc2 = bits.RotateLeft64(t, 43)
+		t = a[23] ^ d3
+		bc3 = bits.RotateLeft64(t, 21)
+		t = a[14] ^ d4
+		bc4 = bits.RotateLeft64(t, 14)
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+1]
+		a[16] = bc1 ^ (bc3 &^ bc2)
+		a[7] = bc2 ^ (bc4 &^ bc3)
+		a[23] = bc3 ^ (bc0 &^ bc4)
+		a[14] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[20] ^ d0
+		bc2 = bits.RotateLeft64(t, 3)
+		t = a[11] ^ d1
+		bc3 = bits.RotateLeft64(t, 45)
+		t = a[2] ^ d2
+		bc4 = bits.RotateLeft64(t, 61)
+		t = a[18] ^ d3
+		bc0 = bits.RotateLeft64(t, 28)
+		t = a[9] ^ d4
+		bc1 = bits.RotateLeft64(t, 20)
+		a[20] = bc0 ^ (bc2 &^ bc1)
+		a[11] = bc1 ^ (bc3 &^ bc2)
+		a[2] = bc2 ^ (bc4 &^ bc3)
+		a[18] = bc3 ^ (bc0 &^ bc4)
+		a[9] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[15] ^ d0
+		bc4 = bits.RotateLeft64(t, 18)
+		t = a[6] ^ d1
+		bc0 = bits.RotateLeft64(t, 1)
+		t = a[22] ^ d2
+		bc1 = bits.RotateLeft64(t, 6)
+		t = a[13] ^ d3
+		bc2 = bits.RotateLeft64(t, 25)
+		t = a[4] ^ d4
+		bc3 = bits.RotateLeft64(t, 8)
+		a[15] = bc0 ^ (bc2 &^ bc1)
+		a[6] = bc1 ^ (bc3 &^ bc2)
+		a[22] = bc2 ^ (bc4 &^ bc3)
+		a[13] = bc3 ^ (bc0 &^ bc4)
+		a[4] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[10] ^ d0
+		bc1 = bits.RotateLeft64(t, 36)
+		t = a[1] ^ d1
+		bc2 = bits.RotateLeft64(t, 10)
+		t = a[17] ^ d2
+		bc3 = bits.RotateLeft64(t, 15)
+		t = a[8] ^ d3
+		bc4 = bits.RotateLeft64(t, 56)
+		t = a[24] ^ d4
+		bc0 = bits.RotateLeft64(t, 27)
+		a[10] = bc0 ^ (bc2 &^ bc1)
+		a[1] = bc1 ^ (bc3 &^ bc2)
+		a[17] = bc2 ^ (bc4 &^ bc3)
+		a[8] = bc3 ^ (bc0 &^ bc4)
+		a[24] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[5] ^ d0
+		bc3 = bits.RotateLeft64(t, 41)
+		t = a[21] ^ d1
+		bc4 = bits.RotateLeft64(t, 2)
+		t = a[12] ^ d2
+		bc0 = bits.RotateLeft64(t, 62)
+		t = a[3] ^ d3
+		bc1 = bits.RotateLeft64(t, 55)
+		t = a[19] ^ d4
+		bc2 = bits.RotateLeft64(t, 39)
+		a[5] = bc0 ^ (bc2 &^ bc1)
+		a[21] = bc1 ^ (bc3 &^ bc2)
+		a[12] = bc2 ^ (bc4 &^ bc3)
+		a[3] = bc3 ^ (bc0 &^ bc4)
+		a[19] = bc4 ^ (bc1 &^ bc0)
+
+		// Round 3
+		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
+		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
+		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
+		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
+		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
+		d0 = bc4 ^ (bc1<<1 | bc1>>63)
+		d1 = bc0 ^ (bc2<<1 | bc2>>63)
+		d2 = bc1 ^ (bc3<<1 | bc3>>63)
+		d3 = bc2 ^ (bc4<<1 | bc4>>63)
+		d4 = bc3 ^ (bc0<<1 | bc0>>63)
+
+		bc0 = a[0] ^ d0
+		t = a[11] ^ d1
+		bc1 = bits.RotateLeft64(t, 44)
+		t = a[22] ^ d2
+		bc2 = bits.RotateLeft64(t, 43)
+		t = a[8] ^ d3
+		bc3 = bits.RotateLeft64(t, 21)
+		t = a[19] ^ d4
+		bc4 = bits.RotateLeft64(t, 14)
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+2]
+		a[11] = bc1 ^ (bc3 &^ bc2)
+		a[22] = bc2 ^ (bc4 &^ bc3)
+		a[8] = bc3 ^ (bc0 &^ bc4)
+		a[19] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[15] ^ d0
+		bc2 = bits.RotateLeft64(t, 3)
+		t = a[1] ^ d1
+		bc3 = bits.RotateLeft64(t, 45)
+		t = a[12] ^ d2
+		bc4 = bits.RotateLeft64(t, 61)
+		t = a[23] ^ d3
+		bc0 = bits.RotateLeft64(t, 28)
+		t = a[9] ^ d4
+		bc1 = bits.RotateLeft64(t, 20)
+		a[15] = bc0 ^ (bc2 &^ bc1)
+		a[1] = bc1 ^ (bc3 &^ bc2)
+		a[12] = bc2 ^ (bc4 &^ bc3)
+		a[23] = bc3 ^ (bc0 &^ bc4)
+		a[9] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[5] ^ d0
+		bc4 = bits.RotateLeft64(t, 18)
+		t = a[16] ^ d1
+		bc0 = bits.RotateLeft64(t, 1)
+		t = a[2] ^ d2
+		bc1 = bits.RotateLeft64(t, 6)
+		t = a[13] ^ d3
+		bc2 = bits.RotateLeft64(t, 25)
+		t = a[24] ^ d4
+		bc3 = bits.RotateLeft64(t, 8)
+		a[5] = bc0 ^ (bc2 &^ bc1)
+		a[16] = bc1 ^ (bc3 &^ bc2)
+		a[2] = bc2 ^ (bc4 &^ bc3)
+		a[13] = bc3 ^ (bc0 &^ bc4)
+		a[24] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[20] ^ d0
+		bc1 = bits.RotateLeft64(t, 36)
+		t = a[6] ^ d1
+		bc2 = bits.RotateLeft64(t, 10)
+		t = a[17] ^ d2
+		bc3 = bits.RotateLeft64(t, 15)
+		t = a[3] ^ d3
+		bc4 = bits.RotateLeft64(t, 56)
+		t = a[14] ^ d4
+		bc0 = bits.RotateLeft64(t, 27)
+		a[20] = bc0 ^ (bc2 &^ bc1)
+		a[6] = bc1 ^ (bc3 &^ bc2)
+		a[17] = bc2 ^ (bc4 &^ bc3)
+		a[3] = bc3 ^ (bc0 &^ bc4)
+		a[14] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[10] ^ d0
+		bc3 = bits.RotateLeft64(t, 41)
+		t = a[21] ^ d1
+		bc4 = bits.RotateLeft64(t, 2)
+		t = a[7] ^ d2
+		bc0 = bits.RotateLeft64(t, 62)
+		t = a[18] ^ d3
+		bc1 = bits.RotateLeft64(t, 55)
+		t = a[4] ^ d4
+		bc2 = bits.RotateLeft64(t, 39)
+		a[10] = bc0 ^ (bc2 &^ bc1)
+		a[21] = bc1 ^ (bc3 &^ bc2)
+		a[7] = bc2 ^ (bc4 &^ bc3)
+		a[18] = bc3 ^ (bc0 &^ bc4)
+		a[4] = bc4 ^ (bc1 &^ bc0)
+
+		// Round 4
+		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
+		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
+		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
+		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
+		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
+		d0 = bc4 ^ (bc1<<1 | bc1>>63)
+		d1 = bc0 ^ (bc2<<1 | bc2>>63)
+		d2 = bc1 ^ (bc3<<1 | bc3>>63)
+		d3 = bc2 ^ (bc4<<1 | bc4>>63)
+		d4 = bc3 ^ (bc0<<1 | bc0>>63)
+
+		bc0 = a[0] ^ d0
+		t = a[1] ^ d1
+		bc1 = bits.RotateLeft64(t, 44)
+		t = a[2] ^ d2
+		bc2 = bits.RotateLeft64(t, 43)
+		t = a[3] ^ d3
+		bc3 = bits.RotateLeft64(t, 21)
+		t = a[4] ^ d4
+		bc4 = bits.RotateLeft64(t, 14)
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+3]
+		a[1] = bc1 ^ (bc3 &^ bc2)
+		a[2] = bc2 ^ (bc4 &^ bc3)
+		a[3] = bc3 ^ (bc0 &^ bc4)
+		a[4] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[5] ^ d0
+		bc2 = bits.RotateLeft64(t, 3)
+		t = a[6] ^ d1
+		bc3 = bits.RotateLeft64(t, 45)
+		t = a[7] ^ d2
+		bc4 = bits.RotateLeft64(t, 61)
+		t = a[8] ^ d3
+		bc0 = bits.RotateLeft64(t, 28)
+		t = a[9] ^ d4
+		bc1 = bits.RotateLeft64(t, 20)
+		a[5] = bc0 ^ (bc2 &^ bc1)
+		a[6] = bc1 ^ (bc3 &^ bc2)
+		a[7] = bc2 ^ (bc4 &^ bc3)
+		a[8] = bc3 ^ (bc0 &^ bc4)
+		a[9] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[10] ^ d0
+		bc4 = bits.RotateLeft64(t, 18)
+		t = a[11] ^ d1
+		bc0 = bits.RotateLeft64(t, 1)
+		t = a[12] ^ d2
+		bc1 = bits.RotateLeft64(t, 6)
+		t = a[13] ^ d3
+		bc2 = bits.RotateLeft64(t, 25)
+		t = a[14] ^ d4
+		bc3 = bits.RotateLeft64(t, 8)
+		a[10] = bc0 ^ (bc2 &^ bc1)
+		a[11] = bc1 ^ (bc3 &^ bc2)
+		a[12] = bc2 ^ (bc4 &^ bc3)
+		a[13] = bc3 ^ (bc0 &^ bc4)
+		a[14] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[15] ^ d0
+		bc1 = bits.RotateLeft64(t, 36)
+		t = a[16] ^ d1
+		bc2 = bits.RotateLeft64(t, 10)
+		t = a[17] ^ d2
+		bc3 = bits.RotateLeft64(t, 15)
+		t = a[18] ^ d3
+		bc4 = bits.RotateLeft64(t, 56)
+		t = a[19] ^ d4
+		bc0 = bits.RotateLeft64(t, 27)
+		a[15] = bc0 ^ (bc2 &^ bc1)
+		a[16] = bc1 ^ (bc3 &^ bc2)
+		a[17] = bc2 ^ (bc4 &^ bc3)
+		a[18] = bc3 ^ (bc0 &^ bc4)
+		a[19] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[20] ^ d0
+		bc3 = bits.RotateLeft64(t, 41)
+		t = a[21] ^ d1
+		bc4 = bits.RotateLeft64(t, 2)
+		t = a[22] ^ d2
+		bc0 = bits.RotateLeft64(t, 62)
+		t = a[23] ^ d3
+		bc1 = bits.RotateLeft64(t, 55)
+		t = a[24] ^ d4
+		bc2 = bits.RotateLeft64(t, 39)
+		a[20] = bc0 ^ (bc2 &^ bc1)
+		a[21] = bc1 ^ (bc3 &^ bc2)
+		a[22] = bc2 ^ (bc4 &^ bc3)
+		a[23] = bc3 ^ (bc0 &^ bc4)
+		a[24] = bc4 ^ (bc1 &^ bc0)
+	}
+}
@@ -0,0 +1,14 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 && !purego && gc
+// +build amd64,!purego,gc
+
+package sha3
+
+// This function is implemented in keccakf_amd64.s.
+
+//go:noescape
+
+func keccakF1600(a *[25]uint64)
@@ -0,0 +1,391 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 && !purego && gc
+// +build amd64,!purego,gc
+
+// This code was translated into a form compatible with 6a from the public
+// domain sources at https://github.com/gvanas/KeccakCodePackage
+
+// Offsets in state
+#define _ba  (0*8)
+#define _be  (1*8)
+#define _bi  (2*8)
+#define _bo  (3*8)
+#define _bu  (4*8)
+#define _ga  (5*8)
+#define _ge  (6*8)
+#define _gi  (7*8)
+#define _go  (8*8)
+#define _gu  (9*8)
+#define _ka (10*8)
+#define _ke (11*8)
+#define _ki (12*8)
+#define _ko (13*8)
+#define _ku (14*8)
+#define _ma (15*8)
+#define _me (16*8)
+#define _mi (17*8)
+#define _mo (18*8)
+#define _mu (19*8)
+#define _sa (20*8)
+#define _se (21*8)
+#define _si (22*8)
+#define _so (23*8)
+#define _su (24*8)
+
+// Temporary registers
+#define rT1  AX
+
+// Round vars
+#define rpState DI
+#define rpStack SP
+
+#define rDa BX
+#define rDe CX
+#define rDi DX
+#define rDo R8
+#define rDu R9
+
+#define rBa R10
+#define rBe R11
+#define rBi R12
+#define rBo R13
+#define rBu R14
+
+#define rCa SI
+#define rCe BP
+#define rCi rBi
+#define rCo rBo
+#define rCu R15
+
+#define MOVQ_RBI_RCE MOVQ rBi, rCe
+#define XORQ_RT1_RCA XORQ rT1, rCa
+#define XORQ_RT1_RCE XORQ rT1, rCe
+#define XORQ_RBA_RCU XORQ rBa, rCu
+#define XORQ_RBE_RCU XORQ rBe, rCu
+#define XORQ_RDU_RCU XORQ rDu, rCu
+#define XORQ_RDA_RCA XORQ rDa, rCa
+#define XORQ_RDE_RCE XORQ rDe, rCe
+
+#define mKeccakRound(iState, oState, rc, B_RBI_RCE, G_RT1_RCA, G_RT1_RCE, G_RBA_RCU, K_RT1_RCA, K_RT1_RCE, K_RBA_RCU, M_RT1_RCA, M_RT1_RCE, M_RBE_RCU, S_RDU_RCU, S_RDA_RCA, S_RDE_RCE) \
+	/* Prepare round */    \
+	MOVQ rCe, rDa;         \
+	ROLQ $1, rDa;          \
+	                       \
+	MOVQ _bi(iState), rCi; \
+	XORQ _gi(iState), rDi; \
+	XORQ rCu, rDa;         \
+	XORQ _ki(iState), rCi; \
+	XORQ _mi(iState), rDi; \
+	XORQ rDi, rCi;         \
+	                       \
+	MOVQ rCi, rDe;         \
+	ROLQ $1, rDe;          \
+	                       \
+	MOVQ _bo(iState), rCo; \
+	XORQ _go(iState), rDo; \
+	XORQ rCa, rDe;         \
+	XORQ _ko(iState), rCo; \
+	XORQ _mo(iState), rDo; \
+	XORQ rDo, rCo;         \
+	                       \
+	MOVQ rCo, rDi;         \
+	ROLQ $1, rDi;          \
+	                       \
+	MOVQ rCu, rDo;         \
+	XORQ rCe, rDi;         \
+	ROLQ $1, rDo;          \
+	                       \
+	MOVQ rCa, rDu;         \
+	XORQ rCi, rDo;         \
+	ROLQ $1, rDu;          \
+	                       \
+	/* Result b */         \
+	MOVQ _ba(iState), rBa; \
+	MOVQ _ge(iState), rBe; \
+	XORQ rCo, rDu;         \
+	MOVQ _ki(iState), rBi; \
+	MOVQ _mo(iState), rBo; \
+	MOVQ _su(iState), rBu; \
+	XORQ rDe, rBe;         \
+	ROLQ $44, rBe;         \
+	XORQ rDi, rBi;         \
+	XORQ rDa, rBa;         \
+	ROLQ $43, rBi;         \
+	                       \
+	MOVQ rBe, rCa;         \
+	MOVQ rc, rT1;          \
+	ORQ  rBi, rCa;         \
+	XORQ rBa, rT1;         \
+	XORQ rT1, rCa;         \
+	MOVQ rCa, _ba(oState); \
+	                       \
+	XORQ rDu, rBu;         \
+	ROLQ $14, rBu;         \
+	MOVQ rBa, rCu;         \
+	ANDQ rBe, rCu;         \
+	XORQ rBu, rCu;         \
+	MOVQ rCu, _bu(oState); \
+	                       \
+	XORQ rDo, rBo;         \
+	ROLQ $21, rBo;         \
+	MOVQ rBo, rT1;         \
+	ANDQ rBu, rT1;         \
+	XORQ rBi, rT1;         \
+	MOVQ rT1, _bi(oState); \
+	                       \
+	NOTQ rBi;              \
+	ORQ  rBa, rBu;         \
+	ORQ  rBo, rBi;         \
+	XORQ rBo, rBu;         \
+	XORQ rBe, rBi;         \
+	MOVQ rBu, _bo(oState); \
+	MOVQ rBi, _be(oState); \
+	B_RBI_RCE;             \
+	                       \
+	/* Result g */         \
+	MOVQ _gu(iState), rBe; \
+	XORQ rDu, rBe;         \
+	MOVQ _ka(iState), rBi; \
+	ROLQ $20, rBe;         \
+	XORQ rDa, rBi;         \
+	ROLQ $3, rBi;          \
+	MOVQ _bo(iState), rBa; \
+	MOVQ rBe, rT1;         \
+	ORQ  rBi, rT1;         \
+	XORQ rDo, rBa;         \
+	MOVQ _me(iState), rBo; \
+	MOVQ _si(iState), rBu; \
+	ROLQ $28, rBa;         \
+	XORQ rBa, rT1;         \
+	MOVQ rT1, _ga(oState); \
+	G_RT1_RCA;             \
+	                       \
+	XORQ rDe, rBo;         \
+	ROLQ $45, rBo;         \
+	MOVQ rBi, rT1;         \
+	ANDQ rBo, rT1;         \
+	XORQ rBe, rT1;         \
+	MOVQ rT1, _ge(oState); \
+	G_RT1_RCE;             \
+	                       \
+	XORQ rDi, rBu;         \
+	ROLQ $61, rBu;         \
+	MOVQ rBu, rT1;         \
+	ORQ  rBa, rT1;         \
+	XORQ rBo, rT1;         \
+	MOVQ rT1, _go(oState); \
+	                       \
+	ANDQ rBe, rBa;         \
+	XORQ rBu, rBa;         \
+	MOVQ rBa, _gu(oState); \
+	NOTQ rBu;              \
+	G_RBA_RCU;             \
+	                       \
+	ORQ  rBu, rBo;         \
+	XORQ rBi, rBo;         \
+	MOVQ rBo, _gi(oState); \
+	                       \
+	/* Result k */         \
+	MOVQ _be(iState), rBa; \
+	MOVQ _gi(iState), rBe; \
+	MOVQ _ko(iState), rBi; \
+	MOVQ _mu(iState), rBo; \
+	MOVQ _sa(iState), rBu; \
+	XORQ rDi, rBe;         \
+	ROLQ $6, rBe;          \
+	XORQ rDo, rBi;         \
+	ROLQ $25, rBi;         \
+	MOVQ rBe, rT1;         \
+	ORQ  rBi, rT1;         \
+	XORQ rDe, rBa;         \
+	ROLQ $1, rBa;          \
+	XORQ rBa, rT1;         \
+	MOVQ rT1, _ka(oState); \
+	K_RT1_RCA;             \
+	                       \
+	XORQ rDu, rBo;         \
+	ROLQ $8, rBo;          \
+	MOVQ rBi, rT1;         \
+	ANDQ rBo, rT1;         \
+	XORQ rBe, rT1;         \
+	MOVQ rT1, _ke(oState); \
+	K_RT1_RCE;             \
+	                       \
+	XORQ rDa, rBu;         \
+	ROLQ $18, rBu;         \
+	NOTQ rBo;              \
+	MOVQ rBo, rT1;         \
+	ANDQ rBu, rT1;         \
+	XORQ rBi, rT1;         \
+	MOVQ rT1, _ki(oState); \
+	                       \
+	MOVQ rBu, rT1;         \
+	ORQ  rBa, rT1;         \
+	XORQ rBo, rT1;         \
+	MOVQ rT1, _ko(oState); \
+	                       \
+	ANDQ rBe, rBa;         \
+	XORQ rBu, rBa;         \
+	MOVQ rBa, _ku(oState); \
+	K_RBA_RCU;             \
+	                       \
+	/* Result m */         \
+	MOVQ _ga(iState), rBe; \
+	XORQ rDa, rBe;         \
+	MOVQ _ke(iState), rBi; \
+	ROLQ $36, rBe;         \
+	XORQ rDe, rBi;         \
+	MOVQ _bu(iState), rBa; \
+	ROLQ $10, rBi;         \
+	MOVQ rBe, rT1;         \
+	MOVQ _mi(iState), rBo; \
+	ANDQ rBi, rT1;         \
+	XORQ rDu, rBa;         \
+	MOVQ _so(iState), rBu; \
+	ROLQ $27, rBa;         \
+	XORQ rBa, rT1;         \
+	MOVQ rT1, _ma(oState); \
+	M_RT1_RCA;             \
+	                       \
+	XORQ rDi, rBo;         \
+	ROLQ $15, rBo;         \
+	MOVQ rBi, rT1;         \
+	ORQ  rBo, rT1;         \
+	XORQ rBe, rT1;         \
+	MOVQ rT1, _me(oState); \
+	M_RT1_RCE;             \
+	                       \
+	XORQ rDo, rBu;         \
+	ROLQ $56, rBu;         \
+	NOTQ rBo;              \
+	MOVQ rBo, rT1;         \
+	ORQ  rBu, rT1;         \
+	XORQ rBi, rT1;         \
+	MOVQ rT1, _mi(oState); \
+	                       \
+	ORQ  rBa, rBe;         \
+	XORQ rBu, rBe;         \
+	MOVQ rBe, _mu(oState); \
+	                       \
+	ANDQ rBa, rBu;         \
+	XORQ rBo, rBu;         \
+	MOVQ rBu, _mo(oState); \
+	M_RBE_RCU;             \
+	                       \
+	/* Result s */         \
+	MOVQ _bi(iState), rBa; \
+	MOVQ _go(iState), rBe; \
+	MOVQ _ku(iState), rBi; \
+	XORQ rDi, rBa;         \
+	MOVQ _ma(iState), rBo; \
+	ROLQ $62, rBa;         \
+	XORQ rDo, rBe;         \
+	MOVQ _se(iState), rBu; \
+	ROLQ $55, rBe;         \
+	                       \
+	XORQ rDu, rBi;         \
+	MOVQ rBa, rDu;         \
+	XORQ rDe, rBu;         \
+	ROLQ $2, rBu;          \
+	ANDQ rBe, rDu;         \
+	XORQ rBu, rDu;         \
+	MOVQ rDu, _su(oState); \
+	                       \
+	ROLQ $39, rBi;         \
+	S_RDU_RCU;             \
+	NOTQ rBe;              \
+	XORQ rDa, rBo;         \
+	MOVQ rBe, rDa;         \
+	ANDQ rBi, rDa;         \
+	XORQ rBa, rDa;         \
+	MOVQ rDa, _sa(oState); \
+	S_RDA_RCA;             \
+	                       \
+	ROLQ $41, rBo;         \
+	MOVQ rBi, rDe;         \
+	ORQ  rBo, rDe;         \
+	XORQ rBe, rDe;         \
+	MOVQ rDe, _se(oState); \
+	S_RDE_RCE;             \
+	                       \
+	MOVQ rBo, rDi;         \
+	MOVQ rBu, rDo;         \
+	ANDQ rBu, rDi;         \
+	ORQ  rBa, rDo;         \
+	XORQ rBi, rDi;         \
+	XORQ rBo, rDo;         \
+	MOVQ rDi, _si(oState); \
+	MOVQ rDo, _so(oState)  \
+
+// func keccakF1600(state *[25]uint64)
+TEXT ·keccakF1600(SB), 0, $200-8
+	MOVQ state+0(FP), rpState
+
+	// Convert the user state into an internal state
+	NOTQ _be(rpState)
+	NOTQ _bi(rpState)
+	NOTQ _go(rpState)
+	NOTQ _ki(rpState)
+	NOTQ _mi(rpState)
+	NOTQ _sa(rpState)
+
+	// Execute the KeccakF permutation
+	MOVQ _ba(rpState), rCa
+	MOVQ _be(rpState), rCe
+	MOVQ _bu(rpState), rCu
+
+	XORQ _ga(rpState), rCa
+	XORQ _ge(rpState), rCe
+	XORQ _gu(rpState), rCu
+
+	XORQ _ka(rpState), rCa
+	XORQ _ke(rpState), rCe
+	XORQ _ku(rpState), rCu
+
+	XORQ _ma(rpState), rCa
+	XORQ _me(rpState), rCe
+	XORQ _mu(rpState), rCu
+
+	XORQ _sa(rpState), rCa
+	XORQ _se(rpState), rCe
+	MOVQ _si(rpState), rDi
+	MOVQ _so(rpState), rDo
+	XORQ _su(rpState), rCu
+
+	mKeccakRound(rpState, rpStack, $0x0000000000000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x0000000000008082, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x800000000000808a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x8000000080008000, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x000000000000808b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x0000000080000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x8000000080008081, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x8000000000008009, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x000000000000008a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x0000000000000088, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x0000000080008009, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x000000008000000a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x000000008000808b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x800000000000008b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x8000000000008089, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x8000000000008003, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x8000000000008002, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x8000000000000080, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x000000000000800a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x800000008000000a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x8000000080008081, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x8000000000008080, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpState, rpStack, $0x0000000080000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE)
+	mKeccakRound(rpStack, rpState, $0x8000000080008008, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP)
+
+	// Revert the internal state to the user state
+	NOTQ _be(rpState)
+	NOTQ _bi(rpState)
+	NOTQ _go(rpState)
+	NOTQ _ki(rpState)
+	NOTQ _mi(rpState)
+	NOTQ _sa(rpState)
+
+	RET
@@ -0,0 +1,19 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build go1.4
+// +build go1.4
+
+package sha3
+
+import (
+	"crypto"
+)
+
+func init() {
+	crypto.RegisterHash(crypto.SHA3_224, New224)
+	crypto.RegisterHash(crypto.SHA3_256, New256)
+	crypto.RegisterHash(crypto.SHA3_384, New384)
+	crypto.RegisterHash(crypto.SHA3_512, New512)
+}
@@ -0,0 +1,193 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha3
+
+// spongeDirection indicates the direction bytes are flowing through the sponge.
+type spongeDirection int
+
+const (
+	// spongeAbsorbing indicates that the sponge is absorbing input.
+	spongeAbsorbing spongeDirection = iota
+	// spongeSqueezing indicates that the sponge is being squeezed.
+	spongeSqueezing
+)
+
+const (
+	// maxRate is the maximum size of the internal buffer. SHAKE-256
+	// currently needs the largest buffer.
+	maxRate = 168
+)
+
+type state struct {
+	// Generic sponge components.
+	a    [25]uint64 // main state of the hash
+	buf  []byte     // points into storage
+	rate int        // the number of bytes of state to use
+
+	// dsbyte contains the "domain separation" bits and the first bit of
+	// the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the
+	// SHA-3 and SHAKE functions by appending bitstrings to the message.
+	// Using a little-endian bit-ordering convention, these are "01" for SHA-3
+	// and "1111" for SHAKE, or 00000010b and 00001111b, respectively. Then the
+	// padding rule from section 5.1 is applied to pad the message to a multiple
+	// of the rate, which involves adding a "1" bit, zero or more "0" bits, and
+	// a final "1" bit. We merge the first "1" bit from the padding into dsbyte,
+	// giving 00000110b (0x06) and 00011111b (0x1f).
+	// [1] http://csrc.nist.gov/publications/drafts/fips-202/fips_202_draft.pdf
+	//     "Draft FIPS 202: SHA-3 Standard: Permutation-Based Hash and
+	//      Extendable-Output Functions (May 2014)"
+	dsbyte byte
+
+	storage storageBuf
+
+	// Specific to SHA-3 and SHAKE.
+	outputLen int             // the default output size in bytes
+	state     spongeDirection // whether the sponge is absorbing or squeezing
+}
+
+// BlockSize returns the rate of sponge underlying this hash function.
+func (d *state) BlockSize() int { return d.rate }
+
+// Size returns the output size of the hash function in bytes.
+func (d *state) Size() int { return d.outputLen }
+
+// Reset clears the internal state by zeroing the sponge state and
+// the byte buffer, and setting Sponge.state to absorbing.
+func (d *state) Reset() {
+	// Zero the permutation's state.
+	for i := range d.a {
+		d.a[i] = 0
+	}
+	d.state = spongeAbsorbing
+	d.buf = d.storage.asBytes()[:0]
+}
+
+func (d *state) clone() *state {
+	ret := *d
+	if ret.state == spongeAbsorbing {
+		ret.buf = ret.storage.asBytes()[:len(ret.buf)]
+	} else {
+		ret.buf = ret.storage.asBytes()[d.rate-cap(d.buf) : d.rate]
+	}
+
+	return &ret
+}
+
+// permute applies the KeccakF-1600 permutation. It handles
+// any input-output buffering.
+func (d *state) permute() {
+	switch d.state {
+	case spongeAbsorbing:
+		// If we're absorbing, we need to xor the input into the state
+		// before applying the permutation.
+		xorIn(d, d.buf)
+		d.buf = d.storage.asBytes()[:0]
+		keccakF1600(&d.a)
+	case spongeSqueezing:
+		// If we're squeezing, we need to apply the permutation before
+		// copying more output.
+		keccakF1600(&d.a)
+		d.buf = d.storage.asBytes()[:d.rate]
+		copyOut(d, d.buf)
+	}
+}
+
+// pads appends the domain separation bits in dsbyte, applies
+// the multi-bitrate 10..1 padding rule, and permutes the state.
+func (d *state) padAndPermute(dsbyte byte) {
+	if d.buf == nil {
+		d.buf = d.storage.asBytes()[:0]
+	}
+	// Pad with this instance's domain-separator bits. We know that there's
+	// at least one byte of space in d.buf because, if it were full,
+	// permute would have been called to empty it. dsbyte also contains the
+	// first one bit for the padding. See the comment in the state struct.
+	d.buf = append(d.buf, dsbyte)
+	zerosStart := len(d.buf)
+	d.buf = d.storage.asBytes()[:d.rate]
+	for i := zerosStart; i < d.rate; i++ {
+		d.buf[i] = 0
+	}
+	// This adds the final one bit for the padding. Because of the way that
+	// bits are numbered from the LSB upwards, the final bit is the MSB of
+	// the last byte.
+	d.buf[d.rate-1] ^= 0x80
+	// Apply the permutation
+	d.permute()
+	d.state = spongeSqueezing
+	d.buf = d.storage.asBytes()[:d.rate]
+	copyOut(d, d.buf)
+}
+
+// Write absorbs more data into the hash's state. It produces an error
+// if more data is written to the ShakeHash after writing
+func (d *state) Write(p []byte) (written int, err error) {
+	if d.state != spongeAbsorbing {
+		panic("sha3: write to sponge after read")
+	}
+	if d.buf == nil {
+		d.buf = d.storage.asBytes()[:0]
+	}
+	written = len(p)
+
+	for len(p) > 0 {
+		if len(d.buf) == 0 && len(p) >= d.rate {
+			// The fast path; absorb a full "rate" bytes of input and apply the permutation.
+			xorIn(d, p[:d.rate])
+			p = p[d.rate:]
+			keccakF1600(&d.a)
+		} else {
+			// The slow path; buffer the input until we can fill the sponge, and then xor it in.
+			todo := d.rate - len(d.buf)
+			if todo > len(p) {
+				todo = len(p)
+			}
+			d.buf = append(d.buf, p[:todo]...)
+			p = p[todo:]
+
+			// If the sponge is full, apply the permutation.
+			if len(d.buf) == d.rate {
+				d.permute()
+			}
+		}
+	}
+
+	return
+}
+
+// Read squeezes an arbitrary number of bytes from the sponge.
+func (d *state) Read(out []byte) (n int, err error) {
+	// If we're still absorbing, pad and apply the permutation.
+	if d.state == spongeAbsorbing {
+		d.padAndPermute(d.dsbyte)
+	}
+
+	n = len(out)
+
+	// Now, do the squeezing.
+	for len(out) > 0 {
+		n := copy(out, d.buf)
+		d.buf = d.buf[n:]
+		out = out[n:]
+
+		// Apply the permutation if we've squeezed the sponge dry.
+		if len(d.buf) == 0 {
+			d.permute()
+		}
+	}
+
+	return
+}
+
+// Sum applies padding to the hash state and then squeezes out the desired
+// number of output bytes.
+func (d *state) Sum(in []byte) []byte {
+	// Make a copy of the original hash so that caller can keep writing
+	// and summing.
+	dup := d.clone()
+	hash := make([]byte, dup.outputLen)
+	dup.Read(hash)
+	return append(in, hash...)
+}
@@ -0,0 +1,287 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+// +build gc,!purego
+
+package sha3
+
+// This file contains code for using the 'compute intermediate
+// message digest' (KIMD) and 'compute last message digest' (KLMD)
+// instructions to compute SHA-3 and SHAKE hashes on IBM Z.
+
+import (
+	"hash"
+
+	"golang.org/x/sys/cpu"
+)
+
+// codes represent 7-bit KIMD/KLMD function codes as defined in
+// the Principles of Operation.
+type code uint64
+
+const (
+	// function codes for KIMD/KLMD
+	sha3_224  code = 32
+	sha3_256       = 33
+	sha3_384       = 34
+	sha3_512       = 35
+	shake_128      = 36
+	shake_256      = 37
+	nopad          = 0x100
+)
+
+// kimd is a wrapper for the 'compute intermediate message digest' instruction.
+// src must be a multiple of the rate for the given function code.
+//
+//go:noescape
+func kimd(function code, chain *[200]byte, src []byte)
+
+// klmd is a wrapper for the 'compute last message digest' instruction.
+// src padding is handled by the instruction.
+//
+//go:noescape
+func klmd(function code, chain *[200]byte, dst, src []byte)
+
+type asmState struct {
+	a         [200]byte       // 1600 bit state
+	buf       []byte          // care must be taken to ensure cap(buf) is a multiple of rate
+	rate      int             // equivalent to block size
+	storage   [3072]byte      // underlying storage for buf
+	outputLen int             // output length if fixed, 0 if not
+	function  code            // KIMD/KLMD function code
+	state     spongeDirection // whether the sponge is absorbing or squeezing
+}
+
+func newAsmState(function code) *asmState {
+	var s asmState
+	s.function = function
+	switch function {
+	case sha3_224:
+		s.rate = 144
+		s.outputLen = 28
+	case sha3_256:
+		s.rate = 136
+		s.outputLen = 32
+	case sha3_384:
+		s.rate = 104
+		s.outputLen = 48
+	case sha3_512:
+		s.rate = 72
+		s.outputLen = 64
+	case shake_128:
+		s.rate = 168
+	case shake_256:
+		s.rate = 136
+	default:
+		panic("sha3: unrecognized function code")
+	}
+
+	// limit s.buf size to a multiple of s.rate
+	s.resetBuf()
+	return &s
+}
+
+func (s *asmState) clone() *asmState {
+	c := *s
+	c.buf = c.storage[:len(s.buf):cap(s.buf)]
+	return &c
+}
+
+// copyIntoBuf copies b into buf. It will panic if there is not enough space to
+// store all of b.
+func (s *asmState) copyIntoBuf(b []byte) {
+	bufLen := len(s.buf)
+	s.buf = s.buf[:len(s.buf)+len(b)]
+	copy(s.buf[bufLen:], b)
+}
+
+// resetBuf points buf at storage, sets the length to 0 and sets cap to be a
+// multiple of the rate.
+func (s *asmState) resetBuf() {
+	max := (cap(s.storage) / s.rate) * s.rate
+	s.buf = s.storage[:0:max]
+}
+
+// Write (via the embedded io.Writer interface) adds more data to the running hash.
+// It never returns an error.
+func (s *asmState) Write(b []byte) (int, error) {
+	if s.state != spongeAbsorbing {
+		panic("sha3: write to sponge after read")
+	}
+	length := len(b)
+	for len(b) > 0 {
+		if len(s.buf) == 0 && len(b) >= cap(s.buf) {
+			// Hash the data directly and push any remaining bytes
+			// into the buffer.
+			remainder := len(b) % s.rate
+			kimd(s.function, &s.a, b[:len(b)-remainder])
+			if remainder != 0 {
+				s.copyIntoBuf(b[len(b)-remainder:])
+			}
+			return length, nil
+		}
+
+		if len(s.buf) == cap(s.buf) {
+			// flush the buffer
+			kimd(s.function, &s.a, s.buf)
+			s.buf = s.buf[:0]
+		}
+
+		// copy as much as we can into the buffer
+		n := len(b)
+		if len(b) > cap(s.buf)-len(s.buf) {
+			n = cap(s.buf) - len(s.buf)
+		}
+		s.copyIntoBuf(b[:n])
+		b = b[n:]
+	}
+	return length, nil
+}
+
+// Read squeezes an arbitrary number of bytes from the sponge.
+func (s *asmState) Read(out []byte) (n int, err error) {
+	n = len(out)
+
+	// need to pad if we were absorbing
+	if s.state == spongeAbsorbing {
+		s.state = spongeSqueezing
+
+		// write hash directly into out if possible
+		if len(out)%s.rate == 0 {
+			klmd(s.function, &s.a, out, s.buf) // len(out) may be 0
+			s.buf = s.buf[:0]
+			return
+		}
+
+		// write hash into buffer
+		max := cap(s.buf)
+		if max > len(out) {
+			max = (len(out)/s.rate)*s.rate + s.rate
+		}
+		klmd(s.function, &s.a, s.buf[:max], s.buf)
+		s.buf = s.buf[:max]
+	}
+
+	for len(out) > 0 {
+		// flush the buffer
+		if len(s.buf) != 0 {
+			c := copy(out, s.buf)
+			out = out[c:]
+			s.buf = s.buf[c:]
+			continue
+		}
+
+		// write hash directly into out if possible
+		if len(out)%s.rate == 0 {
+			klmd(s.function|nopad, &s.a, out, nil)
+			return
+		}
+
+		// write hash into buffer
+		s.resetBuf()
+		if cap(s.buf) > len(out) {
+			s.buf = s.buf[:(len(out)/s.rate)*s.rate+s.rate]
+		}
+		klmd(s.function|nopad, &s.a, s.buf, nil)
+	}
+	return
+}
+
+// Sum appends the current hash to b and returns the resulting slice.
+// It does not change the underlying hash state.
+func (s *asmState) Sum(b []byte) []byte {
+	if s.outputLen == 0 {
+		panic("sha3: cannot call Sum on SHAKE functions")
+	}
+
+	// Copy the state to preserve the original.
+	a := s.a
+
+	// Hash the buffer. Note that we don't clear it because we
+	// aren't updating the state.
+	klmd(s.function, &a, nil, s.buf)
+	return append(b, a[:s.outputLen]...)
+}
+
+// Reset resets the Hash to its initial state.
+func (s *asmState) Reset() {
+	for i := range s.a {
+		s.a[i] = 0
+	}
+	s.resetBuf()
+	s.state = spongeAbsorbing
+}
+
+// Size returns the number of bytes Sum will return.
+func (s *asmState) Size() int {
+	return s.outputLen
+}
+
+// BlockSize returns the hash's underlying block size.
+// The Write method must be able to accept any amount
+// of data, but it may operate more efficiently if all writes
+// are a multiple of the block size.
+func (s *asmState) BlockSize() int {
+	return s.rate
+}
+
+// Clone returns a copy of the ShakeHash in its current state.
+func (s *asmState) Clone() ShakeHash {
+	return s.clone()
+}
+
+// new224Asm returns an assembly implementation of SHA3-224 if available,
+// otherwise it returns nil.
+func new224Asm() hash.Hash {
+	if cpu.S390X.HasSHA3 {
+		return newAsmState(sha3_224)
+	}
+	return nil
+}
+
+// new256Asm returns an assembly implementation of SHA3-256 if available,
+// otherwise it returns nil.
+func new256Asm() hash.Hash {
+	if cpu.S390X.HasSHA3 {
+		return newAsmState(sha3_256)
+	}
+	return nil
+}
+
+// new384Asm returns an assembly implementation of SHA3-384 if available,
+// otherwise it returns nil.
+func new384Asm() hash.Hash {
+	if cpu.S390X.HasSHA3 {
+		return newAsmState(sha3_384)
+	}
+	return nil
+}
+
+// new512Asm returns an assembly implementation of SHA3-512 if available,
+// otherwise it returns nil.
+func new512Asm() hash.Hash {
+	if cpu.S390X.HasSHA3 {
+		return newAsmState(sha3_512)
+	}
+	return nil
+}
+
+// newShake128Asm returns an assembly implementation of SHAKE-128 if available,
+// otherwise it returns nil.
+func newShake128Asm() ShakeHash {
+	if cpu.S390X.HasSHA3 {
+		return newAsmState(shake_128)
+	}
+	return nil
+}
+
+// newShake256Asm returns an assembly implementation of SHAKE-256 if available,
+// otherwise it returns nil.
+func newShake256Asm() ShakeHash {
+	if cpu.S390X.HasSHA3 {
+		return newAsmState(shake_256)
+	}
+	return nil
+}
@@ -0,0 +1,34 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+// +build gc,!purego
+
+#include "textflag.h"
+
+// func kimd(function code, chain *[200]byte, src []byte)
+TEXT ·kimd(SB), NOFRAME|NOSPLIT, $0-40
+	MOVD function+0(FP), R0
+	MOVD chain+8(FP), R1
+	LMG  src+16(FP), R2, R3 // R2=base, R3=len
+
+continue:
+	WORD $0xB93E0002 // KIMD --, R2
+	BVS  continue    // continue if interrupted
+	MOVD $0, R0      // reset R0 for pre-go1.8 compilers
+	RET
+
+// func klmd(function code, chain *[200]byte, dst, src []byte)
+TEXT ·klmd(SB), NOFRAME|NOSPLIT, $0-64
+	// TODO: SHAKE support
+	MOVD function+0(FP), R0
+	MOVD chain+8(FP), R1
+	LMG  dst+16(FP), R2, R3 // R2=base, R3=len
+	LMG  src+40(FP), R4, R5 // R4=base, R5=len
+
+continue:
+	WORD $0xB93F0024 // KLMD R2, R4
+	BVS  continue    // continue if interrupted
+	MOVD $0, R0      // reset R0 for pre-go1.8 compilers
+	RET
@@ -0,0 +1,173 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha3
+
+// This file defines the ShakeHash interface, and provides
+// functions for creating SHAKE and cSHAKE instances, as well as utility
+// functions for hashing bytes to arbitrary-length output.
+//
+//
+// SHAKE implementation is based on FIPS PUB 202 [1]
+// cSHAKE implementations is based on NIST SP 800-185 [2]
+//
+// [1] https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+// [2] https://doi.org/10.6028/NIST.SP.800-185
+
+import (
+	"encoding/binary"
+	"io"
+)
+
+// ShakeHash defines the interface to hash functions that
+// support arbitrary-length output.
+type ShakeHash interface {
+	// Write absorbs more data into the hash's state. It panics if input is
+	// written to it after output has been read from it.
+	io.Writer
+
+	// Read reads more output from the hash; reading affects the hash's
+	// state. (ShakeHash.Read is thus very different from Hash.Sum)
+	// It never returns an error.
+	io.Reader
+
+	// Clone returns a copy of the ShakeHash in its current state.
+	Clone() ShakeHash
+
+	// Reset resets the ShakeHash to its initial state.
+	Reset()
+}
+
+// cSHAKE specific context
+type cshakeState struct {
+	*state // SHA-3 state context and Read/Write operations
+
+	// initBlock is the cSHAKE specific initialization set of bytes. It is initialized
+	// by newCShake function and stores concatenation of N followed by S, encoded
+	// by the method specified in 3.3 of [1].
+	// It is stored here in order for Reset() to be able to put context into
+	// initial state.
+	initBlock []byte
+}
+
+// Consts for configuring initial SHA-3 state
+const (
+	dsbyteShake  = 0x1f
+	dsbyteCShake = 0x04
+	rate128      = 168
+	rate256      = 136
+)
+
+func bytepad(input []byte, w int) []byte {
+	// leftEncode always returns max 9 bytes
+	buf := make([]byte, 0, 9+len(input)+w)
+	buf = append(buf, leftEncode(uint64(w))...)
+	buf = append(buf, input...)
+	padlen := w - (len(buf) % w)
+	return append(buf, make([]byte, padlen)...)
+}
+
+func leftEncode(value uint64) []byte {
+	var b [9]byte
+	binary.BigEndian.PutUint64(b[1:], value)
+	// Trim all but last leading zero bytes
+	i := byte(1)
+	for i < 8 && b[i] == 0 {
+		i++
+	}
+	// Prepend number of encoded bytes
+	b[i-1] = 9 - i
+	return b[i-1:]
+}
+
+func newCShake(N, S []byte, rate int, dsbyte byte) ShakeHash {
+	c := cshakeState{state: &state{rate: rate, dsbyte: dsbyte}}
+
+	// leftEncode returns max 9 bytes
+	c.initBlock = make([]byte, 0, 9*2+len(N)+len(S))
+	c.initBlock = append(c.initBlock, leftEncode(uint64(len(N)*8))...)
+	c.initBlock = append(c.initBlock, N...)
+	c.initBlock = append(c.initBlock, leftEncode(uint64(len(S)*8))...)
+	c.initBlock = append(c.initBlock, S...)
+	c.Write(bytepad(c.initBlock, c.rate))
+	return &c
+}
+
+// Reset resets the hash to initial state.
+func (c *cshakeState) Reset() {
+	c.state.Reset()
+	c.Write(bytepad(c.initBlock, c.rate))
+}
+
+// Clone returns copy of a cSHAKE context within its current state.
+func (c *cshakeState) Clone() ShakeHash {
+	b := make([]byte, len(c.initBlock))
+	copy(b, c.initBlock)
+	return &cshakeState{state: c.clone(), initBlock: b}
+}
+
+// Clone returns copy of SHAKE context within its current state.
+func (c *state) Clone() ShakeHash {
+	return c.clone()
+}
+
+// NewShake128 creates a new SHAKE128 variable-output-length ShakeHash.
+// Its generic security strength is 128 bits against all attacks if at
+// least 32 bytes of its output are used.
+func NewShake128() ShakeHash {
+	if h := newShake128Asm(); h != nil {
+		return h
+	}
+	return &state{rate: rate128, dsbyte: dsbyteShake}
+}
+
+// NewShake256 creates a new SHAKE256 variable-output-length ShakeHash.
+// Its generic security strength is 256 bits against all attacks if
+// at least 64 bytes of its output are used.
+func NewShake256() ShakeHash {
+	if h := newShake256Asm(); h != nil {
+		return h
+	}
+	return &state{rate: rate256, dsbyte: dsbyteShake}
+}
+
+// NewCShake128 creates a new instance of cSHAKE128 variable-output-length ShakeHash,
+// a customizable variant of SHAKE128.
+// N is used to define functions based on cSHAKE, it can be empty when plain cSHAKE is
+// desired. S is a customization byte string used for domain separation - two cSHAKE
+// computations on same input with different S yield unrelated outputs.
+// When N and S are both empty, this is equivalent to NewShake128.
+func NewCShake128(N, S []byte) ShakeHash {
+	if len(N) == 0 && len(S) == 0 {
+		return NewShake128()
+	}
+	return newCShake(N, S, rate128, dsbyteCShake)
+}
+
+// NewCShake256 creates a new instance of cSHAKE256 variable-output-length ShakeHash,
+// a customizable variant of SHAKE256.
+// N is used to define functions based on cSHAKE, it can be empty when plain cSHAKE is
+// desired. S is a customization byte string used for domain separation - two cSHAKE
+// computations on same input with different S yield unrelated outputs.
+// When N and S are both empty, this is equivalent to NewShake256.
+func NewCShake256(N, S []byte) ShakeHash {
+	if len(N) == 0 && len(S) == 0 {
+		return NewShake256()
+	}
+	return newCShake(N, S, rate256, dsbyteCShake)
+}
+
+// ShakeSum128 writes an arbitrary-length digest of data into hash.
+func ShakeSum128(hash, data []byte) {
+	h := NewShake128()
+	h.Write(data)
+	h.Read(hash)
+}
+
+// ShakeSum256 writes an arbitrary-length digest of data into hash.
+func ShakeSum256(hash, data []byte) {
+	h := NewShake256()
+	h.Write(data)
+	h.Read(hash)
+}
@@ -0,0 +1,20 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !gc || purego || !s390x
+// +build !gc purego !s390x
+
+package sha3
+
+// newShake128Asm returns an assembly implementation of SHAKE-128 if available,
+// otherwise it returns nil.
+func newShake128Asm() ShakeHash {
+	return nil
+}
+
+// newShake256Asm returns an assembly implementation of SHAKE-256 if available,
+// otherwise it returns nil.
+func newShake256Asm() ShakeHash {
+	return nil
+}
@@ -0,0 +1,24 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (!amd64 && !386 && !ppc64le) || purego
+// +build !amd64,!386,!ppc64le purego
+
+package sha3
+
+// A storageBuf is an aligned array of maxRate bytes.
+type storageBuf [maxRate]byte
+
+func (b *storageBuf) asBytes() *[maxRate]byte {
+	return (*[maxRate]byte)(b)
+}
+
+var (
+	xorIn            = xorInGeneric
+	copyOut          = copyOutGeneric
+	xorInUnaligned   = xorInGeneric
+	copyOutUnaligned = copyOutGeneric
+)
+
+const xorImplementationUnaligned = "generic"
@@ -0,0 +1,28 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha3
+
+import "encoding/binary"
+
+// xorInGeneric xors the bytes in buf into the state; it
+// makes no non-portable assumptions about memory layout
+// or alignment.
+func xorInGeneric(d *state, buf []byte) {
+	n := len(buf) / 8
+
+	for i := 0; i < n; i++ {
+		a := binary.LittleEndian.Uint64(buf)
+		d.a[i] ^= a
+		buf = buf[8:]
+	}
+}
+
+// copyOutGeneric copies uint64s to a byte buffer.
+func copyOutGeneric(d *state, b []byte) {
+	for i := 0; len(b) >= 8; i++ {
+		binary.LittleEndian.PutUint64(b, d.a[i])
+		b = b[8:]
+	}
+}
@@ -0,0 +1,68 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (amd64 || 386 || ppc64le) && !purego
+// +build amd64 386 ppc64le
+// +build !purego
+
+package sha3
+
+import "unsafe"
+
+// A storageBuf is an aligned array of maxRate bytes.
+type storageBuf [maxRate / 8]uint64
+
+func (b *storageBuf) asBytes() *[maxRate]byte {
+	return (*[maxRate]byte)(unsafe.Pointer(b))
+}
+
+// xorInUnaligned uses unaligned reads and writes to update d.a to contain d.a
+// XOR buf.
+func xorInUnaligned(d *state, buf []byte) {
+	n := len(buf)
+	bw := (*[maxRate / 8]uint64)(unsafe.Pointer(&buf[0]))[: n/8 : n/8]
+	if n >= 72 {
+		d.a[0] ^= bw[0]
+		d.a[1] ^= bw[1]
+		d.a[2] ^= bw[2]
+		d.a[3] ^= bw[3]
+		d.a[4] ^= bw[4]
+		d.a[5] ^= bw[5]
+		d.a[6] ^= bw[6]
+		d.a[7] ^= bw[7]
+		d.a[8] ^= bw[8]
+	}
+	if n >= 104 {
+		d.a[9] ^= bw[9]
+		d.a[10] ^= bw[10]
+		d.a[11] ^= bw[11]
+		d.a[12] ^= bw[12]
+	}
+	if n >= 136 {
+		d.a[13] ^= bw[13]
+		d.a[14] ^= bw[14]
+		d.a[15] ^= bw[15]
+		d.a[16] ^= bw[16]
+	}
+	if n >= 144 {
+		d.a[17] ^= bw[17]
+	}
+	if n >= 168 {
+		d.a[18] ^= bw[18]
+		d.a[19] ^= bw[19]
+		d.a[20] ^= bw[20]
+	}
+}
+
+func copyOutUnaligned(d *state, buf []byte) {
+	ab := (*[maxRate]uint8)(unsafe.Pointer(&d.a[0]))
+	copy(buf, ab[:])
+}
+
+var (
+	xorIn   = xorInUnaligned
+	copyOut = copyOutUnaligned
+)
+
+const xorImplementationUnaligned = "unaligned"