Use github.com/awnumar/memguard to protect de-obfuscated embedded ssh private key and write logging to file when VMK is set

This commit is contained in:
2024-07-31 20:37:16 +02:00
parent 36c7e6843f
commit 7784fa8e4a
51 changed files with 6770 additions and 14 deletions

291
vendor/golang.org/x/crypto/blake2b/blake2b.go generated vendored Normal file
View File

@ -0,0 +1,291 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package blake2b implements the BLAKE2b hash algorithm defined by RFC 7693
// and the extendable output function (XOF) BLAKE2Xb.
//
// BLAKE2b is optimized for 64-bit platforms—including NEON-enabled ARMs—and
// produces digests of any size between 1 and 64 bytes.
// For a detailed specification of BLAKE2b see https://blake2.net/blake2.pdf
// and for BLAKE2Xb see https://blake2.net/blake2x.pdf
//
// If you aren't sure which function you need, use BLAKE2b (Sum512 or New512).
// If you need a secret-key MAC (message authentication code), use the New512
// function with a non-nil key.
//
// BLAKE2X is a construction to compute hash values larger than 64 bytes. It
// can produce hash values between 0 and 4 GiB.
package blake2b
import (
"encoding/binary"
"errors"
"hash"
)
const (
// The blocksize of BLAKE2b in bytes.
BlockSize = 128
// The hash size of BLAKE2b-512 in bytes.
Size = 64
// The hash size of BLAKE2b-384 in bytes.
Size384 = 48
// The hash size of BLAKE2b-256 in bytes.
Size256 = 32
)
var (
useAVX2 bool
useAVX bool
useSSE4 bool
)
var (
errKeySize = errors.New("blake2b: invalid key size")
errHashSize = errors.New("blake2b: invalid hash size")
)
var iv = [8]uint64{
0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
}
// Sum512 returns the BLAKE2b-512 checksum of the data.
func Sum512(data []byte) [Size]byte {
var sum [Size]byte
checkSum(&sum, Size, data)
return sum
}
// Sum384 returns the BLAKE2b-384 checksum of the data.
func Sum384(data []byte) [Size384]byte {
var sum [Size]byte
var sum384 [Size384]byte
checkSum(&sum, Size384, data)
copy(sum384[:], sum[:Size384])
return sum384
}
// Sum256 returns the BLAKE2b-256 checksum of the data.
func Sum256(data []byte) [Size256]byte {
var sum [Size]byte
var sum256 [Size256]byte
checkSum(&sum, Size256, data)
copy(sum256[:], sum[:Size256])
return sum256
}
// New512 returns a new hash.Hash computing the BLAKE2b-512 checksum. A non-nil
// key turns the hash into a MAC. The key must be between zero and 64 bytes long.
func New512(key []byte) (hash.Hash, error) { return newDigest(Size, key) }
// New384 returns a new hash.Hash computing the BLAKE2b-384 checksum. A non-nil
// key turns the hash into a MAC. The key must be between zero and 64 bytes long.
func New384(key []byte) (hash.Hash, error) { return newDigest(Size384, key) }
// New256 returns a new hash.Hash computing the BLAKE2b-256 checksum. A non-nil
// key turns the hash into a MAC. The key must be between zero and 64 bytes long.
func New256(key []byte) (hash.Hash, error) { return newDigest(Size256, key) }
// New returns a new hash.Hash computing the BLAKE2b checksum with a custom length.
// A non-nil key turns the hash into a MAC. The key must be between zero and 64 bytes long.
// The hash size can be a value between 1 and 64 but it is highly recommended to use
// values equal or greater than:
// - 32 if BLAKE2b is used as a hash function (The key is zero bytes long).
// - 16 if BLAKE2b is used as a MAC function (The key is at least 16 bytes long).
// When the key is nil, the returned hash.Hash implements BinaryMarshaler
// and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash.
func New(size int, key []byte) (hash.Hash, error) { return newDigest(size, key) }
func newDigest(hashSize int, key []byte) (*digest, error) {
if hashSize < 1 || hashSize > Size {
return nil, errHashSize
}
if len(key) > Size {
return nil, errKeySize
}
d := &digest{
size: hashSize,
keyLen: len(key),
}
copy(d.key[:], key)
d.Reset()
return d, nil
}
func checkSum(sum *[Size]byte, hashSize int, data []byte) {
h := iv
h[0] ^= uint64(hashSize) | (1 << 16) | (1 << 24)
var c [2]uint64
if length := len(data); length > BlockSize {
n := length &^ (BlockSize - 1)
if length == n {
n -= BlockSize
}
hashBlocks(&h, &c, 0, data[:n])
data = data[n:]
}
var block [BlockSize]byte
offset := copy(block[:], data)
remaining := uint64(BlockSize - offset)
if c[0] < remaining {
c[1]--
}
c[0] -= remaining
hashBlocks(&h, &c, 0xFFFFFFFFFFFFFFFF, block[:])
for i, v := range h[:(hashSize+7)/8] {
binary.LittleEndian.PutUint64(sum[8*i:], v)
}
}
type digest struct {
h [8]uint64
c [2]uint64
size int
block [BlockSize]byte
offset int
key [BlockSize]byte
keyLen int
}
const (
magic = "b2b"
marshaledSize = len(magic) + 8*8 + 2*8 + 1 + BlockSize + 1
)
func (d *digest) MarshalBinary() ([]byte, error) {
if d.keyLen != 0 {
return nil, errors.New("crypto/blake2b: cannot marshal MACs")
}
b := make([]byte, 0, marshaledSize)
b = append(b, magic...)
for i := 0; i < 8; i++ {
b = appendUint64(b, d.h[i])
}
b = appendUint64(b, d.c[0])
b = appendUint64(b, d.c[1])
// Maximum value for size is 64
b = append(b, byte(d.size))
b = append(b, d.block[:]...)
b = append(b, byte(d.offset))
return b, nil
}
func (d *digest) UnmarshalBinary(b []byte) error {
if len(b) < len(magic) || string(b[:len(magic)]) != magic {
return errors.New("crypto/blake2b: invalid hash state identifier")
}
if len(b) != marshaledSize {
return errors.New("crypto/blake2b: invalid hash state size")
}
b = b[len(magic):]
for i := 0; i < 8; i++ {
b, d.h[i] = consumeUint64(b)
}
b, d.c[0] = consumeUint64(b)
b, d.c[1] = consumeUint64(b)
d.size = int(b[0])
b = b[1:]
copy(d.block[:], b[:BlockSize])
b = b[BlockSize:]
d.offset = int(b[0])
return nil
}
func (d *digest) BlockSize() int { return BlockSize }
func (d *digest) Size() int { return d.size }
func (d *digest) Reset() {
d.h = iv
d.h[0] ^= uint64(d.size) | (uint64(d.keyLen) << 8) | (1 << 16) | (1 << 24)
d.offset, d.c[0], d.c[1] = 0, 0, 0
if d.keyLen > 0 {
d.block = d.key
d.offset = BlockSize
}
}
func (d *digest) Write(p []byte) (n int, err error) {
n = len(p)
if d.offset > 0 {
remaining := BlockSize - d.offset
if n <= remaining {
d.offset += copy(d.block[d.offset:], p)
return
}
copy(d.block[d.offset:], p[:remaining])
hashBlocks(&d.h, &d.c, 0, d.block[:])
d.offset = 0
p = p[remaining:]
}
if length := len(p); length > BlockSize {
nn := length &^ (BlockSize - 1)
if length == nn {
nn -= BlockSize
}
hashBlocks(&d.h, &d.c, 0, p[:nn])
p = p[nn:]
}
if len(p) > 0 {
d.offset += copy(d.block[:], p)
}
return
}
func (d *digest) Sum(sum []byte) []byte {
var hash [Size]byte
d.finalize(&hash)
return append(sum, hash[:d.size]...)
}
func (d *digest) finalize(hash *[Size]byte) {
var block [BlockSize]byte
copy(block[:], d.block[:d.offset])
remaining := uint64(BlockSize - d.offset)
c := d.c
if c[0] < remaining {
c[1]--
}
c[0] -= remaining
h := d.h
hashBlocks(&h, &c, 0xFFFFFFFFFFFFFFFF, block[:])
for i, v := range h {
binary.LittleEndian.PutUint64(hash[8*i:], v)
}
}
func appendUint64(b []byte, x uint64) []byte {
var a [8]byte
binary.BigEndian.PutUint64(a[:], x)
return append(b, a[:]...)
}
func appendUint32(b []byte, x uint32) []byte {
var a [4]byte
binary.BigEndian.PutUint32(a[:], x)
return append(b, a[:]...)
}
func consumeUint64(b []byte) ([]byte, uint64) {
x := binary.BigEndian.Uint64(b)
return b[8:], x
}
func consumeUint32(b []byte) ([]byte, uint32) {
x := binary.BigEndian.Uint32(b)
return b[4:], x
}

View File

@ -0,0 +1,37 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
package blake2b
import "golang.org/x/sys/cpu"
func init() {
useAVX2 = cpu.X86.HasAVX2
useAVX = cpu.X86.HasAVX
useSSE4 = cpu.X86.HasSSE41
}
//go:noescape
func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//go:noescape
func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//go:noescape
func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
switch {
case useAVX2:
hashBlocksAVX2(h, c, flag, blocks)
case useAVX:
hashBlocksAVX(h, c, flag, blocks)
case useSSE4:
hashBlocksSSE4(h, c, flag, blocks)
default:
hashBlocksGeneric(h, c, flag, blocks)
}
}

744
vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s generated vendored Normal file
View File

@ -0,0 +1,744 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
#include "textflag.h"
DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32
DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32
DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32
DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32
DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16
#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39
#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
VPADDQ m0, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFD $-79, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPSHUFB c40, Y1, Y1; \
VPADDQ m1, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFB c48, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPADDQ Y1, Y1, t; \
VPSRLQ $63, Y1, Y1; \
VPXOR t, Y1, Y1; \
VPERMQ_0x39_Y1_Y1; \
VPERMQ_0x4E_Y2_Y2; \
VPERMQ_0x93_Y3_Y3; \
VPADDQ m2, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFD $-79, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPSHUFB c40, Y1, Y1; \
VPADDQ m3, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFB c48, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPADDQ Y1, Y1, t; \
VPSRLQ $63, Y1, Y1; \
VPXOR t, Y1, Y1; \
VPERMQ_0x39_Y3_Y3; \
VPERMQ_0x4E_Y2_Y2; \
VPERMQ_0x93_Y1_Y1
#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E
#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n
#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01
#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01
#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01
// load msg: Y12 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
VMOVQ_SI_X12(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X12(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y12, Y12
// load msg: Y13 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
VMOVQ_SI_X13(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X13(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y13, Y13
// load msg: Y14 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
VMOVQ_SI_X14(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X14(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y14, Y14
// load msg: Y15 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
VMOVQ_SI_X15(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X15(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
VMOVQ_SI_X12_0; \
VMOVQ_SI_X11(4*8); \
VPINSRQ_1_SI_X12(2*8); \
VPINSRQ_1_SI_X11(6*8); \
VINSERTI128 $1, X11, Y12, Y12; \
LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \
LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
LOAD_MSG_AVX2_Y15(9, 11, 13, 15)
#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
VMOVQ_SI_X11(11*8); \
VPSHUFD $0x4E, 0*8(SI), X14; \
VPINSRQ_1_SI_X11(5*8); \
VINSERTI128 $1, X11, Y14, Y14; \
LOAD_MSG_AVX2_Y15(12, 2, 7, 3)
#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
VMOVQ_SI_X11(5*8); \
VMOVDQU 11*8(SI), X12; \
VPINSRQ_1_SI_X11(15*8); \
VINSERTI128 $1, X11, Y12, Y12; \
VMOVQ_SI_X13(8*8); \
VMOVQ_SI_X11(2*8); \
VPINSRQ_1_SI_X13_0; \
VPINSRQ_1_SI_X11(13*8); \
VINSERTI128 $1, X11, Y13, Y13; \
LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
LOAD_MSG_AVX2_Y15(14, 6, 1, 4)
#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \
VMOVQ_SI_X15(6*8); \
VMOVQ_SI_X11_0; \
VPINSRQ_1_SI_X15(10*8); \
VPINSRQ_1_SI_X11(8*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \
VMOVQ_SI_X13_0; \
VMOVQ_SI_X11(4*8); \
VPINSRQ_1_SI_X13(7*8); \
VPINSRQ_1_SI_X11(15*8); \
VINSERTI128 $1, X11, Y13, Y13; \
LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
LOAD_MSG_AVX2_Y15(1, 12, 8, 13)
#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
VMOVQ_SI_X12(2*8); \
VMOVQ_SI_X11_0; \
VPINSRQ_1_SI_X12(6*8); \
VPINSRQ_1_SI_X11(8*8); \
VINSERTI128 $1, X11, Y12, Y12; \
LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \
LOAD_MSG_AVX2_Y15(13, 5, 14, 9)
#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \
LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
VMOVQ_SI_X14_0; \
VPSHUFD $0x4E, 8*8(SI), X11; \
VPINSRQ_1_SI_X14(6*8); \
VINSERTI128 $1, X11, Y14, Y14; \
LOAD_MSG_AVX2_Y15(7, 3, 2, 11)
#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \
VMOVQ_SI_X15_0; \
VMOVQ_SI_X11(6*8); \
VPINSRQ_1_SI_X15(4*8); \
VPINSRQ_1_SI_X11(10*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
VMOVQ_SI_X12(6*8); \
VMOVQ_SI_X11(11*8); \
VPINSRQ_1_SI_X12(14*8); \
VPINSRQ_1_SI_X11_0; \
VINSERTI128 $1, X11, Y12, Y12; \
LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
VMOVQ_SI_X11(1*8); \
VMOVDQU 12*8(SI), X14; \
VPINSRQ_1_SI_X11(10*8); \
VINSERTI128 $1, X11, Y14, Y14; \
VMOVQ_SI_X15(2*8); \
VMOVDQU 4*8(SI), X11; \
VPINSRQ_1_SI_X15(7*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \
VMOVQ_SI_X13(2*8); \
VPSHUFD $0x4E, 5*8(SI), X11; \
VPINSRQ_1_SI_X13(4*8); \
VINSERTI128 $1, X11, Y13, Y13; \
LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
VMOVQ_SI_X15(11*8); \
VMOVQ_SI_X11(12*8); \
VPINSRQ_1_SI_X15(14*8); \
VPINSRQ_1_SI_X11_0; \
VINSERTI128 $1, X11, Y15, Y15
// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
MOVQ h+0(FP), AX
MOVQ c+8(FP), BX
MOVQ flag+16(FP), CX
MOVQ blocks_base+24(FP), SI
MOVQ blocks_len+32(FP), DI
MOVQ SP, DX
ADDQ $31, DX
ANDQ $~31, DX
MOVQ CX, 16(DX)
XORQ CX, CX
MOVQ CX, 24(DX)
VMOVDQU ·AVX2_c40<>(SB), Y4
VMOVDQU ·AVX2_c48<>(SB), Y5
VMOVDQU 0(AX), Y8
VMOVDQU 32(AX), Y9
VMOVDQU ·AVX2_iv0<>(SB), Y6
VMOVDQU ·AVX2_iv1<>(SB), Y7
MOVQ 0(BX), R8
MOVQ 8(BX), R9
MOVQ R9, 8(DX)
loop:
ADDQ $128, R8
MOVQ R8, 0(DX)
CMPQ R8, $128
JGE noinc
INCQ R9
MOVQ R9, 8(DX)
noinc:
VMOVDQA Y8, Y0
VMOVDQA Y9, Y1
VMOVDQA Y6, Y2
VPXOR 0(DX), Y7, Y3
LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
VMOVDQA Y12, 32(DX)
VMOVDQA Y13, 64(DX)
VMOVDQA Y14, 96(DX)
VMOVDQA Y15, 128(DX)
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
VMOVDQA Y12, 160(DX)
VMOVDQA Y13, 192(DX)
VMOVDQA Y14, 224(DX)
VMOVDQA Y15, 256(DX)
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)
VPXOR Y0, Y8, Y8
VPXOR Y1, Y9, Y9
VPXOR Y2, Y8, Y8
VPXOR Y3, Y9, Y9
LEAQ 128(SI), SI
SUBQ $128, DI
JNE loop
MOVQ R8, 0(BX)
MOVQ R9, 8(BX)
VMOVDQU Y8, 0(AX)
VMOVDQU Y9, 32(AX)
VZEROUPPER
RET
#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE
#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF
#define SHUFFLE_AVX() \
VMOVDQA X6, X13; \
VMOVDQA X2, X14; \
VMOVDQA X4, X6; \
VPUNPCKLQDQ_X13_X13_X15; \
VMOVDQA X5, X4; \
VMOVDQA X6, X5; \
VPUNPCKHQDQ_X15_X7_X6; \
VPUNPCKLQDQ_X7_X7_X15; \
VPUNPCKHQDQ_X15_X13_X7; \
VPUNPCKLQDQ_X3_X3_X15; \
VPUNPCKHQDQ_X15_X2_X2; \
VPUNPCKLQDQ_X14_X14_X15; \
VPUNPCKHQDQ_X15_X3_X3; \
#define SHUFFLE_AVX_INV() \
VMOVDQA X2, X13; \
VMOVDQA X4, X14; \
VPUNPCKLQDQ_X2_X2_X15; \
VMOVDQA X5, X4; \
VPUNPCKHQDQ_X15_X3_X2; \
VMOVDQA X14, X5; \
VPUNPCKLQDQ_X3_X3_X15; \
VMOVDQA X6, X14; \
VPUNPCKHQDQ_X15_X13_X3; \
VPUNPCKLQDQ_X7_X7_X15; \
VPUNPCKHQDQ_X15_X6_X6; \
VPUNPCKLQDQ_X14_X14_X15; \
VPUNPCKHQDQ_X15_X7_X7; \
#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
VPADDQ m0, v0, v0; \
VPADDQ v2, v0, v0; \
VPADDQ m1, v1, v1; \
VPADDQ v3, v1, v1; \
VPXOR v0, v6, v6; \
VPXOR v1, v7, v7; \
VPSHUFD $-79, v6, v6; \
VPSHUFD $-79, v7, v7; \
VPADDQ v6, v4, v4; \
VPADDQ v7, v5, v5; \
VPXOR v4, v2, v2; \
VPXOR v5, v3, v3; \
VPSHUFB c40, v2, v2; \
VPSHUFB c40, v3, v3; \
VPADDQ m2, v0, v0; \
VPADDQ v2, v0, v0; \
VPADDQ m3, v1, v1; \
VPADDQ v3, v1, v1; \
VPXOR v0, v6, v6; \
VPXOR v1, v7, v7; \
VPSHUFB c48, v6, v6; \
VPSHUFB c48, v7, v7; \
VPADDQ v6, v4, v4; \
VPADDQ v7, v5, v5; \
VPXOR v4, v2, v2; \
VPXOR v5, v3, v3; \
VPADDQ v2, v2, t0; \
VPSRLQ $63, v2, v2; \
VPXOR t0, v2, v2; \
VPADDQ v3, v3, t0; \
VPSRLQ $63, v3, v3; \
VPXOR t0, v3, v3
// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
VMOVQ_SI_X12(i0*8); \
VMOVQ_SI_X13(i2*8); \
VMOVQ_SI_X14(i4*8); \
VMOVQ_SI_X15(i6*8); \
VPINSRQ_1_SI_X12(i1*8); \
VPINSRQ_1_SI_X13(i3*8); \
VPINSRQ_1_SI_X14(i5*8); \
VPINSRQ_1_SI_X15(i7*8)
// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
VMOVQ_SI_X12_0; \
VMOVQ_SI_X13(4*8); \
VMOVQ_SI_X14(1*8); \
VMOVQ_SI_X15(5*8); \
VPINSRQ_1_SI_X12(2*8); \
VPINSRQ_1_SI_X13(6*8); \
VPINSRQ_1_SI_X14(3*8); \
VPINSRQ_1_SI_X15(7*8)
// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
VPSHUFD $0x4E, 0*8(SI), X12; \
VMOVQ_SI_X13(11*8); \
VMOVQ_SI_X14(12*8); \
VMOVQ_SI_X15(7*8); \
VPINSRQ_1_SI_X13(5*8); \
VPINSRQ_1_SI_X14(2*8); \
VPINSRQ_1_SI_X15(3*8)
// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
VMOVDQU 11*8(SI), X12; \
VMOVQ_SI_X13(5*8); \
VMOVQ_SI_X14(8*8); \
VMOVQ_SI_X15(2*8); \
VPINSRQ_1_SI_X13(15*8); \
VPINSRQ_1_SI_X14_0; \
VPINSRQ_1_SI_X15(13*8)
// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
VMOVQ_SI_X12(2*8); \
VMOVQ_SI_X13(4*8); \
VMOVQ_SI_X14(6*8); \
VMOVQ_SI_X15_0; \
VPINSRQ_1_SI_X12(5*8); \
VPINSRQ_1_SI_X13(15*8); \
VPINSRQ_1_SI_X14(10*8); \
VPINSRQ_1_SI_X15(8*8)
// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
VMOVQ_SI_X12(9*8); \
VMOVQ_SI_X13(2*8); \
VMOVQ_SI_X14_0; \
VMOVQ_SI_X15(4*8); \
VPINSRQ_1_SI_X12(5*8); \
VPINSRQ_1_SI_X13(10*8); \
VPINSRQ_1_SI_X14(7*8); \
VPINSRQ_1_SI_X15(15*8)
// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
VMOVQ_SI_X12(2*8); \
VMOVQ_SI_X13_0; \
VMOVQ_SI_X14(12*8); \
VMOVQ_SI_X15(11*8); \
VPINSRQ_1_SI_X12(6*8); \
VPINSRQ_1_SI_X13(8*8); \
VPINSRQ_1_SI_X14(10*8); \
VPINSRQ_1_SI_X15(3*8)
// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
MOVQ 0*8(SI), X12; \
VPSHUFD $0x4E, 8*8(SI), X13; \
MOVQ 7*8(SI), X14; \
MOVQ 2*8(SI), X15; \
VPINSRQ_1_SI_X12(6*8); \
VPINSRQ_1_SI_X14(3*8); \
VPINSRQ_1_SI_X15(11*8)
// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
MOVQ 6*8(SI), X12; \
MOVQ 11*8(SI), X13; \
MOVQ 15*8(SI), X14; \
MOVQ 3*8(SI), X15; \
VPINSRQ_1_SI_X12(14*8); \
VPINSRQ_1_SI_X13_0; \
VPINSRQ_1_SI_X14(9*8); \
VPINSRQ_1_SI_X15(8*8)
// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
MOVQ 5*8(SI), X12; \
MOVQ 8*8(SI), X13; \
MOVQ 0*8(SI), X14; \
MOVQ 6*8(SI), X15; \
VPINSRQ_1_SI_X12(15*8); \
VPINSRQ_1_SI_X13(2*8); \
VPINSRQ_1_SI_X14(4*8); \
VPINSRQ_1_SI_X15(10*8)
// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
VMOVDQU 12*8(SI), X12; \
MOVQ 1*8(SI), X13; \
MOVQ 2*8(SI), X14; \
VPINSRQ_1_SI_X13(10*8); \
VPINSRQ_1_SI_X14(7*8); \
VMOVDQU 4*8(SI), X15
// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
MOVQ 15*8(SI), X12; \
MOVQ 3*8(SI), X13; \
MOVQ 11*8(SI), X14; \
MOVQ 12*8(SI), X15; \
VPINSRQ_1_SI_X12(9*8); \
VPINSRQ_1_SI_X13(13*8); \
VPINSRQ_1_SI_X14(14*8); \
VPINSRQ_1_SI_X15_0
// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
MOVQ h+0(FP), AX
MOVQ c+8(FP), BX
MOVQ flag+16(FP), CX
MOVQ blocks_base+24(FP), SI
MOVQ blocks_len+32(FP), DI
MOVQ SP, R10
ADDQ $15, R10
ANDQ $~15, R10
VMOVDQU ·AVX_c40<>(SB), X0
VMOVDQU ·AVX_c48<>(SB), X1
VMOVDQA X0, X8
VMOVDQA X1, X9
VMOVDQU ·AVX_iv3<>(SB), X0
VMOVDQA X0, 0(R10)
XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0)
VMOVDQU 0(AX), X10
VMOVDQU 16(AX), X11
VMOVDQU 32(AX), X2
VMOVDQU 48(AX), X3
MOVQ 0(BX), R8
MOVQ 8(BX), R9
loop:
ADDQ $128, R8
CMPQ R8, $128
JGE noinc
INCQ R9
noinc:
VMOVQ_R8_X15
VPINSRQ_1_R9_X15
VMOVDQA X10, X0
VMOVDQA X11, X1
VMOVDQU ·AVX_iv0<>(SB), X4
VMOVDQU ·AVX_iv1<>(SB), X5
VMOVDQU ·AVX_iv2<>(SB), X6
VPXOR X15, X6, X6
VMOVDQA 0(R10), X7
LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
VMOVDQA X12, 16(R10)
VMOVDQA X13, 32(R10)
VMOVDQA X14, 48(R10)
VMOVDQA X15, 64(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
VMOVDQA X12, 80(R10)
VMOVDQA X13, 96(R10)
VMOVDQA X14, 112(R10)
VMOVDQA X15, 128(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
VMOVDQA X12, 144(R10)
VMOVDQA X13, 160(R10)
VMOVDQA X14, 176(R10)
VMOVDQA X15, 192(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
VMOVDQA X12, 208(R10)
VMOVDQA X13, 224(R10)
VMOVDQA X14, 240(R10)
VMOVDQA X15, 256(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
SHUFFLE_AVX()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
SHUFFLE_AVX_INV()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
SHUFFLE_AVX()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
SHUFFLE_AVX_INV()
VMOVDQU 32(AX), X14
VMOVDQU 48(AX), X15
VPXOR X0, X10, X10
VPXOR X1, X11, X11
VPXOR X2, X14, X14
VPXOR X3, X15, X15
VPXOR X4, X10, X10
VPXOR X5, X11, X11
VPXOR X6, X14, X2
VPXOR X7, X15, X3
VMOVDQU X2, 32(AX)
VMOVDQU X3, 48(AX)
LEAQ 128(SI), SI
SUBQ $128, DI
JNE loop
VMOVDQU X10, 0(AX)
VMOVDQU X11, 16(AX)
MOVQ R8, 0(BX)
MOVQ R9, 8(BX)
VZEROUPPER
RET

278
vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s generated vendored Normal file
View File

@ -0,0 +1,278 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
#include "textflag.h"
DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
MOVO v4, t1; \
MOVO v5, v4; \
MOVO t1, v5; \
MOVO v6, t1; \
PUNPCKLQDQ v6, t2; \
PUNPCKHQDQ v7, v6; \
PUNPCKHQDQ t2, v6; \
PUNPCKLQDQ v7, t2; \
MOVO t1, v7; \
MOVO v2, t1; \
PUNPCKHQDQ t2, v7; \
PUNPCKLQDQ v3, t2; \
PUNPCKHQDQ t2, v2; \
PUNPCKLQDQ t1, t2; \
PUNPCKHQDQ t2, v3
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
MOVO v4, t1; \
MOVO v5, v4; \
MOVO t1, v5; \
MOVO v2, t1; \
PUNPCKLQDQ v2, t2; \
PUNPCKHQDQ v3, v2; \
PUNPCKHQDQ t2, v2; \
PUNPCKLQDQ v3, t2; \
MOVO t1, v3; \
MOVO v6, t1; \
PUNPCKHQDQ t2, v3; \
PUNPCKLQDQ v7, t2; \
PUNPCKHQDQ t2, v6; \
PUNPCKLQDQ t1, t2; \
PUNPCKHQDQ t2, v7
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
PADDQ m0, v0; \
PADDQ m1, v1; \
PADDQ v2, v0; \
PADDQ v3, v1; \
PXOR v0, v6; \
PXOR v1, v7; \
PSHUFD $0xB1, v6, v6; \
PSHUFD $0xB1, v7, v7; \
PADDQ v6, v4; \
PADDQ v7, v5; \
PXOR v4, v2; \
PXOR v5, v3; \
PSHUFB c40, v2; \
PSHUFB c40, v3; \
PADDQ m2, v0; \
PADDQ m3, v1; \
PADDQ v2, v0; \
PADDQ v3, v1; \
PXOR v0, v6; \
PXOR v1, v7; \
PSHUFB c48, v6; \
PSHUFB c48, v7; \
PADDQ v6, v4; \
PADDQ v7, v5; \
PXOR v4, v2; \
PXOR v5, v3; \
MOVOU v2, t0; \
PADDQ v2, t0; \
PSRLQ $63, v2; \
PXOR t0, v2; \
MOVOU v3, t0; \
PADDQ v3, t0; \
PSRLQ $63, v3; \
PXOR t0, v3
#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
MOVQ i0*8(src), m0; \
PINSRQ $1, i1*8(src), m0; \
MOVQ i2*8(src), m1; \
PINSRQ $1, i3*8(src), m1; \
MOVQ i4*8(src), m2; \
PINSRQ $1, i5*8(src), m2; \
MOVQ i6*8(src), m3; \
PINSRQ $1, i7*8(src), m3
// func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
MOVQ h+0(FP), AX
MOVQ c+8(FP), BX
MOVQ flag+16(FP), CX
MOVQ blocks_base+24(FP), SI
MOVQ blocks_len+32(FP), DI
MOVQ SP, R10
ADDQ $15, R10
ANDQ $~15, R10
MOVOU ·iv3<>(SB), X0
MOVO X0, 0(R10)
XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0)
MOVOU ·c40<>(SB), X13
MOVOU ·c48<>(SB), X14
MOVOU 0(AX), X12
MOVOU 16(AX), X15
MOVQ 0(BX), R8
MOVQ 8(BX), R9
loop:
ADDQ $128, R8
CMPQ R8, $128
JGE noinc
INCQ R9
noinc:
MOVQ R8, X8
PINSRQ $1, R9, X8
MOVO X12, X0
MOVO X15, X1
MOVOU 32(AX), X2
MOVOU 48(AX), X3
MOVOU ·iv0<>(SB), X4
MOVOU ·iv1<>(SB), X5
MOVOU ·iv2<>(SB), X6
PXOR X8, X6
MOVO 0(R10), X7
LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
MOVO X8, 16(R10)
MOVO X9, 32(R10)
MOVO X10, 48(R10)
MOVO X11, 64(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
MOVO X8, 80(R10)
MOVO X9, 96(R10)
MOVO X10, 112(R10)
MOVO X11, 128(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
MOVO X8, 144(R10)
MOVO X9, 160(R10)
MOVO X10, 176(R10)
MOVO X11, 192(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
MOVO X8, 208(R10)
MOVO X9, 224(R10)
MOVO X10, 240(R10)
MOVO X11, 256(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
MOVOU 32(AX), X10
MOVOU 48(AX), X11
PXOR X0, X12
PXOR X1, X15
PXOR X2, X10
PXOR X3, X11
PXOR X4, X12
PXOR X5, X15
PXOR X6, X10
PXOR X7, X11
MOVOU X10, 32(AX)
MOVOU X11, 48(AX)
LEAQ 128(SI), SI
SUBQ $128, DI
JNE loop
MOVOU X12, 0(AX)
MOVOU X15, 16(AX)
MOVQ R8, 0(BX)
MOVQ R9, 8(BX)
RET

182
vendor/golang.org/x/crypto/blake2b/blake2b_generic.go generated vendored Normal file
View File

@ -0,0 +1,182 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blake2b
import (
"encoding/binary"
"math/bits"
)
// the precomputed values for BLAKE2b
// there are 12 16-byte arrays - one for each round
// the entries are calculated from the sigma constants.
var precomputed = [12][16]byte{
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, // equal to the first
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, // equal to the second
}
func hashBlocksGeneric(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
var m [16]uint64
c0, c1 := c[0], c[1]
for i := 0; i < len(blocks); {
c0 += BlockSize
if c0 < BlockSize {
c1++
}
v0, v1, v2, v3, v4, v5, v6, v7 := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
v8, v9, v10, v11, v12, v13, v14, v15 := iv[0], iv[1], iv[2], iv[3], iv[4], iv[5], iv[6], iv[7]
v12 ^= c0
v13 ^= c1
v14 ^= flag
for j := range m {
m[j] = binary.LittleEndian.Uint64(blocks[i:])
i += 8
}
for j := range precomputed {
s := &(precomputed[j])
v0 += m[s[0]]
v0 += v4
v12 ^= v0
v12 = bits.RotateLeft64(v12, -32)
v8 += v12
v4 ^= v8
v4 = bits.RotateLeft64(v4, -24)
v1 += m[s[1]]
v1 += v5
v13 ^= v1
v13 = bits.RotateLeft64(v13, -32)
v9 += v13
v5 ^= v9
v5 = bits.RotateLeft64(v5, -24)
v2 += m[s[2]]
v2 += v6
v14 ^= v2
v14 = bits.RotateLeft64(v14, -32)
v10 += v14
v6 ^= v10
v6 = bits.RotateLeft64(v6, -24)
v3 += m[s[3]]
v3 += v7
v15 ^= v3
v15 = bits.RotateLeft64(v15, -32)
v11 += v15
v7 ^= v11
v7 = bits.RotateLeft64(v7, -24)
v0 += m[s[4]]
v0 += v4
v12 ^= v0
v12 = bits.RotateLeft64(v12, -16)
v8 += v12
v4 ^= v8
v4 = bits.RotateLeft64(v4, -63)
v1 += m[s[5]]
v1 += v5
v13 ^= v1
v13 = bits.RotateLeft64(v13, -16)
v9 += v13
v5 ^= v9
v5 = bits.RotateLeft64(v5, -63)
v2 += m[s[6]]
v2 += v6
v14 ^= v2
v14 = bits.RotateLeft64(v14, -16)
v10 += v14
v6 ^= v10
v6 = bits.RotateLeft64(v6, -63)
v3 += m[s[7]]
v3 += v7
v15 ^= v3
v15 = bits.RotateLeft64(v15, -16)
v11 += v15
v7 ^= v11
v7 = bits.RotateLeft64(v7, -63)
v0 += m[s[8]]
v0 += v5
v15 ^= v0
v15 = bits.RotateLeft64(v15, -32)
v10 += v15
v5 ^= v10
v5 = bits.RotateLeft64(v5, -24)
v1 += m[s[9]]
v1 += v6
v12 ^= v1
v12 = bits.RotateLeft64(v12, -32)
v11 += v12
v6 ^= v11
v6 = bits.RotateLeft64(v6, -24)
v2 += m[s[10]]
v2 += v7
v13 ^= v2
v13 = bits.RotateLeft64(v13, -32)
v8 += v13
v7 ^= v8
v7 = bits.RotateLeft64(v7, -24)
v3 += m[s[11]]
v3 += v4
v14 ^= v3
v14 = bits.RotateLeft64(v14, -32)
v9 += v14
v4 ^= v9
v4 = bits.RotateLeft64(v4, -24)
v0 += m[s[12]]
v0 += v5
v15 ^= v0
v15 = bits.RotateLeft64(v15, -16)
v10 += v15
v5 ^= v10
v5 = bits.RotateLeft64(v5, -63)
v1 += m[s[13]]
v1 += v6
v12 ^= v1
v12 = bits.RotateLeft64(v12, -16)
v11 += v12
v6 ^= v11
v6 = bits.RotateLeft64(v6, -63)
v2 += m[s[14]]
v2 += v7
v13 ^= v2
v13 = bits.RotateLeft64(v13, -16)
v8 += v13
v7 ^= v8
v7 = bits.RotateLeft64(v7, -63)
v3 += m[s[15]]
v3 += v4
v14 ^= v3
v14 = bits.RotateLeft64(v14, -16)
v9 += v14
v4 ^= v9
v4 = bits.RotateLeft64(v4, -63)
}
h[0] ^= v0 ^ v8
h[1] ^= v1 ^ v9
h[2] ^= v2 ^ v10
h[3] ^= v3 ^ v11
h[4] ^= v4 ^ v12
h[5] ^= v5 ^ v13
h[6] ^= v6 ^ v14
h[7] ^= v7 ^ v15
}
c[0], c[1] = c0, c1
}

11
vendor/golang.org/x/crypto/blake2b/blake2b_ref.go generated vendored Normal file
View File

@ -0,0 +1,11 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || purego || !gc
package blake2b
func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
hashBlocksGeneric(h, c, flag, blocks)
}

177
vendor/golang.org/x/crypto/blake2b/blake2x.go generated vendored Normal file
View File

@ -0,0 +1,177 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blake2b
import (
"encoding/binary"
"errors"
"io"
)
// XOF defines the interface to hash functions that
// support arbitrary-length output.
type XOF interface {
// Write absorbs more data into the hash's state. It panics if called
// after Read.
io.Writer
// Read reads more output from the hash. It returns io.EOF if the limit
// has been reached.
io.Reader
// Clone returns a copy of the XOF in its current state.
Clone() XOF
// Reset resets the XOF to its initial state.
Reset()
}
// OutputLengthUnknown can be used as the size argument to NewXOF to indicate
// the length of the output is not known in advance.
const OutputLengthUnknown = 0
// magicUnknownOutputLength is a magic value for the output size that indicates
// an unknown number of output bytes.
const magicUnknownOutputLength = (1 << 32) - 1
// maxOutputLength is the absolute maximum number of bytes to produce when the
// number of output bytes is unknown.
const maxOutputLength = (1 << 32) * 64
// NewXOF creates a new variable-output-length hash. The hash either produce a
// known number of bytes (1 <= size < 2**32-1), or an unknown number of bytes
// (size == OutputLengthUnknown). In the latter case, an absolute limit of
// 256GiB applies.
//
// A non-nil key turns the hash into a MAC. The key must between
// zero and 32 bytes long.
func NewXOF(size uint32, key []byte) (XOF, error) {
if len(key) > Size {
return nil, errKeySize
}
if size == magicUnknownOutputLength {
// 2^32-1 indicates an unknown number of bytes and thus isn't a
// valid length.
return nil, errors.New("blake2b: XOF length too large")
}
if size == OutputLengthUnknown {
size = magicUnknownOutputLength
}
x := &xof{
d: digest{
size: Size,
keyLen: len(key),
},
length: size,
}
copy(x.d.key[:], key)
x.Reset()
return x, nil
}
type xof struct {
d digest
length uint32
remaining uint64
cfg, root, block [Size]byte
offset int
nodeOffset uint32
readMode bool
}
func (x *xof) Write(p []byte) (n int, err error) {
if x.readMode {
panic("blake2b: write to XOF after read")
}
return x.d.Write(p)
}
func (x *xof) Clone() XOF {
clone := *x
return &clone
}
func (x *xof) Reset() {
x.cfg[0] = byte(Size)
binary.LittleEndian.PutUint32(x.cfg[4:], uint32(Size)) // leaf length
binary.LittleEndian.PutUint32(x.cfg[12:], x.length) // XOF length
x.cfg[17] = byte(Size) // inner hash size
x.d.Reset()
x.d.h[1] ^= uint64(x.length) << 32
x.remaining = uint64(x.length)
if x.remaining == magicUnknownOutputLength {
x.remaining = maxOutputLength
}
x.offset, x.nodeOffset = 0, 0
x.readMode = false
}
func (x *xof) Read(p []byte) (n int, err error) {
if !x.readMode {
x.d.finalize(&x.root)
x.readMode = true
}
if x.remaining == 0 {
return 0, io.EOF
}
n = len(p)
if uint64(n) > x.remaining {
n = int(x.remaining)
p = p[:n]
}
if x.offset > 0 {
blockRemaining := Size - x.offset
if n < blockRemaining {
x.offset += copy(p, x.block[x.offset:])
x.remaining -= uint64(n)
return
}
copy(p, x.block[x.offset:])
p = p[blockRemaining:]
x.offset = 0
x.remaining -= uint64(blockRemaining)
}
for len(p) >= Size {
binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
x.nodeOffset++
x.d.initConfig(&x.cfg)
x.d.Write(x.root[:])
x.d.finalize(&x.block)
copy(p, x.block[:])
p = p[Size:]
x.remaining -= uint64(Size)
}
if todo := len(p); todo > 0 {
if x.remaining < uint64(Size) {
x.cfg[0] = byte(x.remaining)
}
binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
x.nodeOffset++
x.d.initConfig(&x.cfg)
x.d.Write(x.root[:])
x.d.finalize(&x.block)
x.offset = copy(p, x.block[:todo])
x.remaining -= uint64(todo)
}
return
}
func (d *digest) initConfig(cfg *[Size]byte) {
d.offset, d.c[0], d.c[1] = 0, 0, 0
for i := range d.h {
d.h[i] = iv[i] ^ binary.LittleEndian.Uint64(cfg[i*8:])
}
}

30
vendor/golang.org/x/crypto/blake2b/register.go generated vendored Normal file
View File

@ -0,0 +1,30 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blake2b
import (
"crypto"
"hash"
)
func init() {
newHash256 := func() hash.Hash {
h, _ := New256(nil)
return h
}
newHash384 := func() hash.Hash {
h, _ := New384(nil)
return h
}
newHash512 := func() hash.Hash {
h, _ := New512(nil)
return h
}
crypto.RegisterHash(crypto.BLAKE2b_256, newHash256)
crypto.RegisterHash(crypto.BLAKE2b_384, newHash384)
crypto.RegisterHash(crypto.BLAKE2b_512, newHash512)
}

173
vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go generated vendored Normal file
View File

@ -0,0 +1,173 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
Package secretbox encrypts and authenticates small messages.
Secretbox uses XSalsa20 and Poly1305 to encrypt and authenticate messages with
secret-key cryptography. The length of messages is not hidden.
It is the caller's responsibility to ensure the uniqueness of nonces—for
example, by using nonce 1 for the first message, nonce 2 for the second
message, etc. Nonces are long enough that randomly generated nonces have
negligible risk of collision.
Messages should be small because:
1. The whole message needs to be held in memory to be processed.
2. Using large messages pressures implementations on small machines to decrypt
and process plaintext before authenticating it. This is very dangerous, and
this API does not allow it, but a protocol that uses excessive message sizes
might present some implementations with no other choice.
3. Fixed overheads will be sufficiently amortised by messages as small as 8KB.
4. Performance may be improved by working with messages that fit into data caches.
Thus large amounts of data should be chunked so that each message is small.
(Each message still needs a unique nonce.) If in doubt, 16KB is a reasonable
chunk size.
This package is interoperable with NaCl: https://nacl.cr.yp.to/secretbox.html.
*/
package secretbox
import (
"golang.org/x/crypto/internal/alias"
"golang.org/x/crypto/internal/poly1305"
"golang.org/x/crypto/salsa20/salsa"
)
// Overhead is the number of bytes of overhead when boxing a message.
const Overhead = poly1305.TagSize
// setup produces a sub-key and Salsa20 counter given a nonce and key.
func setup(subKey *[32]byte, counter *[16]byte, nonce *[24]byte, key *[32]byte) {
// We use XSalsa20 for encryption so first we need to generate a
// key and nonce with HSalsa20.
var hNonce [16]byte
copy(hNonce[:], nonce[:])
salsa.HSalsa20(subKey, &hNonce, key, &salsa.Sigma)
// The final 8 bytes of the original nonce form the new nonce.
copy(counter[:], nonce[16:])
}
// sliceForAppend takes a slice and a requested number of bytes. It returns a
// slice with the contents of the given slice followed by that many bytes and a
// second slice that aliases into it and contains only the extra bytes. If the
// original slice has sufficient capacity then no allocation is performed.
func sliceForAppend(in []byte, n int) (head, tail []byte) {
if total := len(in) + n; cap(in) >= total {
head = in[:total]
} else {
head = make([]byte, total)
copy(head, in)
}
tail = head[len(in):]
return
}
// Seal appends an encrypted and authenticated copy of message to out, which
// must not overlap message. The key and nonce pair must be unique for each
// distinct message and the output will be Overhead bytes longer than message.
func Seal(out, message []byte, nonce *[24]byte, key *[32]byte) []byte {
var subKey [32]byte
var counter [16]byte
setup(&subKey, &counter, nonce, key)
// The Poly1305 key is generated by encrypting 32 bytes of zeros. Since
// Salsa20 works with 64-byte blocks, we also generate 32 bytes of
// keystream as a side effect.
var firstBlock [64]byte
salsa.XORKeyStream(firstBlock[:], firstBlock[:], &counter, &subKey)
var poly1305Key [32]byte
copy(poly1305Key[:], firstBlock[:])
ret, out := sliceForAppend(out, len(message)+poly1305.TagSize)
if alias.AnyOverlap(out, message) {
panic("nacl: invalid buffer overlap")
}
// We XOR up to 32 bytes of message with the keystream generated from
// the first block.
firstMessageBlock := message
if len(firstMessageBlock) > 32 {
firstMessageBlock = firstMessageBlock[:32]
}
tagOut := out
out = out[poly1305.TagSize:]
for i, x := range firstMessageBlock {
out[i] = firstBlock[32+i] ^ x
}
message = message[len(firstMessageBlock):]
ciphertext := out
out = out[len(firstMessageBlock):]
// Now encrypt the rest.
counter[8] = 1
salsa.XORKeyStream(out, message, &counter, &subKey)
var tag [poly1305.TagSize]byte
poly1305.Sum(&tag, ciphertext, &poly1305Key)
copy(tagOut, tag[:])
return ret
}
// Open authenticates and decrypts a box produced by Seal and appends the
// message to out, which must not overlap box. The output will be Overhead
// bytes smaller than box.
func Open(out, box []byte, nonce *[24]byte, key *[32]byte) ([]byte, bool) {
if len(box) < Overhead {
return nil, false
}
var subKey [32]byte
var counter [16]byte
setup(&subKey, &counter, nonce, key)
// The Poly1305 key is generated by encrypting 32 bytes of zeros. Since
// Salsa20 works with 64-byte blocks, we also generate 32 bytes of
// keystream as a side effect.
var firstBlock [64]byte
salsa.XORKeyStream(firstBlock[:], firstBlock[:], &counter, &subKey)
var poly1305Key [32]byte
copy(poly1305Key[:], firstBlock[:])
var tag [poly1305.TagSize]byte
copy(tag[:], box)
if !poly1305.Verify(&tag, box[poly1305.TagSize:], &poly1305Key) {
return nil, false
}
ret, out := sliceForAppend(out, len(box)-Overhead)
if alias.AnyOverlap(out, box) {
panic("nacl: invalid buffer overlap")
}
// We XOR up to 32 bytes of box with the keystream generated from
// the first block.
box = box[Overhead:]
firstMessageBlock := box
if len(firstMessageBlock) > 32 {
firstMessageBlock = firstMessageBlock[:32]
}
for i, x := range firstMessageBlock {
out[i] = firstBlock[32+i] ^ x
}
box = box[len(firstMessageBlock):]
out = out[len(firstMessageBlock):]
// Now decrypt the rest.
counter[8] = 1
salsa.XORKeyStream(out, box, &counter, &subKey)
return ret, true
}

146
vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go generated vendored Normal file
View File

@ -0,0 +1,146 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package salsa provides low-level access to functions in the Salsa family.
package salsa
import "math/bits"
// Sigma is the Salsa20 constant for 256-bit keys.
var Sigma = [16]byte{'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', '2', '-', 'b', 'y', 't', 'e', ' ', 'k'}
// HSalsa20 applies the HSalsa20 core function to a 16-byte input in, 32-byte
// key k, and 16-byte constant c, and puts the result into the 32-byte array
// out.
func HSalsa20(out *[32]byte, in *[16]byte, k *[32]byte, c *[16]byte) {
x0 := uint32(c[0]) | uint32(c[1])<<8 | uint32(c[2])<<16 | uint32(c[3])<<24
x1 := uint32(k[0]) | uint32(k[1])<<8 | uint32(k[2])<<16 | uint32(k[3])<<24
x2 := uint32(k[4]) | uint32(k[5])<<8 | uint32(k[6])<<16 | uint32(k[7])<<24
x3 := uint32(k[8]) | uint32(k[9])<<8 | uint32(k[10])<<16 | uint32(k[11])<<24
x4 := uint32(k[12]) | uint32(k[13])<<8 | uint32(k[14])<<16 | uint32(k[15])<<24
x5 := uint32(c[4]) | uint32(c[5])<<8 | uint32(c[6])<<16 | uint32(c[7])<<24
x6 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
x7 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24
x8 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24
x9 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24
x10 := uint32(c[8]) | uint32(c[9])<<8 | uint32(c[10])<<16 | uint32(c[11])<<24
x11 := uint32(k[16]) | uint32(k[17])<<8 | uint32(k[18])<<16 | uint32(k[19])<<24
x12 := uint32(k[20]) | uint32(k[21])<<8 | uint32(k[22])<<16 | uint32(k[23])<<24
x13 := uint32(k[24]) | uint32(k[25])<<8 | uint32(k[26])<<16 | uint32(k[27])<<24
x14 := uint32(k[28]) | uint32(k[29])<<8 | uint32(k[30])<<16 | uint32(k[31])<<24
x15 := uint32(c[12]) | uint32(c[13])<<8 | uint32(c[14])<<16 | uint32(c[15])<<24
for i := 0; i < 20; i += 2 {
u := x0 + x12
x4 ^= bits.RotateLeft32(u, 7)
u = x4 + x0
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x4
x12 ^= bits.RotateLeft32(u, 13)
u = x12 + x8
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x1
x9 ^= bits.RotateLeft32(u, 7)
u = x9 + x5
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x9
x1 ^= bits.RotateLeft32(u, 13)
u = x1 + x13
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x6
x14 ^= bits.RotateLeft32(u, 7)
u = x14 + x10
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x14
x6 ^= bits.RotateLeft32(u, 13)
u = x6 + x2
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x11
x3 ^= bits.RotateLeft32(u, 7)
u = x3 + x15
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x3
x11 ^= bits.RotateLeft32(u, 13)
u = x11 + x7
x15 ^= bits.RotateLeft32(u, 18)
u = x0 + x3
x1 ^= bits.RotateLeft32(u, 7)
u = x1 + x0
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x1
x3 ^= bits.RotateLeft32(u, 13)
u = x3 + x2
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x4
x6 ^= bits.RotateLeft32(u, 7)
u = x6 + x5
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x6
x4 ^= bits.RotateLeft32(u, 13)
u = x4 + x7
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x9
x11 ^= bits.RotateLeft32(u, 7)
u = x11 + x10
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x11
x9 ^= bits.RotateLeft32(u, 13)
u = x9 + x8
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x14
x12 ^= bits.RotateLeft32(u, 7)
u = x12 + x15
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x12
x14 ^= bits.RotateLeft32(u, 13)
u = x14 + x13
x15 ^= bits.RotateLeft32(u, 18)
}
out[0] = byte(x0)
out[1] = byte(x0 >> 8)
out[2] = byte(x0 >> 16)
out[3] = byte(x0 >> 24)
out[4] = byte(x5)
out[5] = byte(x5 >> 8)
out[6] = byte(x5 >> 16)
out[7] = byte(x5 >> 24)
out[8] = byte(x10)
out[9] = byte(x10 >> 8)
out[10] = byte(x10 >> 16)
out[11] = byte(x10 >> 24)
out[12] = byte(x15)
out[13] = byte(x15 >> 8)
out[14] = byte(x15 >> 16)
out[15] = byte(x15 >> 24)
out[16] = byte(x6)
out[17] = byte(x6 >> 8)
out[18] = byte(x6 >> 16)
out[19] = byte(x6 >> 24)
out[20] = byte(x7)
out[21] = byte(x7 >> 8)
out[22] = byte(x7 >> 16)
out[23] = byte(x7 >> 24)
out[24] = byte(x8)
out[25] = byte(x8 >> 8)
out[26] = byte(x8 >> 16)
out[27] = byte(x8 >> 24)
out[28] = byte(x9)
out[29] = byte(x9 >> 8)
out[30] = byte(x9 >> 16)
out[31] = byte(x9 >> 24)
}

201
vendor/golang.org/x/crypto/salsa20/salsa/salsa208.go generated vendored Normal file
View File

@ -0,0 +1,201 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package salsa
import "math/bits"
// Core208 applies the Salsa20/8 core function to the 64-byte array in and puts
// the result into the 64-byte array out. The input and output may be the same array.
func Core208(out *[64]byte, in *[64]byte) {
j0 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
j1 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24
j2 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24
j3 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24
j4 := uint32(in[16]) | uint32(in[17])<<8 | uint32(in[18])<<16 | uint32(in[19])<<24
j5 := uint32(in[20]) | uint32(in[21])<<8 | uint32(in[22])<<16 | uint32(in[23])<<24
j6 := uint32(in[24]) | uint32(in[25])<<8 | uint32(in[26])<<16 | uint32(in[27])<<24
j7 := uint32(in[28]) | uint32(in[29])<<8 | uint32(in[30])<<16 | uint32(in[31])<<24
j8 := uint32(in[32]) | uint32(in[33])<<8 | uint32(in[34])<<16 | uint32(in[35])<<24
j9 := uint32(in[36]) | uint32(in[37])<<8 | uint32(in[38])<<16 | uint32(in[39])<<24
j10 := uint32(in[40]) | uint32(in[41])<<8 | uint32(in[42])<<16 | uint32(in[43])<<24
j11 := uint32(in[44]) | uint32(in[45])<<8 | uint32(in[46])<<16 | uint32(in[47])<<24
j12 := uint32(in[48]) | uint32(in[49])<<8 | uint32(in[50])<<16 | uint32(in[51])<<24
j13 := uint32(in[52]) | uint32(in[53])<<8 | uint32(in[54])<<16 | uint32(in[55])<<24
j14 := uint32(in[56]) | uint32(in[57])<<8 | uint32(in[58])<<16 | uint32(in[59])<<24
j15 := uint32(in[60]) | uint32(in[61])<<8 | uint32(in[62])<<16 | uint32(in[63])<<24
x0, x1, x2, x3, x4, x5, x6, x7, x8 := j0, j1, j2, j3, j4, j5, j6, j7, j8
x9, x10, x11, x12, x13, x14, x15 := j9, j10, j11, j12, j13, j14, j15
for i := 0; i < 8; i += 2 {
u := x0 + x12
x4 ^= bits.RotateLeft32(u, 7)
u = x4 + x0
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x4
x12 ^= bits.RotateLeft32(u, 13)
u = x12 + x8
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x1
x9 ^= bits.RotateLeft32(u, 7)
u = x9 + x5
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x9
x1 ^= bits.RotateLeft32(u, 13)
u = x1 + x13
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x6
x14 ^= bits.RotateLeft32(u, 7)
u = x14 + x10
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x14
x6 ^= bits.RotateLeft32(u, 13)
u = x6 + x2
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x11
x3 ^= bits.RotateLeft32(u, 7)
u = x3 + x15
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x3
x11 ^= bits.RotateLeft32(u, 13)
u = x11 + x7
x15 ^= bits.RotateLeft32(u, 18)
u = x0 + x3
x1 ^= bits.RotateLeft32(u, 7)
u = x1 + x0
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x1
x3 ^= bits.RotateLeft32(u, 13)
u = x3 + x2
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x4
x6 ^= bits.RotateLeft32(u, 7)
u = x6 + x5
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x6
x4 ^= bits.RotateLeft32(u, 13)
u = x4 + x7
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x9
x11 ^= bits.RotateLeft32(u, 7)
u = x11 + x10
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x11
x9 ^= bits.RotateLeft32(u, 13)
u = x9 + x8
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x14
x12 ^= bits.RotateLeft32(u, 7)
u = x12 + x15
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x12
x14 ^= bits.RotateLeft32(u, 13)
u = x14 + x13
x15 ^= bits.RotateLeft32(u, 18)
}
x0 += j0
x1 += j1
x2 += j2
x3 += j3
x4 += j4
x5 += j5
x6 += j6
x7 += j7
x8 += j8
x9 += j9
x10 += j10
x11 += j11
x12 += j12
x13 += j13
x14 += j14
x15 += j15
out[0] = byte(x0)
out[1] = byte(x0 >> 8)
out[2] = byte(x0 >> 16)
out[3] = byte(x0 >> 24)
out[4] = byte(x1)
out[5] = byte(x1 >> 8)
out[6] = byte(x1 >> 16)
out[7] = byte(x1 >> 24)
out[8] = byte(x2)
out[9] = byte(x2 >> 8)
out[10] = byte(x2 >> 16)
out[11] = byte(x2 >> 24)
out[12] = byte(x3)
out[13] = byte(x3 >> 8)
out[14] = byte(x3 >> 16)
out[15] = byte(x3 >> 24)
out[16] = byte(x4)
out[17] = byte(x4 >> 8)
out[18] = byte(x4 >> 16)
out[19] = byte(x4 >> 24)
out[20] = byte(x5)
out[21] = byte(x5 >> 8)
out[22] = byte(x5 >> 16)
out[23] = byte(x5 >> 24)
out[24] = byte(x6)
out[25] = byte(x6 >> 8)
out[26] = byte(x6 >> 16)
out[27] = byte(x6 >> 24)
out[28] = byte(x7)
out[29] = byte(x7 >> 8)
out[30] = byte(x7 >> 16)
out[31] = byte(x7 >> 24)
out[32] = byte(x8)
out[33] = byte(x8 >> 8)
out[34] = byte(x8 >> 16)
out[35] = byte(x8 >> 24)
out[36] = byte(x9)
out[37] = byte(x9 >> 8)
out[38] = byte(x9 >> 16)
out[39] = byte(x9 >> 24)
out[40] = byte(x10)
out[41] = byte(x10 >> 8)
out[42] = byte(x10 >> 16)
out[43] = byte(x10 >> 24)
out[44] = byte(x11)
out[45] = byte(x11 >> 8)
out[46] = byte(x11 >> 16)
out[47] = byte(x11 >> 24)
out[48] = byte(x12)
out[49] = byte(x12 >> 8)
out[50] = byte(x12 >> 16)
out[51] = byte(x12 >> 24)
out[52] = byte(x13)
out[53] = byte(x13 >> 8)
out[54] = byte(x13 >> 16)
out[55] = byte(x13 >> 24)
out[56] = byte(x14)
out[57] = byte(x14 >> 8)
out[58] = byte(x14 >> 16)
out[59] = byte(x14 >> 24)
out[60] = byte(x15)
out[61] = byte(x15 >> 8)
out[62] = byte(x15 >> 16)
out[63] = byte(x15 >> 24)
}

View File

@ -0,0 +1,23 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && !purego && gc
package salsa
//go:noescape
// salsa2020XORKeyStream is implemented in salsa20_amd64.s.
func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
// XORKeyStream crypts bytes from in to out using the given key and counters.
// In and out must overlap entirely or not at all. Counter
// contains the raw salsa20 counter bytes (both nonce and block counter).
func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
if len(in) == 0 {
return
}
_ = out[len(in)-1]
salsa2020XORKeyStream(&out[0], &in[0], uint64(len(in)), &counter[0], &key[0])
}

View File

@ -0,0 +1,880 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && !purego && gc
// This code was translated into a form compatible with 6a from the public
// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.
TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
MOVQ out+0(FP),DI
MOVQ in+8(FP),SI
MOVQ n+16(FP),DX
MOVQ nonce+24(FP),CX
MOVQ key+32(FP),R8
MOVQ SP,R12
ADDQ $31, R12
ANDQ $~31, R12
MOVQ DX,R9
MOVQ CX,DX
MOVQ R8,R10
CMPQ R9,$0
JBE DONE
START:
MOVL 20(R10),CX
MOVL 0(R10),R8
MOVL 0(DX),AX
MOVL 16(R10),R11
MOVL CX,0(R12)
MOVL R8, 4 (R12)
MOVL AX, 8 (R12)
MOVL R11, 12 (R12)
MOVL 8(DX),CX
MOVL 24(R10),R8
MOVL 4(R10),AX
MOVL 4(DX),R11
MOVL CX,16(R12)
MOVL R8, 20 (R12)
MOVL AX, 24 (R12)
MOVL R11, 28 (R12)
MOVL 12(DX),CX
MOVL 12(R10),DX
MOVL 28(R10),R8
MOVL 8(R10),AX
MOVL DX,32(R12)
MOVL CX, 36 (R12)
MOVL R8, 40 (R12)
MOVL AX, 44 (R12)
MOVQ $1634760805,DX
MOVQ $857760878,CX
MOVQ $2036477234,R8
MOVQ $1797285236,AX
MOVL DX,48(R12)
MOVL CX, 52 (R12)
MOVL R8, 56 (R12)
MOVL AX, 60 (R12)
CMPQ R9,$256
JB BYTESBETWEEN1AND255
MOVOA 48(R12),X0
PSHUFL $0X55,X0,X1
PSHUFL $0XAA,X0,X2
PSHUFL $0XFF,X0,X3
PSHUFL $0X00,X0,X0
MOVOA X1,64(R12)
MOVOA X2,80(R12)
MOVOA X3,96(R12)
MOVOA X0,112(R12)
MOVOA 0(R12),X0
PSHUFL $0XAA,X0,X1
PSHUFL $0XFF,X0,X2
PSHUFL $0X00,X0,X3
PSHUFL $0X55,X0,X0
MOVOA X1,128(R12)
MOVOA X2,144(R12)
MOVOA X3,160(R12)
MOVOA X0,176(R12)
MOVOA 16(R12),X0
PSHUFL $0XFF,X0,X1
PSHUFL $0X55,X0,X2
PSHUFL $0XAA,X0,X0
MOVOA X1,192(R12)
MOVOA X2,208(R12)
MOVOA X0,224(R12)
MOVOA 32(R12),X0
PSHUFL $0X00,X0,X1
PSHUFL $0XAA,X0,X2
PSHUFL $0XFF,X0,X0
MOVOA X1,240(R12)
MOVOA X2,256(R12)
MOVOA X0,272(R12)
BYTESATLEAST256:
MOVL 16(R12),DX
MOVL 36 (R12),CX
MOVL DX,288(R12)
MOVL CX,304(R12)
SHLQ $32,CX
ADDQ CX,DX
ADDQ $1,DX
MOVQ DX,CX
SHRQ $32,CX
MOVL DX, 292 (R12)
MOVL CX, 308 (R12)
ADDQ $1,DX
MOVQ DX,CX
SHRQ $32,CX
MOVL DX, 296 (R12)
MOVL CX, 312 (R12)
ADDQ $1,DX
MOVQ DX,CX
SHRQ $32,CX
MOVL DX, 300 (R12)
MOVL CX, 316 (R12)
ADDQ $1,DX
MOVQ DX,CX
SHRQ $32,CX
MOVL DX,16(R12)
MOVL CX, 36 (R12)
MOVQ R9,352(R12)
MOVQ $20,DX
MOVOA 64(R12),X0
MOVOA 80(R12),X1
MOVOA 96(R12),X2
MOVOA 256(R12),X3
MOVOA 272(R12),X4
MOVOA 128(R12),X5
MOVOA 144(R12),X6
MOVOA 176(R12),X7
MOVOA 192(R12),X8
MOVOA 208(R12),X9
MOVOA 224(R12),X10
MOVOA 304(R12),X11
MOVOA 112(R12),X12
MOVOA 160(R12),X13
MOVOA 240(R12),X14
MOVOA 288(R12),X15
MAINLOOP1:
MOVOA X1,320(R12)
MOVOA X2,336(R12)
MOVOA X13,X1
PADDL X12,X1
MOVOA X1,X2
PSLLL $7,X1
PXOR X1,X14
PSRLL $25,X2
PXOR X2,X14
MOVOA X7,X1
PADDL X0,X1
MOVOA X1,X2
PSLLL $7,X1
PXOR X1,X11
PSRLL $25,X2
PXOR X2,X11
MOVOA X12,X1
PADDL X14,X1
MOVOA X1,X2
PSLLL $9,X1
PXOR X1,X15
PSRLL $23,X2
PXOR X2,X15
MOVOA X0,X1
PADDL X11,X1
MOVOA X1,X2
PSLLL $9,X1
PXOR X1,X9
PSRLL $23,X2
PXOR X2,X9
MOVOA X14,X1
PADDL X15,X1
MOVOA X1,X2
PSLLL $13,X1
PXOR X1,X13
PSRLL $19,X2
PXOR X2,X13
MOVOA X11,X1
PADDL X9,X1
MOVOA X1,X2
PSLLL $13,X1
PXOR X1,X7
PSRLL $19,X2
PXOR X2,X7
MOVOA X15,X1
PADDL X13,X1
MOVOA X1,X2
PSLLL $18,X1
PXOR X1,X12
PSRLL $14,X2
PXOR X2,X12
MOVOA 320(R12),X1
MOVOA X12,320(R12)
MOVOA X9,X2
PADDL X7,X2
MOVOA X2,X12
PSLLL $18,X2
PXOR X2,X0
PSRLL $14,X12
PXOR X12,X0
MOVOA X5,X2
PADDL X1,X2
MOVOA X2,X12
PSLLL $7,X2
PXOR X2,X3
PSRLL $25,X12
PXOR X12,X3
MOVOA 336(R12),X2
MOVOA X0,336(R12)
MOVOA X6,X0
PADDL X2,X0
MOVOA X0,X12
PSLLL $7,X0
PXOR X0,X4
PSRLL $25,X12
PXOR X12,X4
MOVOA X1,X0
PADDL X3,X0
MOVOA X0,X12
PSLLL $9,X0
PXOR X0,X10
PSRLL $23,X12
PXOR X12,X10
MOVOA X2,X0
PADDL X4,X0
MOVOA X0,X12
PSLLL $9,X0
PXOR X0,X8
PSRLL $23,X12
PXOR X12,X8
MOVOA X3,X0
PADDL X10,X0
MOVOA X0,X12
PSLLL $13,X0
PXOR X0,X5
PSRLL $19,X12
PXOR X12,X5
MOVOA X4,X0
PADDL X8,X0
MOVOA X0,X12
PSLLL $13,X0
PXOR X0,X6
PSRLL $19,X12
PXOR X12,X6
MOVOA X10,X0
PADDL X5,X0
MOVOA X0,X12
PSLLL $18,X0
PXOR X0,X1
PSRLL $14,X12
PXOR X12,X1
MOVOA 320(R12),X0
MOVOA X1,320(R12)
MOVOA X4,X1
PADDL X0,X1
MOVOA X1,X12
PSLLL $7,X1
PXOR X1,X7
PSRLL $25,X12
PXOR X12,X7
MOVOA X8,X1
PADDL X6,X1
MOVOA X1,X12
PSLLL $18,X1
PXOR X1,X2
PSRLL $14,X12
PXOR X12,X2
MOVOA 336(R12),X12
MOVOA X2,336(R12)
MOVOA X14,X1
PADDL X12,X1
MOVOA X1,X2
PSLLL $7,X1
PXOR X1,X5
PSRLL $25,X2
PXOR X2,X5
MOVOA X0,X1
PADDL X7,X1
MOVOA X1,X2
PSLLL $9,X1
PXOR X1,X10
PSRLL $23,X2
PXOR X2,X10
MOVOA X12,X1
PADDL X5,X1
MOVOA X1,X2
PSLLL $9,X1
PXOR X1,X8
PSRLL $23,X2
PXOR X2,X8
MOVOA X7,X1
PADDL X10,X1
MOVOA X1,X2
PSLLL $13,X1
PXOR X1,X4
PSRLL $19,X2
PXOR X2,X4
MOVOA X5,X1
PADDL X8,X1
MOVOA X1,X2
PSLLL $13,X1
PXOR X1,X14
PSRLL $19,X2
PXOR X2,X14
MOVOA X10,X1
PADDL X4,X1
MOVOA X1,X2
PSLLL $18,X1
PXOR X1,X0
PSRLL $14,X2
PXOR X2,X0
MOVOA 320(R12),X1
MOVOA X0,320(R12)
MOVOA X8,X0
PADDL X14,X0
MOVOA X0,X2
PSLLL $18,X0
PXOR X0,X12
PSRLL $14,X2
PXOR X2,X12
MOVOA X11,X0
PADDL X1,X0
MOVOA X0,X2
PSLLL $7,X0
PXOR X0,X6
PSRLL $25,X2
PXOR X2,X6
MOVOA 336(R12),X2
MOVOA X12,336(R12)
MOVOA X3,X0
PADDL X2,X0
MOVOA X0,X12
PSLLL $7,X0
PXOR X0,X13
PSRLL $25,X12
PXOR X12,X13
MOVOA X1,X0
PADDL X6,X0
MOVOA X0,X12
PSLLL $9,X0
PXOR X0,X15
PSRLL $23,X12
PXOR X12,X15
MOVOA X2,X0
PADDL X13,X0
MOVOA X0,X12
PSLLL $9,X0
PXOR X0,X9
PSRLL $23,X12
PXOR X12,X9
MOVOA X6,X0
PADDL X15,X0
MOVOA X0,X12
PSLLL $13,X0
PXOR X0,X11
PSRLL $19,X12
PXOR X12,X11
MOVOA X13,X0
PADDL X9,X0
MOVOA X0,X12
PSLLL $13,X0
PXOR X0,X3
PSRLL $19,X12
PXOR X12,X3
MOVOA X15,X0
PADDL X11,X0
MOVOA X0,X12
PSLLL $18,X0
PXOR X0,X1
PSRLL $14,X12
PXOR X12,X1
MOVOA X9,X0
PADDL X3,X0
MOVOA X0,X12
PSLLL $18,X0
PXOR X0,X2
PSRLL $14,X12
PXOR X12,X2
MOVOA 320(R12),X12
MOVOA 336(R12),X0
SUBQ $2,DX
JA MAINLOOP1
PADDL 112(R12),X12
PADDL 176(R12),X7
PADDL 224(R12),X10
PADDL 272(R12),X4
MOVD X12,DX
MOVD X7,CX
MOVD X10,R8
MOVD X4,R9
PSHUFL $0X39,X12,X12
PSHUFL $0X39,X7,X7
PSHUFL $0X39,X10,X10
PSHUFL $0X39,X4,X4
XORL 0(SI),DX
XORL 4(SI),CX
XORL 8(SI),R8
XORL 12(SI),R9
MOVL DX,0(DI)
MOVL CX,4(DI)
MOVL R8,8(DI)
MOVL R9,12(DI)
MOVD X12,DX
MOVD X7,CX
MOVD X10,R8
MOVD X4,R9
PSHUFL $0X39,X12,X12
PSHUFL $0X39,X7,X7
PSHUFL $0X39,X10,X10
PSHUFL $0X39,X4,X4
XORL 64(SI),DX
XORL 68(SI),CX
XORL 72(SI),R8
XORL 76(SI),R9
MOVL DX,64(DI)
MOVL CX,68(DI)
MOVL R8,72(DI)
MOVL R9,76(DI)
MOVD X12,DX
MOVD X7,CX
MOVD X10,R8
MOVD X4,R9
PSHUFL $0X39,X12,X12
PSHUFL $0X39,X7,X7
PSHUFL $0X39,X10,X10
PSHUFL $0X39,X4,X4
XORL 128(SI),DX
XORL 132(SI),CX
XORL 136(SI),R8
XORL 140(SI),R9
MOVL DX,128(DI)
MOVL CX,132(DI)
MOVL R8,136(DI)
MOVL R9,140(DI)
MOVD X12,DX
MOVD X7,CX
MOVD X10,R8
MOVD X4,R9
XORL 192(SI),DX
XORL 196(SI),CX
XORL 200(SI),R8
XORL 204(SI),R9
MOVL DX,192(DI)
MOVL CX,196(DI)
MOVL R8,200(DI)
MOVL R9,204(DI)
PADDL 240(R12),X14
PADDL 64(R12),X0
PADDL 128(R12),X5
PADDL 192(R12),X8
MOVD X14,DX
MOVD X0,CX
MOVD X5,R8
MOVD X8,R9
PSHUFL $0X39,X14,X14
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X5,X5
PSHUFL $0X39,X8,X8
XORL 16(SI),DX
XORL 20(SI),CX
XORL 24(SI),R8
XORL 28(SI),R9
MOVL DX,16(DI)
MOVL CX,20(DI)
MOVL R8,24(DI)
MOVL R9,28(DI)
MOVD X14,DX
MOVD X0,CX
MOVD X5,R8
MOVD X8,R9
PSHUFL $0X39,X14,X14
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X5,X5
PSHUFL $0X39,X8,X8
XORL 80(SI),DX
XORL 84(SI),CX
XORL 88(SI),R8
XORL 92(SI),R9
MOVL DX,80(DI)
MOVL CX,84(DI)
MOVL R8,88(DI)
MOVL R9,92(DI)
MOVD X14,DX
MOVD X0,CX
MOVD X5,R8
MOVD X8,R9
PSHUFL $0X39,X14,X14
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X5,X5
PSHUFL $0X39,X8,X8
XORL 144(SI),DX
XORL 148(SI),CX
XORL 152(SI),R8
XORL 156(SI),R9
MOVL DX,144(DI)
MOVL CX,148(DI)
MOVL R8,152(DI)
MOVL R9,156(DI)
MOVD X14,DX
MOVD X0,CX
MOVD X5,R8
MOVD X8,R9
XORL 208(SI),DX
XORL 212(SI),CX
XORL 216(SI),R8
XORL 220(SI),R9
MOVL DX,208(DI)
MOVL CX,212(DI)
MOVL R8,216(DI)
MOVL R9,220(DI)
PADDL 288(R12),X15
PADDL 304(R12),X11
PADDL 80(R12),X1
PADDL 144(R12),X6
MOVD X15,DX
MOVD X11,CX
MOVD X1,R8
MOVD X6,R9
PSHUFL $0X39,X15,X15
PSHUFL $0X39,X11,X11
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X6,X6
XORL 32(SI),DX
XORL 36(SI),CX
XORL 40(SI),R8
XORL 44(SI),R9
MOVL DX,32(DI)
MOVL CX,36(DI)
MOVL R8,40(DI)
MOVL R9,44(DI)
MOVD X15,DX
MOVD X11,CX
MOVD X1,R8
MOVD X6,R9
PSHUFL $0X39,X15,X15
PSHUFL $0X39,X11,X11
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X6,X6
XORL 96(SI),DX
XORL 100(SI),CX
XORL 104(SI),R8
XORL 108(SI),R9
MOVL DX,96(DI)
MOVL CX,100(DI)
MOVL R8,104(DI)
MOVL R9,108(DI)
MOVD X15,DX
MOVD X11,CX
MOVD X1,R8
MOVD X6,R9
PSHUFL $0X39,X15,X15
PSHUFL $0X39,X11,X11
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X6,X6
XORL 160(SI),DX
XORL 164(SI),CX
XORL 168(SI),R8
XORL 172(SI),R9
MOVL DX,160(DI)
MOVL CX,164(DI)
MOVL R8,168(DI)
MOVL R9,172(DI)
MOVD X15,DX
MOVD X11,CX
MOVD X1,R8
MOVD X6,R9
XORL 224(SI),DX
XORL 228(SI),CX
XORL 232(SI),R8
XORL 236(SI),R9
MOVL DX,224(DI)
MOVL CX,228(DI)
MOVL R8,232(DI)
MOVL R9,236(DI)
PADDL 160(R12),X13
PADDL 208(R12),X9
PADDL 256(R12),X3
PADDL 96(R12),X2
MOVD X13,DX
MOVD X9,CX
MOVD X3,R8
MOVD X2,R9
PSHUFL $0X39,X13,X13
PSHUFL $0X39,X9,X9
PSHUFL $0X39,X3,X3
PSHUFL $0X39,X2,X2
XORL 48(SI),DX
XORL 52(SI),CX
XORL 56(SI),R8
XORL 60(SI),R9
MOVL DX,48(DI)
MOVL CX,52(DI)
MOVL R8,56(DI)
MOVL R9,60(DI)
MOVD X13,DX
MOVD X9,CX
MOVD X3,R8
MOVD X2,R9
PSHUFL $0X39,X13,X13
PSHUFL $0X39,X9,X9
PSHUFL $0X39,X3,X3
PSHUFL $0X39,X2,X2
XORL 112(SI),DX
XORL 116(SI),CX
XORL 120(SI),R8
XORL 124(SI),R9
MOVL DX,112(DI)
MOVL CX,116(DI)
MOVL R8,120(DI)
MOVL R9,124(DI)
MOVD X13,DX
MOVD X9,CX
MOVD X3,R8
MOVD X2,R9
PSHUFL $0X39,X13,X13
PSHUFL $0X39,X9,X9
PSHUFL $0X39,X3,X3
PSHUFL $0X39,X2,X2
XORL 176(SI),DX
XORL 180(SI),CX
XORL 184(SI),R8
XORL 188(SI),R9
MOVL DX,176(DI)
MOVL CX,180(DI)
MOVL R8,184(DI)
MOVL R9,188(DI)
MOVD X13,DX
MOVD X9,CX
MOVD X3,R8
MOVD X2,R9
XORL 240(SI),DX
XORL 244(SI),CX
XORL 248(SI),R8
XORL 252(SI),R9
MOVL DX,240(DI)
MOVL CX,244(DI)
MOVL R8,248(DI)
MOVL R9,252(DI)
MOVQ 352(R12),R9
SUBQ $256,R9
ADDQ $256,SI
ADDQ $256,DI
CMPQ R9,$256
JAE BYTESATLEAST256
CMPQ R9,$0
JBE DONE
BYTESBETWEEN1AND255:
CMPQ R9,$64
JAE NOCOPY
MOVQ DI,DX
LEAQ 360(R12),DI
MOVQ R9,CX
REP; MOVSB
LEAQ 360(R12),DI
LEAQ 360(R12),SI
NOCOPY:
MOVQ R9,352(R12)
MOVOA 48(R12),X0
MOVOA 0(R12),X1
MOVOA 16(R12),X2
MOVOA 32(R12),X3
MOVOA X1,X4
MOVQ $20,CX
MAINLOOP2:
PADDL X0,X4
MOVOA X0,X5
MOVOA X4,X6
PSLLL $7,X4
PSRLL $25,X6
PXOR X4,X3
PXOR X6,X3
PADDL X3,X5
MOVOA X3,X4
MOVOA X5,X6
PSLLL $9,X5
PSRLL $23,X6
PXOR X5,X2
PSHUFL $0X93,X3,X3
PXOR X6,X2
PADDL X2,X4
MOVOA X2,X5
MOVOA X4,X6
PSLLL $13,X4
PSRLL $19,X6
PXOR X4,X1
PSHUFL $0X4E,X2,X2
PXOR X6,X1
PADDL X1,X5
MOVOA X3,X4
MOVOA X5,X6
PSLLL $18,X5
PSRLL $14,X6
PXOR X5,X0
PSHUFL $0X39,X1,X1
PXOR X6,X0
PADDL X0,X4
MOVOA X0,X5
MOVOA X4,X6
PSLLL $7,X4
PSRLL $25,X6
PXOR X4,X1
PXOR X6,X1
PADDL X1,X5
MOVOA X1,X4
MOVOA X5,X6
PSLLL $9,X5
PSRLL $23,X6
PXOR X5,X2
PSHUFL $0X93,X1,X1
PXOR X6,X2
PADDL X2,X4
MOVOA X2,X5
MOVOA X4,X6
PSLLL $13,X4
PSRLL $19,X6
PXOR X4,X3
PSHUFL $0X4E,X2,X2
PXOR X6,X3
PADDL X3,X5
MOVOA X1,X4
MOVOA X5,X6
PSLLL $18,X5
PSRLL $14,X6
PXOR X5,X0
PSHUFL $0X39,X3,X3
PXOR X6,X0
PADDL X0,X4
MOVOA X0,X5
MOVOA X4,X6
PSLLL $7,X4
PSRLL $25,X6
PXOR X4,X3
PXOR X6,X3
PADDL X3,X5
MOVOA X3,X4
MOVOA X5,X6
PSLLL $9,X5
PSRLL $23,X6
PXOR X5,X2
PSHUFL $0X93,X3,X3
PXOR X6,X2
PADDL X2,X4
MOVOA X2,X5
MOVOA X4,X6
PSLLL $13,X4
PSRLL $19,X6
PXOR X4,X1
PSHUFL $0X4E,X2,X2
PXOR X6,X1
PADDL X1,X5
MOVOA X3,X4
MOVOA X5,X6
PSLLL $18,X5
PSRLL $14,X6
PXOR X5,X0
PSHUFL $0X39,X1,X1
PXOR X6,X0
PADDL X0,X4
MOVOA X0,X5
MOVOA X4,X6
PSLLL $7,X4
PSRLL $25,X6
PXOR X4,X1
PXOR X6,X1
PADDL X1,X5
MOVOA X1,X4
MOVOA X5,X6
PSLLL $9,X5
PSRLL $23,X6
PXOR X5,X2
PSHUFL $0X93,X1,X1
PXOR X6,X2
PADDL X2,X4
MOVOA X2,X5
MOVOA X4,X6
PSLLL $13,X4
PSRLL $19,X6
PXOR X4,X3
PSHUFL $0X4E,X2,X2
PXOR X6,X3
SUBQ $4,CX
PADDL X3,X5
MOVOA X1,X4
MOVOA X5,X6
PSLLL $18,X5
PXOR X7,X7
PSRLL $14,X6
PXOR X5,X0
PSHUFL $0X39,X3,X3
PXOR X6,X0
JA MAINLOOP2
PADDL 48(R12),X0
PADDL 0(R12),X1
PADDL 16(R12),X2
PADDL 32(R12),X3
MOVD X0,CX
MOVD X1,R8
MOVD X2,R9
MOVD X3,AX
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X2,X2
PSHUFL $0X39,X3,X3
XORL 0(SI),CX
XORL 48(SI),R8
XORL 32(SI),R9
XORL 16(SI),AX
MOVL CX,0(DI)
MOVL R8,48(DI)
MOVL R9,32(DI)
MOVL AX,16(DI)
MOVD X0,CX
MOVD X1,R8
MOVD X2,R9
MOVD X3,AX
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X2,X2
PSHUFL $0X39,X3,X3
XORL 20(SI),CX
XORL 4(SI),R8
XORL 52(SI),R9
XORL 36(SI),AX
MOVL CX,20(DI)
MOVL R8,4(DI)
MOVL R9,52(DI)
MOVL AX,36(DI)
MOVD X0,CX
MOVD X1,R8
MOVD X2,R9
MOVD X3,AX
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X2,X2
PSHUFL $0X39,X3,X3
XORL 40(SI),CX
XORL 24(SI),R8
XORL 8(SI),R9
XORL 56(SI),AX
MOVL CX,40(DI)
MOVL R8,24(DI)
MOVL R9,8(DI)
MOVL AX,56(DI)
MOVD X0,CX
MOVD X1,R8
MOVD X2,R9
MOVD X3,AX
XORL 60(SI),CX
XORL 44(SI),R8
XORL 28(SI),R9
XORL 12(SI),AX
MOVL CX,60(DI)
MOVL R8,44(DI)
MOVL R9,28(DI)
MOVL AX,12(DI)
MOVQ 352(R12),R9
MOVL 16(R12),CX
MOVL 36 (R12),R8
ADDQ $1,CX
SHLQ $32,R8
ADDQ R8,CX
MOVQ CX,R8
SHRQ $32,R8
MOVL CX,16(R12)
MOVL R8, 36 (R12)
CMPQ R9,$64
JA BYTESATLEAST65
JAE BYTESATLEAST64
MOVQ DI,SI
MOVQ DX,DI
MOVQ R9,CX
REP; MOVSB
BYTESATLEAST64:
DONE:
RET
BYTESATLEAST65:
SUBQ $64,R9
ADDQ $64,DI
ADDQ $64,SI
JMP BYTESBETWEEN1AND255

View File

@ -0,0 +1,14 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || purego || !gc
package salsa
// XORKeyStream crypts bytes from in to out using the given key and counters.
// In and out must overlap entirely or not at all. Counter
// contains the raw salsa20 counter bytes (both nonce and block counter).
func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
genericXORKeyStream(out, in, counter, key)
}

233
vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go generated vendored Normal file
View File

@ -0,0 +1,233 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package salsa
import "math/bits"
const rounds = 20
// core applies the Salsa20 core function to 16-byte input in, 32-byte key k,
// and 16-byte constant c, and puts the result into 64-byte array out.
func core(out *[64]byte, in *[16]byte, k *[32]byte, c *[16]byte) {
j0 := uint32(c[0]) | uint32(c[1])<<8 | uint32(c[2])<<16 | uint32(c[3])<<24
j1 := uint32(k[0]) | uint32(k[1])<<8 | uint32(k[2])<<16 | uint32(k[3])<<24
j2 := uint32(k[4]) | uint32(k[5])<<8 | uint32(k[6])<<16 | uint32(k[7])<<24
j3 := uint32(k[8]) | uint32(k[9])<<8 | uint32(k[10])<<16 | uint32(k[11])<<24
j4 := uint32(k[12]) | uint32(k[13])<<8 | uint32(k[14])<<16 | uint32(k[15])<<24
j5 := uint32(c[4]) | uint32(c[5])<<8 | uint32(c[6])<<16 | uint32(c[7])<<24
j6 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
j7 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24
j8 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24
j9 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24
j10 := uint32(c[8]) | uint32(c[9])<<8 | uint32(c[10])<<16 | uint32(c[11])<<24
j11 := uint32(k[16]) | uint32(k[17])<<8 | uint32(k[18])<<16 | uint32(k[19])<<24
j12 := uint32(k[20]) | uint32(k[21])<<8 | uint32(k[22])<<16 | uint32(k[23])<<24
j13 := uint32(k[24]) | uint32(k[25])<<8 | uint32(k[26])<<16 | uint32(k[27])<<24
j14 := uint32(k[28]) | uint32(k[29])<<8 | uint32(k[30])<<16 | uint32(k[31])<<24
j15 := uint32(c[12]) | uint32(c[13])<<8 | uint32(c[14])<<16 | uint32(c[15])<<24
x0, x1, x2, x3, x4, x5, x6, x7, x8 := j0, j1, j2, j3, j4, j5, j6, j7, j8
x9, x10, x11, x12, x13, x14, x15 := j9, j10, j11, j12, j13, j14, j15
for i := 0; i < rounds; i += 2 {
u := x0 + x12
x4 ^= bits.RotateLeft32(u, 7)
u = x4 + x0
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x4
x12 ^= bits.RotateLeft32(u, 13)
u = x12 + x8
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x1
x9 ^= bits.RotateLeft32(u, 7)
u = x9 + x5
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x9
x1 ^= bits.RotateLeft32(u, 13)
u = x1 + x13
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x6
x14 ^= bits.RotateLeft32(u, 7)
u = x14 + x10
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x14
x6 ^= bits.RotateLeft32(u, 13)
u = x6 + x2
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x11
x3 ^= bits.RotateLeft32(u, 7)
u = x3 + x15
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x3
x11 ^= bits.RotateLeft32(u, 13)
u = x11 + x7
x15 ^= bits.RotateLeft32(u, 18)
u = x0 + x3
x1 ^= bits.RotateLeft32(u, 7)
u = x1 + x0
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x1
x3 ^= bits.RotateLeft32(u, 13)
u = x3 + x2
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x4
x6 ^= bits.RotateLeft32(u, 7)
u = x6 + x5
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x6
x4 ^= bits.RotateLeft32(u, 13)
u = x4 + x7
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x9
x11 ^= bits.RotateLeft32(u, 7)
u = x11 + x10
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x11
x9 ^= bits.RotateLeft32(u, 13)
u = x9 + x8
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x14
x12 ^= bits.RotateLeft32(u, 7)
u = x12 + x15
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x12
x14 ^= bits.RotateLeft32(u, 13)
u = x14 + x13
x15 ^= bits.RotateLeft32(u, 18)
}
x0 += j0
x1 += j1
x2 += j2
x3 += j3
x4 += j4
x5 += j5
x6 += j6
x7 += j7
x8 += j8
x9 += j9
x10 += j10
x11 += j11
x12 += j12
x13 += j13
x14 += j14
x15 += j15
out[0] = byte(x0)
out[1] = byte(x0 >> 8)
out[2] = byte(x0 >> 16)
out[3] = byte(x0 >> 24)
out[4] = byte(x1)
out[5] = byte(x1 >> 8)
out[6] = byte(x1 >> 16)
out[7] = byte(x1 >> 24)
out[8] = byte(x2)
out[9] = byte(x2 >> 8)
out[10] = byte(x2 >> 16)
out[11] = byte(x2 >> 24)
out[12] = byte(x3)
out[13] = byte(x3 >> 8)
out[14] = byte(x3 >> 16)
out[15] = byte(x3 >> 24)
out[16] = byte(x4)
out[17] = byte(x4 >> 8)
out[18] = byte(x4 >> 16)
out[19] = byte(x4 >> 24)
out[20] = byte(x5)
out[21] = byte(x5 >> 8)
out[22] = byte(x5 >> 16)
out[23] = byte(x5 >> 24)
out[24] = byte(x6)
out[25] = byte(x6 >> 8)
out[26] = byte(x6 >> 16)
out[27] = byte(x6 >> 24)
out[28] = byte(x7)
out[29] = byte(x7 >> 8)
out[30] = byte(x7 >> 16)
out[31] = byte(x7 >> 24)
out[32] = byte(x8)
out[33] = byte(x8 >> 8)
out[34] = byte(x8 >> 16)
out[35] = byte(x8 >> 24)
out[36] = byte(x9)
out[37] = byte(x9 >> 8)
out[38] = byte(x9 >> 16)
out[39] = byte(x9 >> 24)
out[40] = byte(x10)
out[41] = byte(x10 >> 8)
out[42] = byte(x10 >> 16)
out[43] = byte(x10 >> 24)
out[44] = byte(x11)
out[45] = byte(x11 >> 8)
out[46] = byte(x11 >> 16)
out[47] = byte(x11 >> 24)
out[48] = byte(x12)
out[49] = byte(x12 >> 8)
out[50] = byte(x12 >> 16)
out[51] = byte(x12 >> 24)
out[52] = byte(x13)
out[53] = byte(x13 >> 8)
out[54] = byte(x13 >> 16)
out[55] = byte(x13 >> 24)
out[56] = byte(x14)
out[57] = byte(x14 >> 8)
out[58] = byte(x14 >> 16)
out[59] = byte(x14 >> 24)
out[60] = byte(x15)
out[61] = byte(x15 >> 8)
out[62] = byte(x15 >> 16)
out[63] = byte(x15 >> 24)
}
// genericXORKeyStream is the generic implementation of XORKeyStream to be used
// when no assembly implementation is available.
func genericXORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
var block [64]byte
var counterCopy [16]byte
copy(counterCopy[:], counter[:])
for len(in) >= 64 {
core(&block, &counterCopy, key, &Sigma)
for i, x := range block {
out[i] = in[i] ^ x
}
u := uint32(1)
for i := 8; i < 16; i++ {
u += uint32(counterCopy[i])
counterCopy[i] = byte(u)
u >>= 8
}
in = in[64:]
out = out[64:]
}
if len(in) > 0 {
core(&block, &counterCopy, key, &Sigma)
for i, v := range in {
out[i] = v ^ block[i]
}
}
}