From 559900fd3ac0135ab0e5f453fe4ba4b02d66abe1 Mon Sep 17 00:00:00 2001 From: La Ancapo <> Date: Mon, 26 Jan 2026 23:09:58 +0100 Subject: [PATCH] basic key encryption --- README.md | 14 + bundled/Crypto/KDF/Argon2.hs | 157 ++++ bundled/cbits/argon2/argon2.c | 142 ++++ bundled/cbits/argon2/argon2.h | 267 +++++++ bundled/cbits/argon2/blamka-round-opt.h | 180 +++++ bundled/cbits/argon2/blamka-round-ref.h | 56 ++ bundled/cbits/argon2/core.c | 701 ++++++++++++++++++ bundled/cbits/argon2/core.h | 234 ++++++ bundled/cbits/argon2/opt.c | 186 +++++ bundled/cbits/argon2/opt.h | 35 + bundled/cbits/argon2/ref.c | 185 +++++ bundled/cbits/argon2/ref.h | 35 + bundled/cbits/argon2/thread.c | 57 ++ bundled/cbits/argon2/thread.h | 67 ++ bundled/cbits/blake2/sse/blake2-config.h | 72 ++ bundled/cbits/blake2/sse/blake2-impl.h | 160 ++++ bundled/cbits/blake2/sse/blake2.h | 195 +++++ bundled/cbits/blake2/sse/blake2b-load-sse2.h | 68 ++ bundled/cbits/blake2/sse/blake2b-load-sse41.h | 402 ++++++++++ bundled/cbits/blake2/sse/blake2b-round.h | 157 ++++ bundled/cbits/blake2/sse/blake2b.c | 373 ++++++++++ bundled/cbits/blake2/sse/blake2bp.c | 361 +++++++++ bundled/cbits/blake2/sse/blake2s-load-sse2.h | 60 ++ bundled/cbits/blake2/sse/blake2s-load-sse41.h | 229 ++++++ bundled/cbits/blake2/sse/blake2s-load-xop.h | 191 +++++ bundled/cbits/blake2/sse/blake2s-round.h | 88 +++ bundled/cbits/blake2/sse/blake2s.c | 363 +++++++++ bundled/cbits/blake2/sse/blake2sp.c | 358 +++++++++ src/Encryption.hs | 74 ++ src/Main.hs | 27 +- stellar-veritas.cabal | 11 +- 31 files changed, 5501 insertions(+), 4 deletions(-) create mode 100644 bundled/Crypto/KDF/Argon2.hs create mode 100644 bundled/cbits/argon2/argon2.c create mode 100644 bundled/cbits/argon2/argon2.h create mode 100644 bundled/cbits/argon2/blamka-round-opt.h create mode 100644 bundled/cbits/argon2/blamka-round-ref.h create mode 100644 bundled/cbits/argon2/core.c create mode 100644 bundled/cbits/argon2/core.h create mode 100644 bundled/cbits/argon2/opt.c create mode 100644 bundled/cbits/argon2/opt.h create mode 100644 bundled/cbits/argon2/ref.c create mode 100644 bundled/cbits/argon2/ref.h create mode 100644 bundled/cbits/argon2/thread.c create mode 100644 bundled/cbits/argon2/thread.h create mode 100644 bundled/cbits/blake2/sse/blake2-config.h create mode 100644 bundled/cbits/blake2/sse/blake2-impl.h create mode 100644 bundled/cbits/blake2/sse/blake2.h create mode 100644 bundled/cbits/blake2/sse/blake2b-load-sse2.h create mode 100644 bundled/cbits/blake2/sse/blake2b-load-sse41.h create mode 100644 bundled/cbits/blake2/sse/blake2b-round.h create mode 100644 bundled/cbits/blake2/sse/blake2b.c create mode 100644 bundled/cbits/blake2/sse/blake2bp.c create mode 100644 bundled/cbits/blake2/sse/blake2s-load-sse2.h create mode 100644 bundled/cbits/blake2/sse/blake2s-load-sse41.h create mode 100644 bundled/cbits/blake2/sse/blake2s-load-xop.h create mode 100644 bundled/cbits/blake2/sse/blake2s-round.h create mode 100644 bundled/cbits/blake2/sse/blake2s.c create mode 100644 bundled/cbits/blake2/sse/blake2sp.c create mode 100644 src/Encryption.hs diff --git a/README.md b/README.md index 8492470..cb693ba 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,20 @@ The aim is to create a trustworthy Stellar transaction signer (and, by necessity, a pretty printer) using only Glasgow Haskell compiler code and Haskell Core libraries, reducing the possible supply chain attack surface. +## Installation and usage + To build and run it, install `cabal-install` and use `cabal run`. The program expects your transaction coming from the standard input, and your private (S...) key residing in `$HOME/~/.stellar-veritas-key`. It will produce the decoded transaction description afterwards, with simple descriptions of the most popular operations. When it is ran from an interactive terminal, it will ask for a confirmation before signing the transaction. +You can put the binary (`cabal list-bin stellar-veritas` to get the path) into your `$PATH` and use it directly. + +## Encryption + +The project supports key encryption. Encrypting your key protects you from key file theft (cracking a 8-character password is estimated to cost over $1M (in 2026), scales with password complexity and the performance of the computer at the moment of encryption). + +To encrypt your key, run `cabal run stellar-veritas -- encrypt` (or just `stellar-veritas encrypt` after installation) and enter your password. Afterwards you'll be prompted your password every time you want to sign anything. `encrypt`ing an already-encrpyted key would result in decryption and re-encryption with new parameters and salt. + +## Technical details + The project contains the code of trimmed-down non-core dependencies, mainly cryptographic libraries. To avoid using bundled libraries (to build against the current Hackage), do the same in the `src` directory. To further reduce the amount of code under audit, weeder can be used, although the utility is dubious. + +Key encryption is implemented via XOR with Argon2i KDF with an adaptive iteration count. diff --git a/bundled/Crypto/KDF/Argon2.hs b/bundled/Crypto/KDF/Argon2.hs new file mode 100644 index 0000000..044ba00 --- /dev/null +++ b/bundled/Crypto/KDF/Argon2.hs @@ -0,0 +1,157 @@ +-- | +-- Module : Crypto.KDF.Argon2 +-- License : BSD-style +-- Maintainer : Vincent Hanquez +-- Stability : experimental +-- Portability : unknown +-- +-- Argon2 hashing function (P-H-C winner) +-- +-- Recommended to use this module qualified +-- +-- File started from Argon2.hs, from Oliver Charles +-- at https://github.com/ocharles/argon2 +-- +module Crypto.KDF.Argon2 + ( + Options(..) + , TimeCost + , MemoryCost + , Parallelism + , Variant(..) + , Version(..) + , defaultOptions + -- * Hashing function + , hash + ) where + +import Crypto.Internal.ByteArray (ByteArray, ByteArrayAccess) +import qualified Crypto.Internal.ByteArray as B +import Crypto.Error +import Control.Monad (when) +import Data.Word +import Foreign.C +import Foreign.Ptr + +-- | Which variant of Argon2 to use. You should choose the variant that is most +-- applicable to your intention to hash inputs. +data Variant = + Argon2d -- ^ Argon2d is faster than Argon2i and uses data-depending memory access, + -- which makes it suitable for cryptocurrencies and applications with no + -- threats from side-channel timing attacks. + | Argon2i -- ^ Argon2i uses data-independent memory access, which is preferred + -- for password hashing and password-based key derivation. Argon2i + -- is slower as it makes more passes over the memory to protect from + -- tradeoff attacks. + | Argon2id -- ^ Argon2id is a hybrid of Argon2i and Argon2d, using a combination + -- of data-depending and data-independent memory accesses, which gives + -- some of Argon2i's resistance to side-channel cache timing attacks + -- and much of Argon2d's resistance to GPU cracking attacks + deriving (Eq,Ord,Read,Show,Enum,Bounded) + +-- | Which version of Argon2 to use +data Version = Version10 | Version13 + deriving (Eq,Ord,Read,Show,Enum,Bounded) + +-- | The time cost, which defines the amount of computation realized and therefore the execution time, given in number of iterations. +-- +-- 'FFI.ARGON2_MIN_TIME' <= 'hashIterations' <= 'FFI.ARGON2_MAX_TIME' +type TimeCost = Word32 + +-- | The memory cost, which defines the memory usage, given in kibibytes. +-- +-- max 'FFI.ARGON2_MIN_MEMORY' (8 * 'hashParallelism') <= 'hashMemory' <= 'FFI.ARGON2_MAX_MEMORY' +type MemoryCost = Word32 + +-- | A parallelism degree, which defines the number of parallel threads. +-- +-- 'FFI.ARGON2_MIN_LANES' <= 'hashParallelism' <= 'FFI.ARGON2_MAX_LANES' && 'FFI.ARGON_MIN_THREADS' <= 'hashParallelism' <= 'FFI.ARGON2_MAX_THREADS' +type Parallelism = Word32 + +-- | Parameters that can be adjusted to change the runtime performance of the +-- hashing. +data Options = Options + { iterations :: !TimeCost + , memory :: !MemoryCost + , parallelism :: !Parallelism + , variant :: !Variant -- ^ Which variant of Argon2 to use. + , version :: !Version -- ^ Which version of Argon2 to use. + } + deriving (Eq,Ord,Read,Show) + +saltMinLength :: Int +saltMinLength = 8 + +outputMinLength :: Int +outputMinLength = 4 + +-- specification allows up to 2^32-1 but this is too big for a signed Int +-- on a 32-bit architecture, so we limit tag length to 2^31-1 bytes +outputMaxLength :: Int +outputMaxLength = 0x7fffffff + +defaultOptions :: Options +defaultOptions = + Options { iterations = 1 + , memory = 2 ^ (17 :: Int) + , parallelism = 4 + , variant = Argon2i + , version = Version13 + } + +hash :: (ByteArrayAccess password, ByteArrayAccess salt, ByteArray out) + => Options + -> password + -> salt + -> Int + -> CryptoFailable out +hash options password salt outLen + | saltLen < saltMinLength = CryptoFailed CryptoError_SaltTooSmall + | outLen < outputMinLength = CryptoFailed CryptoError_OutputLengthTooSmall + | outLen > outputMaxLength = CryptoFailed CryptoError_OutputLengthTooBig + | otherwise = CryptoPassed $ B.allocAndFreeze outLen $ \out -> do + res <- B.withByteArray password $ \pPass -> + B.withByteArray salt $ \pSalt -> + argon2_hash (iterations options) + (memory options) + (parallelism options) + pPass + (csizeOfInt passwordLen) + pSalt + (csizeOfInt saltLen) + out + (csizeOfInt outLen) + (cOfVariant $ variant options) + (cOfVersion $ version options) + when (res /= 0) $ error "argon2: hash: internal error" + where + saltLen = B.length salt + passwordLen = B.length password + +data Pass +data Salt +data HashOut + +type CVariant = CInt -- valid value is 0 (Argon2d), 1 (Argon2i) and 2 (Argon2id) +type CVersion = CInt -- valid value is 0x10, 0x13 + +cOfVersion :: Version -> CVersion +cOfVersion Version10 = 0x10 +cOfVersion Version13 = 0x13 + +cOfVariant :: Variant -> CVariant +cOfVariant Argon2d = 0 +cOfVariant Argon2i = 1 +cOfVariant Argon2id = 2 + +csizeOfInt :: Int -> CSize +csizeOfInt = fromIntegral + +foreign import ccall unsafe "cryptonite_argon2_hash" + argon2_hash :: Word32 -> Word32 -> Word32 + -> Ptr Pass -> CSize + -> Ptr Salt -> CSize + -> Ptr HashOut -> CSize + -> CVariant + -> CVersion + -> IO CInt diff --git a/bundled/cbits/argon2/argon2.c b/bundled/cbits/argon2/argon2.c new file mode 100644 index 0000000..717522e --- /dev/null +++ b/bundled/cbits/argon2/argon2.c @@ -0,0 +1,142 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#include +#include +#include + +#include "argon2.h" +#include "core.c" + +int cryptonite_argon2_ctx(argon2_context *context, argon2_type type) { + /* 1. Validate all inputs */ + int result = validate_inputs(context); + uint32_t memory_blocks, segment_length; + argon2_instance_t instance; + + if (ARGON2_OK != result) { + return result; + } + + if (Argon2_d != type && Argon2_i != type && Argon2_id != type) { + return ARGON2_INCORRECT_TYPE; + } + + /* 2. Align memory size */ + /* Minimum memory_blocks = 8L blocks, where L is the number of lanes */ + memory_blocks = context->m_cost; + + if (memory_blocks < 2 * ARGON2_SYNC_POINTS * context->lanes) { + memory_blocks = 2 * ARGON2_SYNC_POINTS * context->lanes; + } + + segment_length = memory_blocks / (context->lanes * ARGON2_SYNC_POINTS); + /* Ensure that all segments have equal length */ + memory_blocks = segment_length * (context->lanes * ARGON2_SYNC_POINTS); + + instance.version = context->version; + instance.memory = NULL; + instance.passes = context->t_cost; + instance.memory_blocks = memory_blocks; + instance.segment_length = segment_length; + instance.lane_length = segment_length * ARGON2_SYNC_POINTS; + instance.lanes = context->lanes; + instance.threads = context->threads; + instance.type = type; + + /* 3. Initialization: Hashing inputs, allocating memory, filling first + * blocks + */ + result = initialize(&instance, context); + + if (ARGON2_OK != result) { + return result; + } + + /* 4. Filling memory */ + result = fill_memory_blocks(&instance); + + if (ARGON2_OK != result) { + return result; + } + /* 5. Finalization */ + finalize(context, &instance); + + return ARGON2_OK; +} + +int cryptonite_argon2_hash(const uint32_t t_cost, const uint32_t m_cost, + const uint32_t parallelism, const void *pwd, + const size_t pwdlen, const void *salt, const size_t saltlen, + void *hash, const size_t hashlen, argon2_type type, + const uint32_t version){ + + argon2_context context; + int result; + uint8_t *out; + + if (hashlen > ARGON2_MAX_OUTLEN) { + return ARGON2_OUTPUT_TOO_LONG; + } + + if (hashlen < ARGON2_MIN_OUTLEN) { + return ARGON2_OUTPUT_TOO_SHORT; + } + + out = malloc(hashlen); + if (!out) { + return ARGON2_MEMORY_ALLOCATION_ERROR; + } + + context.out = (uint8_t *)out; + context.outlen = (uint32_t)hashlen; + context.pwd = CONST_CAST(uint8_t *)pwd; + context.pwdlen = (uint32_t)pwdlen; + context.salt = CONST_CAST(uint8_t *)salt; + context.saltlen = (uint32_t)saltlen; + context.secret = NULL; + context.secretlen = 0; + context.ad = NULL; + context.adlen = 0; + context.t_cost = t_cost; + context.m_cost = m_cost; + context.lanes = parallelism; + context.threads = parallelism; + context.allocate_cbk = NULL; + context.free_cbk = NULL; + context.flags = ARGON2_DEFAULT_FLAGS; + context.version = version; + + result = cryptonite_argon2_ctx(&context, type); + + if (result != ARGON2_OK) { + clear_internal_memory(out, hashlen); + free(out); + return result; + } + + /* if raw hash requested, write it */ + if (hash) { + memcpy(hash, out, hashlen); + } + + clear_internal_memory(out, hashlen); + free(out); + + return ARGON2_OK; +} + diff --git a/bundled/cbits/argon2/argon2.h b/bundled/cbits/argon2/argon2.h new file mode 100644 index 0000000..a4a088f --- /dev/null +++ b/bundled/cbits/argon2/argon2.h @@ -0,0 +1,267 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#ifndef ARGON2_H +#define ARGON2_H + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Symbols visibility control */ +#ifdef A2_VISCTL +#define ARGON2_PUBLIC __attribute__((visibility("default"))) +#elif _MSC_VER +#define ARGON2_PUBLIC __declspec(dllexport) +#else +#define ARGON2_PUBLIC +#endif + +/* + * Argon2 input parameter restrictions + */ + +/* Minimum and maximum number of lanes (degree of parallelism) */ +#define ARGON2_MIN_LANES UINT32_C(1) +#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF) + +/* Minimum and maximum number of threads */ +#define ARGON2_MIN_THREADS UINT32_C(1) +#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF) + +/* Number of synchronization points between lanes per pass */ +#define ARGON2_SYNC_POINTS UINT32_C(4) + +/* Minimum and maximum digest size in bytes */ +#define ARGON2_MIN_OUTLEN UINT32_C(4) +#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */ +#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */ + +#define ARGON2_MIN(a, b) ((a) < (b) ? (a) : (b)) +/* Max memory size is addressing-space/2, topping at 2^32 blocks (4 TB) */ +#define ARGON2_MAX_MEMORY_BITS \ + ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1)) +#define ARGON2_MAX_MEMORY \ + ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS) + +/* Minimum and maximum number of passes */ +#define ARGON2_MIN_TIME UINT32_C(1) +#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum password length in bytes */ +#define ARGON2_MIN_PWD_LENGTH UINT32_C(0) +#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum associated data length in bytes */ +#define ARGON2_MIN_AD_LENGTH UINT32_C(0) +#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum salt length in bytes */ +#define ARGON2_MIN_SALT_LENGTH UINT32_C(8) +#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum key length in bytes */ +#define ARGON2_MIN_SECRET UINT32_C(0) +#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF) + +/* Flags to determine which fields are securely wiped (default = no wipe). */ +#define ARGON2_DEFAULT_FLAGS UINT32_C(0) +#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0) +#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1) + +/* Global flag to determine if we are wiping internal memory buffers. This flag + * is defined in core.c and deafults to 1 (wipe internal memory). */ +//extern int FLAG_clear_internal_memory; + +/* Error codes */ +typedef enum Argon2_ErrorCodes { + ARGON2_OK = 0, + + ARGON2_OUTPUT_PTR_NULL = -1, + + ARGON2_OUTPUT_TOO_SHORT = -2, + ARGON2_OUTPUT_TOO_LONG = -3, + + ARGON2_PWD_TOO_SHORT = -4, + ARGON2_PWD_TOO_LONG = -5, + + ARGON2_SALT_TOO_SHORT = -6, + ARGON2_SALT_TOO_LONG = -7, + + ARGON2_AD_TOO_SHORT = -8, + ARGON2_AD_TOO_LONG = -9, + + ARGON2_SECRET_TOO_SHORT = -10, + ARGON2_SECRET_TOO_LONG = -11, + + ARGON2_TIME_TOO_SMALL = -12, + ARGON2_TIME_TOO_LARGE = -13, + + ARGON2_MEMORY_TOO_LITTLE = -14, + ARGON2_MEMORY_TOO_MUCH = -15, + + ARGON2_LANES_TOO_FEW = -16, + ARGON2_LANES_TOO_MANY = -17, + + ARGON2_PWD_PTR_MISMATCH = -18, /* NULL ptr with non-zero length */ + ARGON2_SALT_PTR_MISMATCH = -19, /* NULL ptr with non-zero length */ + ARGON2_SECRET_PTR_MISMATCH = -20, /* NULL ptr with non-zero length */ + ARGON2_AD_PTR_MISMATCH = -21, /* NULL ptr with non-zero length */ + + ARGON2_MEMORY_ALLOCATION_ERROR = -22, + + ARGON2_FREE_MEMORY_CBK_NULL = -23, + ARGON2_ALLOCATE_MEMORY_CBK_NULL = -24, + + ARGON2_INCORRECT_PARAMETER = -25, + ARGON2_INCORRECT_TYPE = -26, + + ARGON2_OUT_PTR_MISMATCH = -27, + + ARGON2_THREADS_TOO_FEW = -28, + ARGON2_THREADS_TOO_MANY = -29, + + ARGON2_MISSING_ARGS = -30, + + ARGON2_ENCODING_FAIL = -31, + + ARGON2_DECODING_FAIL = -32, + + ARGON2_THREAD_FAIL = -33, + + ARGON2_DECODING_LENGTH_FAIL = -34, + + ARGON2_VERIFY_MISMATCH = -35 +} argon2_error_codes; + +/* Memory allocator types --- for external allocation */ +typedef int (*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate); +typedef void (*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate); + +/* Argon2 external data structures */ + +/* + ***** + * Context: structure to hold Argon2 inputs: + * output array and its length, + * password and its length, + * salt and its length, + * secret and its length, + * associated data and its length, + * number of passes, amount of used memory (in KBytes, can be rounded up a bit) + * number of parallel threads that will be run. + * All the parameters above affect the output hash value. + * Additionally, two function pointers can be provided to allocate and + * deallocate the memory (if NULL, memory will be allocated internally). + * Also, three flags indicate whether to erase password, secret as soon as they + * are pre-hashed (and thus not needed anymore), and the entire memory + ***** + * Simplest situation: you have output array out[8], password is stored in + * pwd[32], salt is stored in salt[16], you do not have keys nor associated + * data. You need to spend 1 GB of RAM and you run 5 passes of Argon2d with + * 4 parallel lanes. + * You want to erase the password, but you're OK with last pass not being + * erased. You want to use the default memory allocator. + * Then you initialize: + Argon2_Context(out,8,pwd,32,salt,16,NULL,0,NULL,0,5,1<<20,4,4,NULL,NULL,true,false,false,false) + */ +typedef struct Argon2_Context { + uint8_t *out; /* output array */ + uint32_t outlen; /* digest length */ + + uint8_t *pwd; /* password array */ + uint32_t pwdlen; /* password length */ + + uint8_t *salt; /* salt array */ + uint32_t saltlen; /* salt length */ + + uint8_t *secret; /* key array */ + uint32_t secretlen; /* key length */ + + uint8_t *ad; /* associated data array */ + uint32_t adlen; /* associated data length */ + + uint32_t t_cost; /* number of passes */ + uint32_t m_cost; /* amount of memory requested (KB) */ + uint32_t lanes; /* number of lanes */ + uint32_t threads; /* maximum number of threads */ + + uint32_t version; /* version number */ + + allocate_fptr allocate_cbk; /* pointer to memory allocator */ + deallocate_fptr free_cbk; /* pointer to memory deallocator */ + + uint32_t flags; /* array of bool options */ +} argon2_context; + +/* Argon2 primitive type */ +typedef enum Argon2_type { + Argon2_d = 0, + Argon2_i = 1, + Argon2_id = 2 +} argon2_type; + +/* Version of the algorithm */ +typedef enum Argon2_version { + ARGON2_VERSION_10 = 0x10, + ARGON2_VERSION_13 = 0x13, + ARGON2_VERSION_NUMBER = ARGON2_VERSION_13 +} argon2_version; + +/* + * Function that performs memory-hard hashing with certain degree of parallelism + * @param context Pointer to the Argon2 internal structure + * @return Error code if smth is wrong, ARGON2_OK otherwise + */ +ARGON2_PUBLIC int cryptonite_argon2_ctx(argon2_context *context, argon2_type type); + +/** + * Hashes a password with Argon2i, producing a raw hash by allocating memory at + * @hash + * @param t_cost Number of iterations + * @param m_cost Sets memory usage to m_cost kibibytes + * @param parallelism Number of threads and compute lanes + * @param pwd Pointer to password + * @param pwdlen Password size in bytes + * @param salt Pointer to salt + * @param saltlen Salt size in bytes + * @param hash Buffer where to write the raw hash - updated by the function + * @param hashlen Desired length of the hash in bytes + * @pre Different parallelism levels will give different results + * @pre Returns ARGON2_OK if successful + */ + +/* generic function underlying the above ones */ +ARGON2_PUBLIC int cryptonite_argon2_hash(const uint32_t t_cost, const uint32_t m_cost, + const uint32_t parallelism, const void *pwd, + const size_t pwdlen, const void *salt, + const size_t saltlen, void *hash, + const size_t hashlen, argon2_type type, + const uint32_t version); + + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/bundled/cbits/argon2/blamka-round-opt.h b/bundled/cbits/argon2/blamka-round-opt.h new file mode 100644 index 0000000..577e1d0 --- /dev/null +++ b/bundled/cbits/argon2/blamka-round-opt.h @@ -0,0 +1,180 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#ifndef BLAKE_ROUND_MKA_OPT_H +#define BLAKE_ROUND_MKA_OPT_H + +#include "blake2-impl.h" + +#include +#if defined(__SSSE3__) +#include /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */ +#endif + +#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__)) +#include +#endif + +#if !defined(__XOP__) +#if defined(__SSSE3__) +#define r16 \ + (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) +#define r24 \ + (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) \ + ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ + : (-(c) == 24) \ + ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) \ + ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) \ + ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ + _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ + _mm_slli_epi64((x), 64 - (-(c)))) +#else /* defined(__SSE2__) */ +#define _mm_roti_epi64(r, c) \ + _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c)))) +#endif +#else +#endif + +static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { + const __m128i z = _mm_mul_epu32(x, y); + return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); +} + +#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + A0 = fBlaMka(A0, B0); \ + A1 = fBlaMka(A1, B1); \ + \ + D0 = _mm_xor_si128(D0, A0); \ + D1 = _mm_xor_si128(D1, A1); \ + \ + D0 = _mm_roti_epi64(D0, -32); \ + D1 = _mm_roti_epi64(D1, -32); \ + \ + C0 = fBlaMka(C0, D0); \ + C1 = fBlaMka(C1, D1); \ + \ + B0 = _mm_xor_si128(B0, C0); \ + B1 = _mm_xor_si128(B1, C1); \ + \ + B0 = _mm_roti_epi64(B0, -24); \ + B1 = _mm_roti_epi64(B1, -24); \ + } while ((void)0, 0) + +#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + A0 = fBlaMka(A0, B0); \ + A1 = fBlaMka(A1, B1); \ + \ + D0 = _mm_xor_si128(D0, A0); \ + D1 = _mm_xor_si128(D1, A1); \ + \ + D0 = _mm_roti_epi64(D0, -16); \ + D1 = _mm_roti_epi64(D1, -16); \ + \ + C0 = fBlaMka(C0, D0); \ + C1 = fBlaMka(C1, D1); \ + \ + B0 = _mm_xor_si128(B0, C0); \ + B1 = _mm_xor_si128(B1, C1); \ + \ + B0 = _mm_roti_epi64(B0, -63); \ + B1 = _mm_roti_epi64(B1, -63); \ + } while ((void)0, 0) + +#if defined(__SSSE3__) +#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \ + __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \ + B0 = t0; \ + B1 = t1; \ + \ + t0 = C0; \ + C0 = C1; \ + C1 = t0; \ + \ + t0 = _mm_alignr_epi8(D1, D0, 8); \ + t1 = _mm_alignr_epi8(D0, D1, 8); \ + D0 = t1; \ + D1 = t0; \ + } while ((void)0, 0) + +#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \ + __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \ + B0 = t0; \ + B1 = t1; \ + \ + t0 = C0; \ + C0 = C1; \ + C1 = t0; \ + \ + t0 = _mm_alignr_epi8(D0, D1, 8); \ + t1 = _mm_alignr_epi8(D1, D0, 8); \ + D0 = t1; \ + D1 = t0; \ + } while ((void)0, 0) +#else /* SSE2 */ +#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + __m128i t0 = D0; \ + __m128i t1 = B0; \ + D0 = C0; \ + C0 = C1; \ + C1 = D0; \ + D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \ + D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \ + B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \ + B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \ + } while ((void)0, 0) + +#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ + do { \ + __m128i t0, t1; \ + t0 = C0; \ + C0 = C1; \ + C1 = t0; \ + t0 = B0; \ + t1 = D0; \ + B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \ + B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \ + D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \ + D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \ + } while ((void)0, 0) +#endif + +#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \ + do { \ + G1(A0, B0, C0, D0, A1, B1, C1, D1); \ + G2(A0, B0, C0, D0, A1, B1, C1, D1); \ + \ + DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ + \ + G1(A0, B0, C0, D0, A1, B1, C1, D1); \ + G2(A0, B0, C0, D0, A1, B1, C1, D1); \ + \ + UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ + } while ((void)0, 0) + +#endif diff --git a/bundled/cbits/argon2/blamka-round-ref.h b/bundled/cbits/argon2/blamka-round-ref.h new file mode 100644 index 0000000..db9864d --- /dev/null +++ b/bundled/cbits/argon2/blamka-round-ref.h @@ -0,0 +1,56 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#ifndef BLAKE_ROUND_MKA_H +#define BLAKE_ROUND_MKA_H + +#include "blake2.h" +#include "blake2-impl.h" + +/*designed by the Lyra PHC team */ +static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) { + const uint64_t m = UINT64_C(0xFFFFFFFF); + const uint64_t xy = (x & m) * (y & m); + return x + y + 2 * xy; +} + +#define G(a, b, c, d) \ + do { \ + a = fBlaMka(a, b); \ + d = rotr64(d ^ a, 32); \ + c = fBlaMka(c, d); \ + b = rotr64(b ^ c, 24); \ + a = fBlaMka(a, b); \ + d = rotr64(d ^ a, 16); \ + c = fBlaMka(c, d); \ + b = rotr64(b ^ c, 63); \ + } while ((void)0, 0) + +#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \ + v12, v13, v14, v15) \ + do { \ + G(v0, v4, v8, v12); \ + G(v1, v5, v9, v13); \ + G(v2, v6, v10, v14); \ + G(v3, v7, v11, v15); \ + G(v0, v5, v10, v15); \ + G(v1, v6, v11, v12); \ + G(v2, v7, v8, v13); \ + G(v3, v4, v9, v14); \ + } while ((void)0, 0) + +#endif diff --git a/bundled/cbits/argon2/core.c b/bundled/cbits/argon2/core.c new file mode 100644 index 0000000..cc22a74 --- /dev/null +++ b/bundled/cbits/argon2/core.c @@ -0,0 +1,701 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +/*For memory wiping*/ +#ifdef _MSC_VER +#include +#include /* For SecureZeroMemory */ +#endif +#if defined __STDC_LIB_EXT1__ +#define __STDC_WANT_LIB_EXT1__ 1 +#endif +#define VC_GE_2005(version) (version >= 1400) + +#include +#include +#include +#include + +#include "core.h" +#include "thread.c" +#include "blake2.h" +#include "blake2-impl.h" + +#ifdef GENKAT +#include "genkat.h" +#endif + +#ifdef SUPPORT_SSE +#include "opt.c" +#else +#include "ref.c" +#endif + +#if defined(__clang__) +#if __has_attribute(optnone) +#define NOT_OPTIMIZED __attribute__((optnone)) +#endif +#elif defined(__GNUC__) +#define GCC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#if GCC_VERSION >= 40400 +#define NOT_OPTIMIZED __attribute__((optimize("O0"))) +#endif +#endif +#ifndef NOT_OPTIMIZED +#define NOT_OPTIMIZED +#endif + +/* Argon2 Team - Begin Code */ +static int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) { + uint8_t *out = (uint8_t *)pout; + blake2b_state blake_state; + uint8_t outlen_bytes[sizeof(uint32_t)] = {0}; + int ret = -1; + + if (outlen > UINT32_MAX) { + goto fail; + } + + /* Ensure little-endian byte order! */ + store32(outlen_bytes, (uint32_t)outlen); + +#define TRY(statement) \ + do { \ + ret = statement; \ + if (ret < 0) { \ + goto fail; \ + } \ + } while ((void)0, 0) + + if (outlen <= BLAKE2B_OUTBYTES) { + TRY(_cryptonite_blake2b_init(&blake_state, outlen)); + TRY(_cryptonite_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); + TRY(_cryptonite_blake2b_update(&blake_state, in, inlen)); + TRY(_cryptonite_blake2b_final(&blake_state, out, outlen)); + } else { + uint32_t toproduce; + uint8_t out_buffer[BLAKE2B_OUTBYTES]; + uint8_t in_buffer[BLAKE2B_OUTBYTES]; + TRY(_cryptonite_blake2b_init(&blake_state, BLAKE2B_OUTBYTES)); + TRY(_cryptonite_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); + TRY(_cryptonite_blake2b_update(&blake_state, in, inlen)); + TRY(_cryptonite_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES)); + memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); + out += BLAKE2B_OUTBYTES / 2; + toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2; + + while (toproduce > BLAKE2B_OUTBYTES) { + memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); + TRY(_cryptonite_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer, + BLAKE2B_OUTBYTES, NULL, 0)); + memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); + out += BLAKE2B_OUTBYTES / 2; + toproduce -= BLAKE2B_OUTBYTES / 2; + } + + memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); + TRY(_cryptonite_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL, + 0)); + memcpy(out, out_buffer, toproduce); + } +fail: + clear_internal_memory(&blake_state, sizeof(blake_state)); + return ret; +#undef TRY +} +/* Argon2 Team - End Code */ + +/***************Instance and Position constructors**********/ +static void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); } + +static void copy_block(block *dst, const block *src) { + memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK); +} + +static void xor_block(block *dst, const block *src) { + int i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + dst->v[i] ^= src->v[i]; + } +} + +static void load_block(block *dst, const void *input) { + unsigned i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i])); + } +} + +static void store_block(void *output, const block *src) { + unsigned i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]); + } +} + +/***************Memory functions*****************/ +static +int allocate_memory(const argon2_context *context, uint8_t **memory, + size_t num, size_t size) { + size_t memory_size = num*size; + if (memory == NULL) { + return ARGON2_MEMORY_ALLOCATION_ERROR; + } + + /* 1. Check for multiplication overflow */ + if (size != 0 && memory_size / size != num) { + return ARGON2_MEMORY_ALLOCATION_ERROR; + } + + /* 2. Try to allocate with appropriate allocator */ + if (context->allocate_cbk) { + (context->allocate_cbk)(memory, memory_size); + } else { + *memory = malloc(memory_size); + } + + if (*memory == NULL) { + return ARGON2_MEMORY_ALLOCATION_ERROR; + } + + return ARGON2_OK; +} +static +void free_memory(const argon2_context *context, uint8_t *memory, + size_t num, size_t size) { + size_t memory_size = num*size; + clear_internal_memory(memory, memory_size); + if (context->free_cbk) { + (context->free_cbk)(memory, memory_size); + } else { + free(memory); + } +} + +void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) { +#if defined(_MSC_VER) && VC_GE_2005(_MSC_VER) + SecureZeroMemory(v, n); +#elif defined memset_s + memset_s(v, n, 0, n); +#elif defined(__OpenBSD__) + explicit_bzero(v, n); +#else + static void *(*const volatile memset_sec)(void *, int, size_t) = &memset; + memset_sec(v, 0, n); +#endif +} + +/* Memory clear flag defaults to true. */ +static int FLAG_clear_internal_memory = 1; +static void clear_internal_memory(void *v, size_t n) { + if (FLAG_clear_internal_memory && v) { + secure_wipe_memory(v, n); + } +} + +static void finalize(const argon2_context *context, argon2_instance_t *instance) { + if (context != NULL && instance != NULL) { + block blockhash; + uint32_t l; + + copy_block(&blockhash, instance->memory + instance->lane_length - 1); + + /* XOR the last blocks */ + for (l = 1; l < instance->lanes; ++l) { + uint32_t last_block_in_lane = + l * instance->lane_length + (instance->lane_length - 1); + xor_block(&blockhash, instance->memory + last_block_in_lane); + } + + /* Hash the result */ + { + uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; + store_block(blockhash_bytes, &blockhash); + blake2b_long(context->out, context->outlen, blockhash_bytes, + ARGON2_BLOCK_SIZE); + /* clear blockhash and blockhash_bytes */ + clear_internal_memory(blockhash.v, ARGON2_BLOCK_SIZE); + clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); + } + +#ifdef GENKAT + print_tag(context->out, context->outlen); +#endif + + free_memory(context, (uint8_t *)instance->memory, + instance->memory_blocks, sizeof(block)); + } +} + +static uint32_t index_alpha(const argon2_instance_t *instance, + const argon2_position_t *position, uint32_t pseudo_rand, + int same_lane) { + /* + * Pass 0: + * This lane : all already finished segments plus already constructed + * blocks in this segment + * Other lanes : all already finished segments + * Pass 1+: + * This lane : (SYNC_POINTS - 1) last segments plus already constructed + * blocks in this segment + * Other lanes : (SYNC_POINTS - 1) last segments + */ + uint32_t reference_area_size; + uint64_t relative_position; + uint32_t start_position, absolute_position; + + if (0 == position->pass) { + /* First pass */ + if (0 == position->slice) { + /* First slice */ + reference_area_size = + position->index - 1; /* all but the previous */ + } else { + if (same_lane) { + /* The same lane => add current segment */ + reference_area_size = + position->slice * instance->segment_length + + position->index - 1; + } else { + reference_area_size = + position->slice * instance->segment_length + + ((position->index == 0) ? (-1) : 0); + } + } + } else { + /* Second pass */ + if (same_lane) { + reference_area_size = instance->lane_length - + instance->segment_length + position->index - + 1; + } else { + reference_area_size = instance->lane_length - + instance->segment_length + + ((position->index == 0) ? (-1) : 0); + } + } + + /* 1.2.4. Mapping pseudo_rand to 0.. and produce + * relative position */ + relative_position = pseudo_rand; + relative_position = relative_position * relative_position >> 32; + relative_position = reference_area_size - 1 - + (reference_area_size * relative_position >> 32); + + /* 1.2.5 Computing starting position */ + start_position = 0; + + if (0 != position->pass) { + start_position = (position->slice == ARGON2_SYNC_POINTS - 1) + ? 0 + : (position->slice + 1) * instance->segment_length; + } + + /* 1.2.6. Computing absolute position */ + absolute_position = (start_position + relative_position) % + instance->lane_length; /* absolute position */ + return absolute_position; +} + +/* Single-threaded version for p=1 case */ +static int fill_memory_blocks_st(argon2_instance_t *instance) { + uint32_t r, s, l; + + for (r = 0; r < instance->passes; ++r) { + for (s = 0; s < ARGON2_SYNC_POINTS; ++s) { + for (l = 0; l < instance->lanes; ++l) { + argon2_position_t position = {r, l, (uint8_t)s, 0}; + fill_segment(instance, position); + } + } +#ifdef GENKAT + internal_kat(instance, r); /* Print all memory blocks */ +#endif + } + return ARGON2_OK; +} + +#if !defined(ARGON2_NO_THREADS) + +#ifdef _WIN32 +static unsigned __stdcall fill_segment_thr(void *thread_data) +#else +static void *fill_segment_thr(void *thread_data) +#endif +{ + argon2_thread_data *my_data = thread_data; + fill_segment(my_data->instance_ptr, my_data->pos); + argon2_thread_exit(); + return 0; +} + +/* Multi-threaded version for p > 1 case */ +static int fill_memory_blocks_mt(argon2_instance_t *instance) { + uint32_t r, s; + argon2_thread_handle_t *thread = NULL; + argon2_thread_data *thr_data = NULL; + int rc = ARGON2_OK; + + /* 1. Allocating space for threads */ + thread = calloc(instance->lanes, sizeof(argon2_thread_handle_t)); + if (thread == NULL) { + rc = ARGON2_MEMORY_ALLOCATION_ERROR; + goto fail; + } + + thr_data = calloc(instance->lanes, sizeof(argon2_thread_data)); + if (thr_data == NULL) { + rc = ARGON2_MEMORY_ALLOCATION_ERROR; + goto fail; + } + + for (r = 0; r < instance->passes; ++r) { + for (s = 0; s < ARGON2_SYNC_POINTS; ++s) { + uint32_t l; + + /* 2. Calling threads */ + for (l = 0; l < instance->lanes; ++l) { + argon2_position_t position; + + /* 2.1 Join a thread if limit is exceeded */ + if (l >= instance->threads) { + if (argon2_thread_join(thread[l - instance->threads])) { + rc = ARGON2_THREAD_FAIL; + goto fail; + } + } + + /* 2.2 Create thread */ + position.pass = r; + position.lane = l; + position.slice = (uint8_t)s; + position.index = 0; + thr_data[l].instance_ptr = + instance; /* preparing the thread input */ + memcpy(&(thr_data[l].pos), &position, + sizeof(argon2_position_t)); + if (argon2_thread_create(&thread[l], &fill_segment_thr, + (void *)&thr_data[l])) { + rc = ARGON2_THREAD_FAIL; + goto fail; + } + + /* fill_segment(instance, position); */ + /*Non-thread equivalent of the lines above */ + } + + /* 3. Joining remaining threads */ + for (l = instance->lanes - instance->threads; l < instance->lanes; + ++l) { + if (argon2_thread_join(thread[l])) { + rc = ARGON2_THREAD_FAIL; + goto fail; + } + } + } + +#ifdef GENKAT + internal_kat(instance, r); /* Print all memory blocks */ +#endif + } + +fail: + if (thread != NULL) { + free(thread); + } + if (thr_data != NULL) { + free(thr_data); + } + return rc; +} + +#endif /* ARGON2_NO_THREADS */ +static +int fill_memory_blocks(argon2_instance_t *instance) { + if (instance == NULL || instance->lanes == 0) { + return ARGON2_INCORRECT_PARAMETER; + } +#if defined(ARGON2_NO_THREADS) + return fill_memory_blocks_st(instance); +#else + return instance->threads == 1 ? + fill_memory_blocks_st(instance) : fill_memory_blocks_mt(instance); +#endif +} +static +int validate_inputs(const argon2_context *context) { + if (NULL == context) { + return ARGON2_INCORRECT_PARAMETER; + } + + if (NULL == context->out) { + return ARGON2_OUTPUT_PTR_NULL; + } + + /* Validate output length */ + if (ARGON2_MIN_OUTLEN > context->outlen) { + return ARGON2_OUTPUT_TOO_SHORT; + } + + if (ARGON2_MAX_OUTLEN < context->outlen) { + return ARGON2_OUTPUT_TOO_LONG; + } + + /* Validate password (required param) */ + if (NULL == context->pwd) { + if (0 != context->pwdlen) { + return ARGON2_PWD_PTR_MISMATCH; + } + } + + if (ARGON2_MIN_PWD_LENGTH > context->pwdlen) { + return ARGON2_PWD_TOO_SHORT; + } + + if (ARGON2_MAX_PWD_LENGTH < context->pwdlen) { + return ARGON2_PWD_TOO_LONG; + } + + /* Validate salt (required param) */ + if (NULL == context->salt) { + if (0 != context->saltlen) { + return ARGON2_SALT_PTR_MISMATCH; + } + } + + if (ARGON2_MIN_SALT_LENGTH > context->saltlen) { + return ARGON2_SALT_TOO_SHORT; + } + + if (ARGON2_MAX_SALT_LENGTH < context->saltlen) { + return ARGON2_SALT_TOO_LONG; + } + + /* Validate secret (optional param) */ + if (NULL == context->secret) { + if (0 != context->secretlen) { + return ARGON2_SECRET_PTR_MISMATCH; + } + } else { + if (ARGON2_MIN_SECRET > context->secretlen) { + return ARGON2_SECRET_TOO_SHORT; + } + if (ARGON2_MAX_SECRET < context->secretlen) { + return ARGON2_SECRET_TOO_LONG; + } + } + + /* Validate associated data (optional param) */ + if (NULL == context->ad) { + if (0 != context->adlen) { + return ARGON2_AD_PTR_MISMATCH; + } + } else { + if (ARGON2_MIN_AD_LENGTH > context->adlen) { + return ARGON2_AD_TOO_SHORT; + } + if (ARGON2_MAX_AD_LENGTH < context->adlen) { + return ARGON2_AD_TOO_LONG; + } + } + + /* Validate memory cost */ + if (ARGON2_MIN_MEMORY > context->m_cost) { + return ARGON2_MEMORY_TOO_LITTLE; + } + + if (ARGON2_MAX_MEMORY < context->m_cost) { + return ARGON2_MEMORY_TOO_MUCH; + } + + if (context->m_cost < 8 * context->lanes) { + return ARGON2_MEMORY_TOO_LITTLE; + } + + /* Validate time cost */ + if (ARGON2_MIN_TIME > context->t_cost) { + return ARGON2_TIME_TOO_SMALL; + } + + if (ARGON2_MAX_TIME < context->t_cost) { + return ARGON2_TIME_TOO_LARGE; + } + + /* Validate lanes */ + if (ARGON2_MIN_LANES > context->lanes) { + return ARGON2_LANES_TOO_FEW; + } + + if (ARGON2_MAX_LANES < context->lanes) { + return ARGON2_LANES_TOO_MANY; + } + + /* Validate threads */ + if (ARGON2_MIN_THREADS > context->threads) { + return ARGON2_THREADS_TOO_FEW; + } + + if (ARGON2_MAX_THREADS < context->threads) { + return ARGON2_THREADS_TOO_MANY; + } + + if (NULL != context->allocate_cbk && NULL == context->free_cbk) { + return ARGON2_FREE_MEMORY_CBK_NULL; + } + + if (NULL == context->allocate_cbk && NULL != context->free_cbk) { + return ARGON2_ALLOCATE_MEMORY_CBK_NULL; + } + + return ARGON2_OK; +} +static +void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) { + uint32_t l; + /* Make the first and second block in each lane as G(H0||i||0) or + G(H0||i||1) */ + uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; + for (l = 0; l < instance->lanes; ++l) { + + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0); + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l); + blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash, + ARGON2_PREHASH_SEED_LENGTH); + load_block(&instance->memory[l * instance->lane_length + 0], + blockhash_bytes); + + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1); + blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash, + ARGON2_PREHASH_SEED_LENGTH); + load_block(&instance->memory[l * instance->lane_length + 1], + blockhash_bytes); + } + clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); +} +static +void initial_hash(uint8_t *blockhash, argon2_context *context, + argon2_type type) { + blake2b_state BlakeHash; + uint8_t value[sizeof(uint32_t)]; + + if (NULL == context || NULL == blockhash) { + return; + } + + _cryptonite_blake2b_init(&BlakeHash, ARGON2_PREHASH_DIGEST_LENGTH); + + store32(&value, context->lanes); + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->outlen); + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->m_cost); + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->t_cost); + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->version); + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, (uint32_t)type); + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->pwdlen); + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->pwd != NULL) { + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)context->pwd, + context->pwdlen); + + if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) { + secure_wipe_memory(context->pwd, context->pwdlen); + context->pwdlen = 0; + } + } + + store32(&value, context->saltlen); + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->salt != NULL) { + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)context->salt, + context->saltlen); + } + + store32(&value, context->secretlen); + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->secret != NULL) { + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)context->secret, + context->secretlen); + + if (context->flags & ARGON2_FLAG_CLEAR_SECRET) { + secure_wipe_memory(context->secret, context->secretlen); + context->secretlen = 0; + } + } + + store32(&value, context->adlen); + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->ad != NULL) { + _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)context->ad, + context->adlen); + } + + _cryptonite_blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); +} +static +int initialize(argon2_instance_t *instance, argon2_context *context) { + uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; + int result = ARGON2_OK; + + if (instance == NULL || context == NULL) + return ARGON2_INCORRECT_PARAMETER; + instance->context_ptr = context; + + /* 1. Memory allocation */ + result = allocate_memory(context, (uint8_t **)&(instance->memory), + instance->memory_blocks, sizeof(block)); + if (result != ARGON2_OK) { + return result; + } + + /* 2. Initial hashing */ + /* H_0 + 8 extra bytes to produce the first blocks */ + /* uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; */ + /* Hashing all inputs */ + initial_hash(blockhash, context, instance->type); + /* Zeroing 8 extra bytes */ + clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, + ARGON2_PREHASH_SEED_LENGTH - + ARGON2_PREHASH_DIGEST_LENGTH); + +#ifdef GENKAT + initial_kat(blockhash, context, instance->type); +#endif + + /* 3. Creating first blocks, we always have at least two blocks in a slice + */ + fill_first_blocks(blockhash, instance); + /* Clearing the hash */ + clear_internal_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH); + + return ARGON2_OK; +} diff --git a/bundled/cbits/argon2/core.h b/bundled/cbits/argon2/core.h new file mode 100644 index 0000000..888cc3c --- /dev/null +++ b/bundled/cbits/argon2/core.h @@ -0,0 +1,234 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#ifndef ARGON2_CORE_H +#define ARGON2_CORE_H + +#include "argon2.h" + +#if defined(_MSC_VER) +#define ALIGN(n) __declspec(align(16)) +#elif defined(__GNUC__) || defined(__clang) +#define ALIGN(x) __attribute__((__aligned__(x))) +#else +#define ALIGN(x) +#endif + +#define CONST_CAST(x) (x)(uintptr_t) + +/**********************Argon2 internal constants*******************************/ + +enum argon2_core_constants { + /* Memory block size in bytes */ + ARGON2_BLOCK_SIZE = 1024, + ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8, + ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16, + + /* Number of pseudo-random values generated by one call to Blake in Argon2i + to + generate reference block positions */ + ARGON2_ADDRESSES_IN_BLOCK = 128, + + /* Pre-hashing digest length and its extension*/ + ARGON2_PREHASH_DIGEST_LENGTH = 64, + ARGON2_PREHASH_SEED_LENGTH = 72 +}; + +/*************************Argon2 internal data types***********************/ + +/* + * Structure for the (1KB) memory block implemented as 128 64-bit words. + * Memory blocks can be copied, XORed. Internal words can be accessed by [] (no + * bounds checking). + */ +typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block; + +/*****************Functions that work with the block******************/ + +/* Initialize each byte of the block with @in */ +static void init_block_value(block *b, uint8_t in); + +/* Copy block @src to block @dst */ +static void copy_block(block *dst, const block *src); + +/* XOR @src onto @dst bytewise */ +static void xor_block(block *dst, const block *src); + +/* + * Argon2 instance: memory pointer, number of passes, amount of memory, type, + * and derived values. + * Used to evaluate the number and location of blocks to construct in each + * thread + */ +typedef struct Argon2_instance_t { + block *memory; /* Memory pointer */ + uint32_t version; + uint32_t passes; /* Number of passes */ + uint32_t memory_blocks; /* Number of blocks in memory */ + uint32_t segment_length; + uint32_t lane_length; + uint32_t lanes; + uint32_t threads; + argon2_type type; + int print_internals; /* whether to print the memory blocks */ + argon2_context *context_ptr; /* points back to original context */ +} argon2_instance_t; + +/* + * Argon2 position: where we construct the block right now. Used to distribute + * work between threads. + */ +typedef struct Argon2_position_t { + uint32_t pass; + uint32_t lane; + uint8_t slice; + uint32_t index; +} argon2_position_t; + +/*Struct that holds the inputs for thread handling FillSegment*/ +typedef struct Argon2_thread_data { + argon2_instance_t *instance_ptr; + argon2_position_t pos; +} argon2_thread_data; + +/*************************Argon2 core functions********************************/ + +/* Allocates memory to the given pointer, uses the appropriate allocator as + * specified in the context. Total allocated memory is num*size. + * @param context argon2_context which specifies the allocator + * @param memory pointer to the pointer to the memory + * @param size the size in bytes for each element to be allocated + * @param num the number of elements to be allocated + * @return ARGON2_OK if @memory is a valid pointer and memory is allocated + */ +static int allocate_memory(const argon2_context *context, uint8_t **memory, + size_t num, size_t size); + +/* + * Frees memory at the given pointer, uses the appropriate deallocator as + * specified in the context. Also cleans the memory using clear_internal_memory. + * @param context argon2_context which specifies the deallocator + * @param memory pointer to buffer to be freed + * @param size the size in bytes for each element to be deallocated + * @param num the number of elements to be deallocated + */ +static void free_memory(const argon2_context *context, uint8_t *memory, + size_t num, size_t size); + +/* Function that securely cleans the memory. This ignores any flags set + * regarding clearing memory. Usually one just calls clear_internal_memory. + * @param mem Pointer to the memory + * @param s Memory size in bytes + */ +static void secure_wipe_memory(void *v, size_t n); + +/* Function that securely clears the memory if FLAG_clear_internal_memory is + * set. If the flag isn't set, this function does nothing. + * @param mem Pointer to the memory + * @param s Memory size in bytes + */ +static void clear_internal_memory(void *v, size_t n); + +/* + * Computes absolute position of reference block in the lane following a skewed + * distribution and using a pseudo-random value as input + * @param instance Pointer to the current instance + * @param position Pointer to the current position + * @param pseudo_rand 32-bit pseudo-random value used to determine the position + * @param same_lane Indicates if the block will be taken from the current lane. + * If so we can reference the current segment + * @pre All pointers must be valid + */ +static uint32_t index_alpha(const argon2_instance_t *instance, + const argon2_position_t *position, uint32_t pseudo_rand, + int same_lane); + +/* + * Function that validates all inputs against predefined restrictions and return + * an error code + * @param context Pointer to current Argon2 context + * @return ARGON2_OK if everything is all right, otherwise one of error codes + * (all defined in + */ +static int validate_inputs(const argon2_context *context); + +/* + * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears + * password and secret if needed + * @param context Pointer to the Argon2 internal structure containing memory + * pointer, and parameters for time and space requirements. + * @param blockhash Buffer for pre-hashing digest + * @param type Argon2 type + * @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes + * allocated + */ +static void initial_hash(uint8_t *blockhash, argon2_context *context, + argon2_type type); + +/* + * Function creates first 2 blocks per lane + * @param instance Pointer to the current instance + * @param blockhash Pointer to the pre-hashing digest + * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values + */ +static void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance); + +/* + * Function allocates memory, hashes the inputs with Blake, and creates first + * two blocks. Returns the pointer to the main memory with 2 blocks per lane + * initialized + * @param context Pointer to the Argon2 internal structure containing memory + * pointer, and parameters for time and space requirements. + * @param instance Current Argon2 instance + * @return Zero if successful, -1 if memory failed to allocate. @context->state + * will be modified if successful. + */ +static int initialize(argon2_instance_t *instance, argon2_context *context); + +/* + * XORing the last block of each lane, hashing it, making the tag. Deallocates + * the memory. + * @param context Pointer to current Argon2 context (use only the out parameters + * from it) + * @param instance Pointer to current instance of Argon2 + * @pre instance->state must point to necessary amount of memory + * @pre context->out must point to outlen bytes of memory + * @pre if context->free_cbk is not NULL, it should point to a function that + * deallocates memory + */ +static void finalize(const argon2_context *context, argon2_instance_t *instance); + +/* + * Function that fills the segment using previous segments also from other + * threads + * @param context current context + * @param instance Pointer to the current instance + * @param position Current position + * @pre all block pointers must be valid + */ +static void fill_segment(const argon2_instance_t *instance, + argon2_position_t position); + +/* + * Function that fills the entire memory t_cost times based on the first two + * blocks in each lane + * @param instance Pointer to the current instance + * @return ARGON2_OK if successful, @context->state + */ +static int fill_memory_blocks(argon2_instance_t *instance); + +#endif diff --git a/bundled/cbits/argon2/opt.c b/bundled/cbits/argon2/opt.c new file mode 100644 index 0000000..0102bdc --- /dev/null +++ b/bundled/cbits/argon2/opt.c @@ -0,0 +1,186 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#include +#include +#include + +#include "argon2.h" +#include "opt.h" + +#include "blake2/blake2.h" +#include "blake2/blamka-round-opt.h" + +static void fill_block(__m128i *state, const block *ref_block, block *next_block, + int with_xor) { + __m128i block_XY[ARGON2_OWORDS_IN_BLOCK]; + unsigned int i; + + if (with_xor) { + for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { + state[i] = _mm_xor_si128( + state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i)); + block_XY[i] = _mm_xor_si128( + state[i], _mm_loadu_si128((const __m128i *)next_block->v + i)); + } + } else { + for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { + block_XY[i] = state[i] = _mm_xor_si128( + state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i)); + } + } + + for (i = 0; i < 8; ++i) { + BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], + state[8 * i + 3], state[8 * i + 4], state[8 * i + 5], + state[8 * i + 6], state[8 * i + 7]); + } + + for (i = 0; i < 8; ++i) { + BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], + state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i], + state[8 * 6 + i], state[8 * 7 + i]); + } + + for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { + state[i] = _mm_xor_si128(state[i], block_XY[i]); + _mm_storeu_si128((__m128i *)next_block->v + i, state[i]); + } +} + +static void next_addresses(block *address_block, block *input_block) { + /*Temporary zero-initialized blocks*/ + __m128i zero_block[ARGON2_OWORDS_IN_BLOCK]; + __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK]; + + memset(zero_block, 0, sizeof(zero_block)); + memset(zero2_block, 0, sizeof(zero2_block)); + + /*Increasing index counter*/ + input_block->v[6]++; + + /*First iteration of G*/ + fill_block(zero_block, input_block, address_block, 0); + + /*Second iteration of G*/ + fill_block(zero2_block, address_block, address_block, 0); +} + +static void fill_segment(const argon2_instance_t *instance, + argon2_position_t position) { + block *ref_block = NULL, *curr_block = NULL; + block address_block, input_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index, i; + __m128i state[64]; + int data_independent_addressing; + + if (instance == NULL) { + return; + } + + data_independent_addressing = + (instance->type == Argon2_i) || + (instance->type == Argon2_id && (position.pass == 0) && + (position.slice < ARGON2_SYNC_POINTS / 2)); + + if (data_independent_addressing) { + init_block_value(&input_block, 0); + + input_block.v[0] = position.pass; + input_block.v[1] = position.lane; + input_block.v[2] = position.slice; + input_block.v[3] = instance->memory_blocks; + input_block.v[4] = instance->passes; + input_block.v[5] = instance->type; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* we have already generated the first two blocks */ + + /* Don't forget to generate the first block of addresses: */ + if (data_independent_addressing) { + next_addresses(&address_block, &input_block); + } + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + + position.slice * instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); + + for (i = starting_index; i < instance->segment_length; + ++i, ++curr_offset, ++prev_offset) { + /*1.1 Rotating prev_offset if needed */ + if (curr_offset % instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + if (data_independent_addressing) { + if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { + next_addresses(&address_block, &input_block); + } + pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; + } else { + pseudo_rand = instance->memory[prev_offset].v[0]; + } + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % instance->lanes; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(state, ref_block, curr_block, 0); + } else { + if(0 == position.pass) { + fill_block(state, ref_block, curr_block, 0); + } else { + fill_block(state, ref_block, curr_block, 1); + } + } + } +} diff --git a/bundled/cbits/argon2/opt.h b/bundled/cbits/argon2/opt.h new file mode 100644 index 0000000..825bbfc --- /dev/null +++ b/bundled/cbits/argon2/opt.h @@ -0,0 +1,35 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#ifndef ARGON2_OPT_H +#define ARGON2_OPT_H + +#include "core.h" +#include + +/* + * Function fills a new memory block and optionally XORs the old block over the new one. + * Memory must be initialized. + * @param state Pointer to the just produced block. Content will be updated(!) + * @param ref_block Pointer to the reference block + * @param next_block Pointer to the block to be XORed over. May coincide with @ref_block + * @param with_xor Whether to XOR into the new block (1) or just overwrite (0) + * @pre all block pointers must be valid + */ +static void fill_block(__m128i *s, const block *ref_block, block *next_block, int with_xor); + +#endif /* ARGON2_OPT_H */ diff --git a/bundled/cbits/argon2/ref.c b/bundled/cbits/argon2/ref.c new file mode 100644 index 0000000..f30e294 --- /dev/null +++ b/bundled/cbits/argon2/ref.c @@ -0,0 +1,185 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#include +#include +#include + +#include "argon2.h" +#include "ref.h" + +#include "blamka-round-ref.h" +#include "blake2-impl.h" +#include "blake2.h" + + +static void fill_block(const block *prev_block, const block *ref_block, + block *next_block, int with_xor) { + block blockR, block_tmp; + unsigned i; + + copy_block(&blockR, ref_block); + xor_block(&blockR, prev_block); + copy_block(&block_tmp, &blockR); + /* Now blockR = ref_block + prev_block and block_tmp = ref_block + prev_block */ + if (with_xor) { + /* Saving the next block contents for XOR over: */ + xor_block(&block_tmp, next_block); + /* Now blockR = ref_block + prev_block and + block_tmp = ref_block + prev_block + next_block */ + } + + /* Apply Blake2 on columns of 64-bit words: (0,1,...,15) , then + (16,17,..31)... finally (112,113,...127) */ + for (i = 0; i < 8; ++i) { + BLAKE2_ROUND_NOMSG( + blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2], + blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5], + blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8], + blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11], + blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14], + blockR.v[16 * i + 15]); + } + + /* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...112,113), then + (2,3,18,19,...,114,115).. finally (14,15,30,31,...,126,127) */ + for (i = 0; i < 8; i++) { + BLAKE2_ROUND_NOMSG( + blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16], + blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33], + blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64], + blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81], + blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112], + blockR.v[2 * i + 113]); + } + + copy_block(next_block, &block_tmp); + xor_block(next_block, &blockR); +} + +static void next_addresses(block *address_block, block *input_block, + const block *zero_block) { + input_block->v[6]++; + fill_block(zero_block, input_block, address_block, 0); + fill_block(zero_block, address_block, address_block, 0); +} + +static void fill_segment(const argon2_instance_t *instance, + argon2_position_t position) { + block *ref_block = NULL, *curr_block = NULL; + block address_block, input_block, zero_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index; + uint32_t i; + int data_independent_addressing; + + if (instance == NULL) { + return; + } + + data_independent_addressing = + (instance->type == Argon2_i) || + (instance->type == Argon2_id && (position.pass == 0) && + (position.slice < ARGON2_SYNC_POINTS / 2)); + + if (data_independent_addressing) { + init_block_value(&zero_block, 0); + init_block_value(&input_block, 0); + + input_block.v[0] = position.pass; + input_block.v[1] = position.lane; + input_block.v[2] = position.slice; + input_block.v[3] = instance->memory_blocks; + input_block.v[4] = instance->passes; + input_block.v[5] = instance->type; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* we have already generated the first two blocks */ + + /* Don't forget to generate the first block of addresses: */ + if (data_independent_addressing) { + next_addresses(&address_block, &input_block, &zero_block); + } + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + + position.slice * instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + for (i = starting_index; i < instance->segment_length; + ++i, ++curr_offset, ++prev_offset) { + /*1.1 Rotating prev_offset if needed */ + if (curr_offset % instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + if (data_independent_addressing) { + if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { + next_addresses(&address_block, &input_block, &zero_block); + } + pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; + } else { + pseudo_rand = instance->memory[prev_offset].v[0]; + } + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % instance->lanes; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(instance->memory + prev_offset, ref_block, curr_block, 0); + } else { + if(0 == position.pass) { + fill_block(instance->memory + prev_offset, ref_block, + curr_block, 0); + } else { + fill_block(instance->memory + prev_offset, ref_block, + curr_block, 1); + } + } + } +} diff --git a/bundled/cbits/argon2/ref.h b/bundled/cbits/argon2/ref.h new file mode 100644 index 0000000..676b454 --- /dev/null +++ b/bundled/cbits/argon2/ref.h @@ -0,0 +1,35 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#ifndef ARGON2_REF_H +#define ARGON2_REF_H + +#include "core.h" + +/* + * Function fills a new memory block and optionally XORs the old block over the new one. + * @next_block must be initialized. + * @param prev_block Pointer to the previous block + * @param ref_block Pointer to the reference block + * @param next_block Pointer to the block to be constructed + * @param with_xor Whether to XOR into the new block (1) or just overwrite (0) + * @pre all block pointers must be valid + */ +static void fill_block(const block *prev_block, const block *ref_block, + block *next_block, int with_xor); + +#endif /* ARGON2_REF_H */ diff --git a/bundled/cbits/argon2/thread.c b/bundled/cbits/argon2/thread.c new file mode 100644 index 0000000..1686481 --- /dev/null +++ b/bundled/cbits/argon2/thread.c @@ -0,0 +1,57 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#if !defined(ARGON2_NO_THREADS) + +#include "thread.h" +#if defined(_WIN32) +#include +#endif + +static int argon2_thread_create(argon2_thread_handle_t *handle, + argon2_thread_func_t func, void *args) { + if (NULL == handle || func == NULL) { + return -1; + } +#if defined(_WIN32) + *handle = _beginthreadex(NULL, 0, func, args, 0, NULL); + return *handle != 0 ? 0 : -1; +#else + return pthread_create(handle, NULL, func, args); +#endif +} + +static int argon2_thread_join(argon2_thread_handle_t handle) { +#if defined(_WIN32) + if (WaitForSingleObject((HANDLE)handle, INFINITE) == WAIT_OBJECT_0) { + return CloseHandle((HANDLE)handle) != 0 ? 0 : -1; + } + return -1; +#else + return pthread_join(handle, NULL); +#endif +} + +static void argon2_thread_exit(void) { +#if defined(_WIN32) + _endthreadex(0); +#else + pthread_exit(NULL); +#endif +} + +#endif /* ARGON2_NO_THREADS */ diff --git a/bundled/cbits/argon2/thread.h b/bundled/cbits/argon2/thread.h new file mode 100644 index 0000000..d91882c --- /dev/null +++ b/bundled/cbits/argon2/thread.h @@ -0,0 +1,67 @@ +/* + * Argon2 reference source code package - reference C implementations + * + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves + * + * You may use this work under the terms of a Creative Commons CC0 1.0 + * License/Waiver or the Apache Public License 2.0, at your option. The terms of + * these licenses can be found at: + * + * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + * + * You should have received a copy of both of these licenses along with this + * software. If not, they may be obtained at the above URLs. + */ + +#ifndef ARGON2_THREAD_H +#define ARGON2_THREAD_H + +#if !defined(ARGON2_NO_THREADS) + +/* + Here we implement an abstraction layer for the simpĺe requirements + of the Argon2 code. We only require 3 primitives---thread creation, + joining, and termination---so full emulation of the pthreads API + is unwarranted. Currently we wrap pthreads and Win32 threads. + + The API defines 2 types: the function pointer type, + argon2_thread_func_t, + and the type of the thread handle---argon2_thread_handle_t. +*/ +#if defined(_WIN32) +#include +typedef unsigned(__stdcall *argon2_thread_func_t)(void *); +typedef uintptr_t argon2_thread_handle_t; +#else +#include +typedef void *(*argon2_thread_func_t)(void *); +typedef pthread_t argon2_thread_handle_t; +#endif + +/* Creates a thread + * @param handle pointer to a thread handle, which is the output of this + * function. Must not be NULL. + * @param func A function pointer for the thread's entry point. Must not be + * NULL. + * @param args Pointer that is passed as an argument to @func. May be NULL. + * @return 0 if @handle and @func are valid pointers and a thread is successfuly + * created. + */ +static int argon2_thread_create(argon2_thread_handle_t *handle, + argon2_thread_func_t func, void *args); + +/* Waits for a thread to terminate + * @param handle Handle to a thread created with argon2_thread_create. + * @return 0 if @handle is a valid handle, and joining completed successfully. +*/ +static int argon2_thread_join(argon2_thread_handle_t handle); + +/* Terminate the current thread. Must be run inside a thread created by + * argon2_thread_create. +*/ +static void argon2_thread_exit(void); + +#endif /* ARGON2_NO_THREADS */ +#endif diff --git a/bundled/cbits/blake2/sse/blake2-config.h b/bundled/cbits/blake2/sse/blake2-config.h new file mode 100644 index 0000000..a524aa9 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2-config.h @@ -0,0 +1,72 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2_CONFIG_H +#define BLAKE2_CONFIG_H + +/* These don't work everywhere */ +#if defined(__SSE2__) || defined(__x86_64__) || defined(__amd64__) +#define HAVE_SSE2 +#endif + +#if defined(__SSSE3__) +#define HAVE_SSSE3 +#endif + +#if defined(__SSE4_1__) +#define HAVE_SSE41 +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#endif + +#if defined(__XOP__) +#define HAVE_XOP +#endif + + +#ifdef HAVE_AVX2 +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_XOP +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_AVX +#ifndef HAVE_SSE41 +#define HAVE_SSE41 +#endif +#endif + +#ifdef HAVE_SSE41 +#ifndef HAVE_SSSE3 +#define HAVE_SSSE3 +#endif +#endif + +#ifdef HAVE_SSSE3 +#define HAVE_SSE2 +#endif + +#if !defined(HAVE_SSE2) +#error "This code requires at least SSE2." +#endif + +#endif diff --git a/bundled/cbits/blake2/sse/blake2-impl.h b/bundled/cbits/blake2/sse/blake2-impl.h new file mode 100644 index 0000000..c1df82e --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2-impl.h @@ -0,0 +1,160 @@ +/* + BLAKE2 reference source code package - reference C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2_IMPL_H +#define BLAKE2_IMPL_H + +#include +#include + +#if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L) + #if defined(_MSC_VER) + #define BLAKE2_INLINE __inline + #elif defined(__GNUC__) + #define BLAKE2_INLINE __inline__ + #else + #define BLAKE2_INLINE + #endif +#else + #define BLAKE2_INLINE inline +#endif + +static BLAKE2_INLINE uint32_t load32( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + return (( uint32_t )( p[0] ) << 0) | + (( uint32_t )( p[1] ) << 8) | + (( uint32_t )( p[2] ) << 16) | + (( uint32_t )( p[3] ) << 24) ; +#endif +} + +static BLAKE2_INLINE uint64_t load64( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + return (( uint64_t )( p[0] ) << 0) | + (( uint64_t )( p[1] ) << 8) | + (( uint64_t )( p[2] ) << 16) | + (( uint64_t )( p[3] ) << 24) | + (( uint64_t )( p[4] ) << 32) | + (( uint64_t )( p[5] ) << 40) | + (( uint64_t )( p[6] ) << 48) | + (( uint64_t )( p[7] ) << 56) ; +#endif +} + +static BLAKE2_INLINE uint16_t load16( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint16_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + return ( uint16_t )((( uint32_t )( p[0] ) << 0) | + (( uint32_t )( p[1] ) << 8)); +#endif +} + +static BLAKE2_INLINE void store16( void *dst, uint16_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static BLAKE2_INLINE void store32( void *dst, uint32_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +#endif +} + +static BLAKE2_INLINE void store64( void *dst, uint64_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); + p[4] = (uint8_t)(w >> 32); + p[5] = (uint8_t)(w >> 40); + p[6] = (uint8_t)(w >> 48); + p[7] = (uint8_t)(w >> 56); +#endif +} + +static BLAKE2_INLINE uint64_t load48( const void *src ) +{ + const uint8_t *p = ( const uint8_t * )src; + return (( uint64_t )( p[0] ) << 0) | + (( uint64_t )( p[1] ) << 8) | + (( uint64_t )( p[2] ) << 16) | + (( uint64_t )( p[3] ) << 24) | + (( uint64_t )( p[4] ) << 32) | + (( uint64_t )( p[5] ) << 40) ; +} + +static BLAKE2_INLINE void store48( void *dst, uint64_t w ) +{ + uint8_t *p = ( uint8_t * )dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); + p[4] = (uint8_t)(w >> 32); + p[5] = (uint8_t)(w >> 40); +} + +static BLAKE2_INLINE uint32_t rotr32( const uint32_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 32 - c ) ); +} + +static BLAKE2_INLINE uint64_t rotr64( const uint64_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/* prevents compiler optimizing out memset() */ +static BLAKE2_INLINE void secure_zero_memory(void *v, size_t n) +{ + static void *(*const volatile memset_v)(void *, int, size_t) = &memset; + memset_v(v, 0, n); +} + +#endif diff --git a/bundled/cbits/blake2/sse/blake2.h b/bundled/cbits/blake2/sse/blake2.h new file mode 100644 index 0000000..d69b087 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2.h @@ -0,0 +1,195 @@ +/* + BLAKE2 reference source code package - reference C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2_H +#define BLAKE2_H + +#include +#include + +#if defined(_MSC_VER) +#define BLAKE2_PACKED(x) __pragma(pack(push, 1)) x __pragma(pack(pop)) +#else +#define BLAKE2_PACKED(x) x __attribute__((packed)) +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + + enum blake2s_constant + { + BLAKE2S_BLOCKBYTES = 64, + BLAKE2S_OUTBYTES = 32, + BLAKE2S_KEYBYTES = 32, + BLAKE2S_SALTBYTES = 8, + BLAKE2S_PERSONALBYTES = 8 + }; + + enum blake2b_constant + { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 + }; + + typedef struct blake2s_state__ + { + uint32_t h[8]; + uint32_t t[2]; + uint32_t f[2]; + uint8_t buf[BLAKE2S_BLOCKBYTES]; + size_t buflen; + size_t outlen; + uint8_t last_node; + } blake2s_state; + + typedef struct blake2b_state__ + { + uint64_t h[8]; + uint64_t t[2]; + uint64_t f[2]; + uint8_t buf[BLAKE2B_BLOCKBYTES]; + size_t buflen; + size_t outlen; + uint8_t last_node; + } blake2b_state; + + typedef struct blake2sp_state__ + { + blake2s_state S[8][1]; + blake2s_state R[1]; + uint8_t buf[8 * BLAKE2S_BLOCKBYTES]; + size_t buflen; + size_t outlen; + } blake2sp_state; + + typedef struct blake2bp_state__ + { + blake2b_state S[4][1]; + blake2b_state R[1]; + uint8_t buf[4 * BLAKE2B_BLOCKBYTES]; + size_t buflen; + size_t outlen; + } blake2bp_state; + + + BLAKE2_PACKED(struct blake2s_param__ + { + uint8_t digest_length; /* 1 */ + uint8_t key_length; /* 2 */ + uint8_t fanout; /* 3 */ + uint8_t depth; /* 4 */ + uint32_t leaf_length; /* 8 */ + uint32_t node_offset; /* 12 */ + uint16_t xof_length; /* 14 */ + uint8_t node_depth; /* 15 */ + uint8_t inner_length; /* 16 */ + /* uint8_t reserved[0]; */ + uint8_t salt[BLAKE2S_SALTBYTES]; /* 24 */ + uint8_t personal[BLAKE2S_PERSONALBYTES]; /* 32 */ + }); + + typedef struct blake2s_param__ blake2s_param; + + BLAKE2_PACKED(struct blake2b_param__ + { + uint8_t digest_length; /* 1 */ + uint8_t key_length; /* 2 */ + uint8_t fanout; /* 3 */ + uint8_t depth; /* 4 */ + uint32_t leaf_length; /* 8 */ + uint32_t node_offset; /* 12 */ + uint32_t xof_length; /* 16 */ + uint8_t node_depth; /* 17 */ + uint8_t inner_length; /* 18 */ + uint8_t reserved[14]; /* 32 */ + uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */ + uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */ + }); + + typedef struct blake2b_param__ blake2b_param; + + typedef struct blake2xs_state__ + { + blake2s_state S[1]; + blake2s_param P[1]; + } blake2xs_state; + + typedef struct blake2xb_state__ + { + blake2b_state S[1]; + blake2b_param P[1]; + } blake2xb_state; + + /* Padded structs result in a compile-time error */ + enum { + BLAKE2_DUMMY_1 = 1/(sizeof(blake2s_param) == BLAKE2S_OUTBYTES), + BLAKE2_DUMMY_2 = 1/(sizeof(blake2b_param) == BLAKE2B_OUTBYTES) + }; + + /* Streaming API */ + int _cryptonite_blake2s_init( blake2s_state *S, size_t outlen ); + int _cryptonite_blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen ); + int _cryptonite_blake2s_init_param( blake2s_state *S, const blake2s_param *P ); + int _cryptonite_blake2s_update( blake2s_state *S, const void *in, size_t inlen ); + int _cryptonite_blake2s_final( blake2s_state *S, void *out, size_t outlen ); + + int _cryptonite_blake2b_init( blake2b_state *S, size_t outlen ); + int _cryptonite_blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen ); + int _cryptonite_blake2b_init_param( blake2b_state *S, const blake2b_param *P ); + int _cryptonite_blake2b_update( blake2b_state *S, const void *in, size_t inlen ); + int _cryptonite_blake2b_final( blake2b_state *S, void *out, size_t outlen ); + + int _cryptonite_blake2sp_init( blake2sp_state *S, size_t outlen ); + int _cryptonite_blake2sp_init_key( blake2sp_state *S, size_t outlen, const void *key, size_t keylen ); + int _cryptonite_blake2sp_update( blake2sp_state *S, const void *in, size_t inlen ); + int _cryptonite_blake2sp_final( blake2sp_state *S, void *out, size_t outlen ); + + int _cryptonite_blake2bp_init( blake2bp_state *S, size_t outlen ); + int _cryptonite_blake2bp_init_key( blake2bp_state *S, size_t outlen, const void *key, size_t keylen ); + int _cryptonite_blake2bp_update( blake2bp_state *S, const void *in, size_t inlen ); + int _cryptonite_blake2bp_final( blake2bp_state *S, void *out, size_t outlen ); + + /* Variable output length API */ + int _cryptonite_blake2xs_init( blake2xs_state *S, const size_t outlen ); + int _cryptonite_blake2xs_init_key( blake2xs_state *S, const size_t outlen, const void *key, size_t keylen ); + int _cryptonite_blake2xs_update( blake2xs_state *S, const void *in, size_t inlen ); + int _cryptonite_blake2xs_final(blake2xs_state *S, void *out, size_t outlen); + + int _cryptonite_blake2xb_init( blake2xb_state *S, const size_t outlen ); + int _cryptonite_blake2xb_init_key( blake2xb_state *S, const size_t outlen, const void *key, size_t keylen ); + int _cryptonite_blake2xb_update( blake2xb_state *S, const void *in, size_t inlen ); + int _cryptonite_blake2xb_final(blake2xb_state *S, void *out, size_t outlen); + + /* Simple API */ + int _cryptonite_blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ); + int _cryptonite_blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ); + + int _cryptonite_blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ); + int _cryptonite_blake2bp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ); + + int _cryptonite_blake2xs( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ); + int _cryptonite_blake2xb( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ); + + /* This is simply an alias for blake2b */ + int _cryptonite_blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/bundled/cbits/blake2/sse/blake2b-load-sse2.h b/bundled/cbits/blake2/sse/blake2b-load-sse2.h new file mode 100644 index 0000000..23a8d40 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2b-load-sse2.h @@ -0,0 +1,68 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2B_LOAD_SSE2_H +#define BLAKE2B_LOAD_SSE2_H + +#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) +#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5) +#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2) +#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7) +#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1) +#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13) +#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2) +#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6) +#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8) +#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11) +#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15) +#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14) +#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14) +#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13) +#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9) +#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2) +#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12) +#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1) +#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8) +#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6) +#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11) +#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3) +#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1) +#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4) +#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7) +#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6) +#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3) +#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12) +#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) + + +#endif diff --git a/bundled/cbits/blake2/sse/blake2b-load-sse41.h b/bundled/cbits/blake2/sse/blake2b-load-sse41.h new file mode 100644 index 0000000..0eca865 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2b-load-sse41.h @@ -0,0 +1,402 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2B_LOAD_SSE41_H +#define BLAKE2B_LOAD_SSE41_H + +#define LOAD_MSG_0_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_0_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_1_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_1_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_1_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_1_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#define LOAD_MSG_2_1(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m5, 8); \ +b1 = _mm_unpackhi_epi64(m2, m7); \ +} while(0) + + +#define LOAD_MSG_2_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m0); \ +b1 = _mm_blend_epi16(m1, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_2_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m5, m1, 0xF0); \ +b1 = _mm_unpackhi_epi64(m3, m4); \ +} while(0) + + +#define LOAD_MSG_2_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m3); \ +b1 = _mm_alignr_epi8(m2, m0, 8); \ +} while(0) + + +#define LOAD_MSG_3_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_unpackhi_epi64(m6, m5); \ +} while(0) + + +#define LOAD_MSG_3_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m0); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_3_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m2, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_3_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m5); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_4_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m2); \ +b1 = _mm_unpacklo_epi64(m1, m5); \ +} while(0) + + +#define LOAD_MSG_4_2(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m0, m3, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m7, m5, 0xF0); \ +b1 = _mm_blend_epi16(m3, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m0, 8); \ +b1 = _mm_blend_epi16(m4, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_5_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m3); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_5_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m5); \ +b1 = _mm_unpackhi_epi64(m5, m1); \ +} while(0) + + +#define LOAD_MSG_5_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m2, m3, 0xF0); \ +b1 = _mm_unpackhi_epi64(m7, m0); \ +} while(0) + + +#define LOAD_MSG_5_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m2); \ +b1 = _mm_blend_epi16(m7, m4, 0xF0); \ +} while(0) + + +#define LOAD_MSG_6_1(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m6, m0, 0xF0); \ +b1 = _mm_unpacklo_epi64(m7, m2); \ +} while(0) + + +#define LOAD_MSG_6_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_alignr_epi8(m5, m6, 8); \ +} while(0) + + +#define LOAD_MSG_6_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m3); \ +b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ +} while(0) + + +#define LOAD_MSG_6_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_blend_epi16(m1, m5, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m3); \ +b1 = _mm_blend_epi16(m6, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_2(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpackhi_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_7_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_unpacklo_epi64(m4, m1); \ +} while(0) + + +#define LOAD_MSG_7_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m2); \ +b1 = _mm_unpacklo_epi64(m3, m5); \ +} while(0) + + +#define LOAD_MSG_8_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m7); \ +b1 = _mm_alignr_epi8(m0, m5, 8); \ +} while(0) + + +#define LOAD_MSG_8_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_alignr_epi8(m4, m1, 8); \ +} while(0) + + +#define LOAD_MSG_8_3(b0, b1) \ +do \ +{ \ +b0 = m6; \ +b1 = _mm_alignr_epi8(m5, m0, 8); \ +} while(0) + + +#define LOAD_MSG_8_4(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m3, 0xF0); \ +b1 = m2; \ +} while(0) + + +#define LOAD_MSG_9_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_unpackhi_epi64(m3, m0); \ +} while(0) + + +#define LOAD_MSG_9_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m2); \ +b1 = _mm_blend_epi16(m3, m2, 0xF0); \ +} while(0) + + +#define LOAD_MSG_9_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_unpackhi_epi64(m1, m6); \ +} while(0) + + +#define LOAD_MSG_9_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpacklo_epi64(m6, m0); \ +} while(0) + + +#define LOAD_MSG_10_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_10_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_11_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_11_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_11_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_11_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#endif diff --git a/bundled/cbits/blake2/sse/blake2b-round.h b/bundled/cbits/blake2/sse/blake2b-round.h new file mode 100644 index 0000000..6537fff --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2b-round.h @@ -0,0 +1,157 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2B_ROUND_H +#define BLAKE2B_ROUND_H + +#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) + +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +#define LIKELY(x) __builtin_expect((x),1) + + +/* Microarchitecture-specific macros */ +#ifndef HAVE_XOP +#ifdef HAVE_SSSE3 +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) +#else +#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) )) +#endif +#else +/* ... */ +#endif + + + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); \ + +#if defined(HAVE_SSSE3) +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; +#else + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row4l;\ + t1 = row2l;\ + row4l = row3l;\ + row3l = row3h;\ + row3h = row4l;\ + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row3l;\ + row3l = row3h;\ + row3h = t0;\ + t0 = row2l;\ + t1 = row4l;\ + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) + +#endif + +#if defined(HAVE_SSE41) +#include "blake2b-load-sse41.h" +#else +#include "blake2b-load-sse2.h" +#endif + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + +#endif diff --git a/bundled/cbits/blake2/sse/blake2b.c b/bundled/cbits/blake2/sse/blake2b.c new file mode 100644 index 0000000..23f4b25 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2b.c @@ -0,0 +1,373 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ + +#include +#include +#include + +#include "blake2.h" +#include "blake2-impl.h" + +#include "blake2-config.h" + +#ifdef _MSC_VER +#include /* for _mm_set_epi64x */ +#endif +#include +#if defined(HAVE_SSSE3) +#include +#endif +#if defined(HAVE_SSE41) +#include +#endif +#if defined(HAVE_AVX) +#include +#endif +#if defined(HAVE_XOP) +#include +#endif + +#include "blake2b-round.h" + +static const uint64_t blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/* Some helper functions */ +static void blake2b_set_lastnode( blake2b_state *S ) +{ + S->f[1] = (uint64_t)-1; +} + +static int blake2b_is_lastblock( const blake2b_state *S ) +{ + return S->f[0] != 0; +} + +static void blake2b_set_lastblock( blake2b_state *S ) +{ + if( S->last_node ) blake2b_set_lastnode( S ); + + S->f[0] = (uint64_t)-1; +} + +static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc ) +{ + S->t[0] += inc; + S->t[1] += ( S->t[0] < inc ); +} + +/* init xors IV with input parameter block */ +int _cryptonite_blake2b_init_param( blake2b_state *S, const blake2b_param *P ) +{ + size_t i; + /*blake2b_init0( S ); */ + const unsigned char * v = ( const unsigned char * )( blake2b_IV ); + const unsigned char * p = ( const unsigned char * )( P ); + unsigned char * h = ( unsigned char * )( S->h ); + /* IV XOR ParamBlock */ + memset( S, 0, sizeof( blake2b_state ) ); + + for( i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i]; + + S->outlen = P->digest_length; + return 0; +} + + +/* Some sort of default parameter block initialization, for sequential blake2b */ +int _cryptonite_blake2b_init( blake2b_state *S, size_t outlen ) +{ + blake2b_param P[1]; + + if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; + + P->digest_length = (uint8_t)outlen; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + store32( &P->node_offset, 0 ); + store32( &P->xof_length, 0 ); + P->node_depth = 0; + P->inner_length = 0; + memset( P->reserved, 0, sizeof( P->reserved ) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + + return _cryptonite_blake2b_init_param( S, P ); +} + +int _cryptonite_blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen ) +{ + blake2b_param P[1]; + + if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; + + if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1; + + P->digest_length = (uint8_t)outlen; + P->key_length = (uint8_t)keylen; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + store32( &P->node_offset, 0 ); + store32( &P->xof_length, 0 ); + P->node_depth = 0; + P->inner_length = 0; + memset( P->reserved, 0, sizeof( P->reserved ) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + + if( _cryptonite_blake2b_init_param( S, P ) < 0 ) + return 0; + + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset( block, 0, BLAKE2B_BLOCKBYTES ); + memcpy( block, key, keylen ); + _cryptonite_blake2b_update( S, block, BLAKE2B_BLOCKBYTES ); + secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */ + } + return 0; +} + +static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; +#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) + const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 ); + const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 ); +#endif +#if defined(HAVE_SSE41) + const __m128i m0 = LOADU( block + 00 ); + const __m128i m1 = LOADU( block + 16 ); + const __m128i m2 = LOADU( block + 32 ); + const __m128i m3 = LOADU( block + 48 ); + const __m128i m4 = LOADU( block + 64 ); + const __m128i m5 = LOADU( block + 80 ); + const __m128i m6 = LOADU( block + 96 ); + const __m128i m7 = LOADU( block + 112 ); +#else + const uint64_t m0 = load64(block + 0 * sizeof(uint64_t)); + const uint64_t m1 = load64(block + 1 * sizeof(uint64_t)); + const uint64_t m2 = load64(block + 2 * sizeof(uint64_t)); + const uint64_t m3 = load64(block + 3 * sizeof(uint64_t)); + const uint64_t m4 = load64(block + 4 * sizeof(uint64_t)); + const uint64_t m5 = load64(block + 5 * sizeof(uint64_t)); + const uint64_t m6 = load64(block + 6 * sizeof(uint64_t)); + const uint64_t m7 = load64(block + 7 * sizeof(uint64_t)); + const uint64_t m8 = load64(block + 8 * sizeof(uint64_t)); + const uint64_t m9 = load64(block + 9 * sizeof(uint64_t)); + const uint64_t m10 = load64(block + 10 * sizeof(uint64_t)); + const uint64_t m11 = load64(block + 11 * sizeof(uint64_t)); + const uint64_t m12 = load64(block + 12 * sizeof(uint64_t)); + const uint64_t m13 = load64(block + 13 * sizeof(uint64_t)); + const uint64_t m14 = load64(block + 14 * sizeof(uint64_t)); + const uint64_t m15 = load64(block + 15 * sizeof(uint64_t)); +#endif + row1l = LOADU( &S->h[0] ); + row1h = LOADU( &S->h[2] ); + row2l = LOADU( &S->h[4] ); + row2h = LOADU( &S->h[6] ); + row3l = LOADU( &blake2b_IV[0] ); + row3h = LOADU( &blake2b_IV[2] ); + row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) ); + row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) ); + ROUND( 0 ); + ROUND( 1 ); + ROUND( 2 ); + ROUND( 3 ); + ROUND( 4 ); + ROUND( 5 ); + ROUND( 6 ); + ROUND( 7 ); + ROUND( 8 ); + ROUND( 9 ); + ROUND( 10 ); + ROUND( 11 ); + row1l = _mm_xor_si128( row3l, row1l ); + row1h = _mm_xor_si128( row3h, row1h ); + STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) ); + STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) ); + row2l = _mm_xor_si128( row4l, row2l ); + row2h = _mm_xor_si128( row4h, row2h ); + STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) ); + STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) ); +} + + +int _cryptonite_blake2b_update( blake2b_state *S, const void *pin, size_t inlen ) +{ + const unsigned char * in = (const unsigned char *)pin; + if( inlen > 0 ) + { + size_t left = S->buflen; + size_t fill = BLAKE2B_BLOCKBYTES - left; + if( inlen > fill ) + { + S->buflen = 0; + memcpy( S->buf + left, in, fill ); /* Fill buffer */ + blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); + blake2b_compress( S, S->buf ); /* Compress */ + in += fill; inlen -= fill; + while(inlen > BLAKE2B_BLOCKBYTES) { + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress( S, in ); + in += BLAKE2B_BLOCKBYTES; + inlen -= BLAKE2B_BLOCKBYTES; + } + } + memcpy( S->buf + S->buflen, in, inlen ); + S->buflen += inlen; + } + return 0; +} + + +int _cryptonite_blake2b_final( blake2b_state *S, void *out, size_t outlen ) +{ + if( out == NULL || outlen < S->outlen ) + return -1; + + if( blake2b_is_lastblock( S ) ) + return -1; + + blake2b_increment_counter( S, S->buflen ); + blake2b_set_lastblock( S ); + memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ + blake2b_compress( S, S->buf ); + + memcpy( out, &S->h[0], S->outlen ); + return 0; +} + + +int _cryptonite_blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) +{ + blake2b_state S[1]; + + /* Verify parameters */ + if ( NULL == in && inlen > 0 ) return -1; + + if ( NULL == out ) return -1; + + if( NULL == key && keylen > 0 ) return -1; + + if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1; + + if( keylen > BLAKE2B_KEYBYTES ) return -1; + + if( keylen ) + { + if( _cryptonite_blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1; + } + else + { + if( _cryptonite_blake2b_init( S, outlen ) < 0 ) return -1; + } + + _cryptonite_blake2b_update( S, ( const uint8_t * )in, inlen ); + _cryptonite_blake2b_final( S, out, outlen ); + return 0; +} + +int _cryptonite_blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) { + return _cryptonite_blake2b(out, outlen, in, inlen, key, keylen); +} + +#if defined(SUPERCOP) +int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen ) +{ + return _cryptonite_blake2b( out, BLAKE2B_OUTBYTES, in, inlen, NULL, 0 ); +} +#endif + +#if defined(BLAKE2B_SELFTEST) +#include +#include "blake2-kat.h" +int main( void ) +{ + uint8_t key[BLAKE2B_KEYBYTES]; + uint8_t buf[BLAKE2_KAT_LENGTH]; + size_t i, step; + + for( i = 0; i < BLAKE2B_KEYBYTES; ++i ) + key[i] = ( uint8_t )i; + + for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) + buf[i] = ( uint8_t )i; + + /* Test simple API */ + for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) + { + uint8_t hash[BLAKE2B_OUTBYTES]; + blake2b( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES ); + + if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) ) + { + goto fail; + } + } + + /* Test streaming API */ + for(step = 1; step < BLAKE2B_BLOCKBYTES; ++step) { + for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) { + uint8_t hash[BLAKE2B_OUTBYTES]; + blake2b_state S; + uint8_t * p = buf; + size_t mlen = i; + int err = 0; + + if( (err = _cryptonite_blake2b_init_key(&S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES)) < 0 ) { + goto fail; + } + + while (mlen >= step) { + if ( (err = _cryptonite_blake2b_update(&S, p, step)) < 0 ) { + goto fail; + } + mlen -= step; + p += step; + } + if ( (err = _cryptonite_blake2b_update(&S, p, mlen)) < 0) { + goto fail; + } + if ( (err = _cryptonite_blake2b_final(&S, hash, BLAKE2B_OUTBYTES)) < 0) { + goto fail; + } + + if (0 != memcmp(hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES)) { + goto fail; + } + } + } + + puts( "ok" ); + return 0; +fail: + puts("error"); + return -1; +} +#endif diff --git a/bundled/cbits/blake2/sse/blake2bp.c b/bundled/cbits/blake2/sse/blake2bp.c new file mode 100644 index 0000000..b51edb4 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2bp.c @@ -0,0 +1,361 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ + +#include +#include +#include +#include + +#if defined(_OPENMP) +#include +#endif + +#include "blake2.h" +#include "blake2-impl.h" + +#define PARALLELISM_DEGREE 4 + +/* + blake2b_init_param defaults to setting the expecting output length + from the digest_length parameter block field. + + In some cases, however, we do not want this, as the output length + of these instances is given by inner_length instead. +*/ +static int blake2bp_init_leaf_param( blake2b_state *S, const blake2b_param *P ) +{ + int err = _cryptonite_blake2b_init_param(S, P); + S->outlen = P->inner_length; + return err; +} + +static int blake2bp_init_leaf( blake2b_state *S, size_t outlen, size_t keylen, uint64_t offset ) +{ + blake2b_param P[1]; + P->digest_length = (uint8_t)outlen; + P->key_length = (uint8_t)keylen; + P->fanout = PARALLELISM_DEGREE; + P->depth = 2; + P->leaf_length = 0; + P->node_offset = offset; + P->xof_length = 0; + P->node_depth = 0; + P->inner_length = BLAKE2B_OUTBYTES; + memset( P->reserved, 0, sizeof( P->reserved ) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + return blake2bp_init_leaf_param( S, P ); +} + +static int blake2bp_init_root( blake2b_state *S, size_t outlen, size_t keylen ) +{ + blake2b_param P[1]; + P->digest_length = (uint8_t)outlen; + P->key_length = (uint8_t)keylen; + P->fanout = PARALLELISM_DEGREE; + P->depth = 2; + P->leaf_length = 0; + P->node_offset = 0; + P->xof_length = 0; + P->node_depth = 1; + P->inner_length = BLAKE2B_OUTBYTES; + memset( P->reserved, 0, sizeof( P->reserved ) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + return _cryptonite_blake2b_init_param( S, P ); +} + + +int _cryptonite_blake2bp_init( blake2bp_state *S, size_t outlen ) +{ + size_t i; + if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1; + + memset( S->buf, 0, sizeof( S->buf ) ); + S->buflen = 0; + S->outlen = outlen; + + if( blake2bp_init_root( S->R, outlen, 0 ) < 0 ) + return -1; + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + if( blake2bp_init_leaf( S->S[i], outlen, 0, i ) < 0 ) return -1; + + S->R->last_node = 1; + S->S[PARALLELISM_DEGREE - 1]->last_node = 1; + return 0; +} + +int _cryptonite_blake2bp_init_key( blake2bp_state *S, size_t outlen, const void *key, size_t keylen ) +{ + size_t i; + + if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1; + + if( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1; + + memset( S->buf, 0, sizeof( S->buf ) ); + S->buflen = 0; + S->outlen = outlen; + + if( blake2bp_init_root( S->R, outlen, keylen ) < 0 ) + return -1; + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + if( blake2bp_init_leaf( S->S[i], outlen, keylen, i ) < 0 ) return -1; + + S->R->last_node = 1; + S->S[PARALLELISM_DEGREE - 1]->last_node = 1; + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset( block, 0, BLAKE2B_BLOCKBYTES ); + memcpy( block, key, keylen ); + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + _cryptonite_blake2b_update( S->S[i], block, BLAKE2B_BLOCKBYTES ); + + secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */ + } + return 0; +} + + +int _cryptonite_blake2bp_update( blake2bp_state *S, const void *pin, size_t inlen ) +{ + const unsigned char * in = (const unsigned char *)pin; + size_t left = S->buflen; + size_t fill = sizeof( S->buf ) - left; + size_t i; + + if( left && inlen >= fill ) + { + memcpy( S->buf + left, in, fill ); + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + _cryptonite_blake2b_update( S->S[i], S->buf + i * BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); + + in += fill; + inlen -= fill; + left = 0; + } + +#if defined(_OPENMP) + #pragma omp parallel shared(S), num_threads(PARALLELISM_DEGREE) +#else + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) +#endif + { +#if defined(_OPENMP) + size_t i = omp_get_thread_num(); +#endif + size_t inlen__ = inlen; + const unsigned char *in__ = ( const unsigned char * )in; + in__ += i * BLAKE2B_BLOCKBYTES; + + while( inlen__ >= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES ) + { + _cryptonite_blake2b_update( S->S[i], in__, BLAKE2B_BLOCKBYTES ); + in__ += PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES; + inlen__ -= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES; + } + } + + in += inlen - inlen % ( PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES ); + inlen %= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES; + + if( inlen > 0 ) + memcpy( S->buf + left, in, inlen ); + + S->buflen = left + inlen; + return 0; +} + + + +int _cryptonite_blake2bp_final( blake2bp_state *S, void *out, size_t outlen ) +{ + uint8_t hash[PARALLELISM_DEGREE][BLAKE2B_OUTBYTES]; + size_t i; + + if(out == NULL || outlen < S->outlen) { + return -1; + } + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + { + if( S->buflen > i * BLAKE2B_BLOCKBYTES ) + { + size_t left = S->buflen - i * BLAKE2B_BLOCKBYTES; + + if( left > BLAKE2B_BLOCKBYTES ) left = BLAKE2B_BLOCKBYTES; + + _cryptonite_blake2b_update( S->S[i], S->buf + i * BLAKE2B_BLOCKBYTES, left ); + } + + _cryptonite_blake2b_final( S->S[i], hash[i], BLAKE2B_OUTBYTES ); + } + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + _cryptonite_blake2b_update( S->R, hash[i], BLAKE2B_OUTBYTES ); + + return _cryptonite_blake2b_final( S->R, out, S->outlen ); +} + +int _cryptonite_blake2bp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) +{ + uint8_t hash[PARALLELISM_DEGREE][BLAKE2B_OUTBYTES]; + blake2b_state S[PARALLELISM_DEGREE][1]; + blake2b_state FS[1]; + size_t i; + + /* Verify parameters */ + if ( NULL == in && inlen > 0 ) return -1; + + if ( NULL == out ) return -1; + + if( NULL == key && keylen > 0 ) return -1; + + if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1; + + if( keylen > BLAKE2B_KEYBYTES ) return -1; + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + if( blake2bp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1; + + S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */ + + if( keylen > 0 ) + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset( block, 0, BLAKE2B_BLOCKBYTES ); + memcpy( block, key, keylen ); + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + _cryptonite_blake2b_update( S[i], block, BLAKE2B_BLOCKBYTES ); + + secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */ + } + +#if defined(_OPENMP) + #pragma omp parallel shared(S,hash), num_threads(PARALLELISM_DEGREE) +#else + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) +#endif + { +#if defined(_OPENMP) + size_t i = omp_get_thread_num(); +#endif + size_t inlen__ = inlen; + const unsigned char *in__ = ( const unsigned char * )in; + in__ += i * BLAKE2B_BLOCKBYTES; + + while( inlen__ >= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES ) + { + _cryptonite_blake2b_update( S[i], in__, BLAKE2B_BLOCKBYTES ); + in__ += PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES; + inlen__ -= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES; + } + + if( inlen__ > i * BLAKE2B_BLOCKBYTES ) + { + const size_t left = inlen__ - i * BLAKE2B_BLOCKBYTES; + const size_t len = left <= BLAKE2B_BLOCKBYTES ? left : BLAKE2B_BLOCKBYTES; + _cryptonite_blake2b_update( S[i], in__, len ); + } + + _cryptonite_blake2b_final( S[i], hash[i], BLAKE2B_OUTBYTES ); + } + + if( blake2bp_init_root( FS, outlen, keylen ) < 0 ) + return -1; + + FS->last_node = 1; /* Mark as last node */ + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + _cryptonite_blake2b_update( FS, hash[i], BLAKE2B_OUTBYTES ); + + return _cryptonite_blake2b_final( FS, out, outlen ); +} + + +#if defined(BLAKE2BP_SELFTEST) +#include +#include "blake2-kat.h" +int main( void ) +{ + uint8_t key[BLAKE2B_KEYBYTES]; + uint8_t buf[BLAKE2_KAT_LENGTH]; + size_t i, step; + + for( i = 0; i < BLAKE2B_KEYBYTES; ++i ) + key[i] = ( uint8_t )i; + + for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) + buf[i] = ( uint8_t )i; + + /* Test simple API */ + for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) + { + uint8_t hash[BLAKE2B_OUTBYTES]; + _cryptonite_blake2bp( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES ); + + if( 0 != memcmp( hash, blake2bp_keyed_kat[i], BLAKE2B_OUTBYTES ) ) + { + goto fail; + } + } + + /* Test streaming API */ + for(step = 1; step < BLAKE2B_BLOCKBYTES; ++step) { + for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) { + uint8_t hash[BLAKE2B_OUTBYTES]; + blake2bp_state S; + uint8_t * p = buf; + size_t mlen = i; + int err = 0; + + if( (err = _cryptonite_blake2bp_init_key(&S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES)) < 0 ) { + goto fail; + } + + while (mlen >= step) { + if ( (err = _cryptonite_blake2bp_update(&S, p, step)) < 0 ) { + goto fail; + } + mlen -= step; + p += step; + } + if ( (err = _cryptonite_blake2bp_update(&S, p, mlen)) < 0) { + goto fail; + } + if ( (err = _cryptonite_blake2bp_final(&S, hash, BLAKE2B_OUTBYTES)) < 0) { + goto fail; + } + + if (0 != memcmp(hash, blake2bp_keyed_kat[i], BLAKE2B_OUTBYTES)) { + goto fail; + } + } + } + + puts( "ok" ); + return 0; +fail: + puts("error"); + return -1; +} +#endif diff --git a/bundled/cbits/blake2/sse/blake2s-load-sse2.h b/bundled/cbits/blake2/sse/blake2s-load-sse2.h new file mode 100644 index 0000000..d2e9a09 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2s-load-sse2.h @@ -0,0 +1,60 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2S_LOAD_SSE2_H +#define BLAKE2S_LOAD_SSE2_H + +#define LOAD_MSG_0_1(buf) buf = _mm_set_epi32(m6,m4,m2,m0) +#define LOAD_MSG_0_2(buf) buf = _mm_set_epi32(m7,m5,m3,m1) +#define LOAD_MSG_0_3(buf) buf = _mm_set_epi32(m14,m12,m10,m8) +#define LOAD_MSG_0_4(buf) buf = _mm_set_epi32(m15,m13,m11,m9) +#define LOAD_MSG_1_1(buf) buf = _mm_set_epi32(m13,m9,m4,m14) +#define LOAD_MSG_1_2(buf) buf = _mm_set_epi32(m6,m15,m8,m10) +#define LOAD_MSG_1_3(buf) buf = _mm_set_epi32(m5,m11,m0,m1) +#define LOAD_MSG_1_4(buf) buf = _mm_set_epi32(m3,m7,m2,m12) +#define LOAD_MSG_2_1(buf) buf = _mm_set_epi32(m15,m5,m12,m11) +#define LOAD_MSG_2_2(buf) buf = _mm_set_epi32(m13,m2,m0,m8) +#define LOAD_MSG_2_3(buf) buf = _mm_set_epi32(m9,m7,m3,m10) +#define LOAD_MSG_2_4(buf) buf = _mm_set_epi32(m4,m1,m6,m14) +#define LOAD_MSG_3_1(buf) buf = _mm_set_epi32(m11,m13,m3,m7) +#define LOAD_MSG_3_2(buf) buf = _mm_set_epi32(m14,m12,m1,m9) +#define LOAD_MSG_3_3(buf) buf = _mm_set_epi32(m15,m4,m5,m2) +#define LOAD_MSG_3_4(buf) buf = _mm_set_epi32(m8,m0,m10,m6) +#define LOAD_MSG_4_1(buf) buf = _mm_set_epi32(m10,m2,m5,m9) +#define LOAD_MSG_4_2(buf) buf = _mm_set_epi32(m15,m4,m7,m0) +#define LOAD_MSG_4_3(buf) buf = _mm_set_epi32(m3,m6,m11,m14) +#define LOAD_MSG_4_4(buf) buf = _mm_set_epi32(m13,m8,m12,m1) +#define LOAD_MSG_5_1(buf) buf = _mm_set_epi32(m8,m0,m6,m2) +#define LOAD_MSG_5_2(buf) buf = _mm_set_epi32(m3,m11,m10,m12) +#define LOAD_MSG_5_3(buf) buf = _mm_set_epi32(m1,m15,m7,m4) +#define LOAD_MSG_5_4(buf) buf = _mm_set_epi32(m9,m14,m5,m13) +#define LOAD_MSG_6_1(buf) buf = _mm_set_epi32(m4,m14,m1,m12) +#define LOAD_MSG_6_2(buf) buf = _mm_set_epi32(m10,m13,m15,m5) +#define LOAD_MSG_6_3(buf) buf = _mm_set_epi32(m8,m9,m6,m0) +#define LOAD_MSG_6_4(buf) buf = _mm_set_epi32(m11,m2,m3,m7) +#define LOAD_MSG_7_1(buf) buf = _mm_set_epi32(m3,m12,m7,m13) +#define LOAD_MSG_7_2(buf) buf = _mm_set_epi32(m9,m1,m14,m11) +#define LOAD_MSG_7_3(buf) buf = _mm_set_epi32(m2,m8,m15,m5) +#define LOAD_MSG_7_4(buf) buf = _mm_set_epi32(m10,m6,m4,m0) +#define LOAD_MSG_8_1(buf) buf = _mm_set_epi32(m0,m11,m14,m6) +#define LOAD_MSG_8_2(buf) buf = _mm_set_epi32(m8,m3,m9,m15) +#define LOAD_MSG_8_3(buf) buf = _mm_set_epi32(m10,m1,m13,m12) +#define LOAD_MSG_8_4(buf) buf = _mm_set_epi32(m5,m4,m7,m2) +#define LOAD_MSG_9_1(buf) buf = _mm_set_epi32(m1,m7,m8,m10) +#define LOAD_MSG_9_2(buf) buf = _mm_set_epi32(m5,m6,m4,m2) +#define LOAD_MSG_9_3(buf) buf = _mm_set_epi32(m13,m3,m9,m15) +#define LOAD_MSG_9_4(buf) buf = _mm_set_epi32(m0,m12,m14,m11) + + +#endif diff --git a/bundled/cbits/blake2/sse/blake2s-load-sse41.h b/bundled/cbits/blake2/sse/blake2s-load-sse41.h new file mode 100644 index 0000000..c316fb5 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2s-load-sse41.h @@ -0,0 +1,229 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2S_LOAD_SSE41_H +#define BLAKE2S_LOAD_SSE41_H + +#define LOAD_MSG_0_1(buf) \ +buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0))); + +#define LOAD_MSG_0_2(buf) \ +buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1))); + +#define LOAD_MSG_0_3(buf) \ +buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0))); + +#define LOAD_MSG_0_4(buf) \ +buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1))); + +#define LOAD_MSG_1_1(buf) \ +t0 = _mm_blend_epi16(m1, m2, 0x0C); \ +t1 = _mm_slli_si128(m3, 4); \ +t2 = _mm_blend_epi16(t0, t1, 0xF0); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); + +#define LOAD_MSG_1_2(buf) \ +t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \ +t1 = _mm_blend_epi16(m1,m3,0xC0); \ +t2 = _mm_blend_epi16(t0, t1, 0xF0); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + +#define LOAD_MSG_1_3(buf) \ +t0 = _mm_slli_si128(m1, 4); \ +t1 = _mm_blend_epi16(m2, t0, 0x30); \ +t2 = _mm_blend_epi16(m0, t1, 0xF0); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + +#define LOAD_MSG_1_4(buf) \ +t0 = _mm_unpackhi_epi32(m0,m1); \ +t1 = _mm_slli_si128(m3, 4); \ +t2 = _mm_blend_epi16(t0, t1, 0x0C); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + +#define LOAD_MSG_2_1(buf) \ +t0 = _mm_unpackhi_epi32(m2,m3); \ +t1 = _mm_blend_epi16(m3,m1,0x0C); \ +t2 = _mm_blend_epi16(t0, t1, 0x0F); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); + +#define LOAD_MSG_2_2(buf) \ +t0 = _mm_unpacklo_epi32(m2,m0); \ +t1 = _mm_blend_epi16(t0, m0, 0xF0); \ +t2 = _mm_slli_si128(m3, 8); \ +buf = _mm_blend_epi16(t1, t2, 0xC0); + +#define LOAD_MSG_2_3(buf) \ +t0 = _mm_blend_epi16(m0, m2, 0x3C); \ +t1 = _mm_srli_si128(m1, 12); \ +t2 = _mm_blend_epi16(t0,t1,0x03); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2)); + +#define LOAD_MSG_2_4(buf) \ +t0 = _mm_slli_si128(m3, 4); \ +t1 = _mm_blend_epi16(m0, m1, 0x33); \ +t2 = _mm_blend_epi16(t1, t0, 0xC0); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); + +#define LOAD_MSG_3_1(buf) \ +t0 = _mm_unpackhi_epi32(m0,m1); \ +t1 = _mm_unpackhi_epi32(t0, m2); \ +t2 = _mm_blend_epi16(t1, m3, 0x0C); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); + +#define LOAD_MSG_3_2(buf) \ +t0 = _mm_slli_si128(m2, 8); \ +t1 = _mm_blend_epi16(m3,m0,0x0C); \ +t2 = _mm_blend_epi16(t1, t0, 0xC0); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); + +#define LOAD_MSG_3_3(buf) \ +t0 = _mm_blend_epi16(m0,m1,0x0F); \ +t1 = _mm_blend_epi16(t0, m3, 0xC0); \ +buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); + +#define LOAD_MSG_3_4(buf) \ +t0 = _mm_unpacklo_epi32(m0,m2); \ +t1 = _mm_unpackhi_epi32(m1,m2); \ +buf = _mm_unpacklo_epi64(t1,t0); + +#define LOAD_MSG_4_1(buf) \ +t0 = _mm_unpacklo_epi64(m1,m2); \ +t1 = _mm_unpackhi_epi64(m0,m2); \ +t2 = _mm_blend_epi16(t0,t1,0x33); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); + +#define LOAD_MSG_4_2(buf) \ +t0 = _mm_unpackhi_epi64(m1,m3); \ +t1 = _mm_unpacklo_epi64(m0,m1); \ +buf = _mm_blend_epi16(t0,t1,0x33); + +#define LOAD_MSG_4_3(buf) \ +t0 = _mm_unpackhi_epi64(m3,m1); \ +t1 = _mm_unpackhi_epi64(m2,m0); \ +buf = _mm_blend_epi16(t1,t0,0x33); + +#define LOAD_MSG_4_4(buf) \ +t0 = _mm_blend_epi16(m0,m2,0x03); \ +t1 = _mm_slli_si128(t0, 8); \ +t2 = _mm_blend_epi16(t1,m3,0x0F); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3)); + +#define LOAD_MSG_5_1(buf) \ +t0 = _mm_unpackhi_epi32(m0,m1); \ +t1 = _mm_unpacklo_epi32(m0,m2); \ +buf = _mm_unpacklo_epi64(t0,t1); + +#define LOAD_MSG_5_2(buf) \ +t0 = _mm_srli_si128(m2, 4); \ +t1 = _mm_blend_epi16(m0,m3,0x03); \ +buf = _mm_blend_epi16(t1,t0,0x3C); + +#define LOAD_MSG_5_3(buf) \ +t0 = _mm_blend_epi16(m1,m0,0x0C); \ +t1 = _mm_srli_si128(m3, 4); \ +t2 = _mm_blend_epi16(t0,t1,0x30); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); + +#define LOAD_MSG_5_4(buf) \ +t0 = _mm_unpacklo_epi64(m1,m2); \ +t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \ +buf = _mm_blend_epi16(t0,t1,0x33); + +#define LOAD_MSG_6_1(buf) \ +t0 = _mm_slli_si128(m1, 12); \ +t1 = _mm_blend_epi16(m0,m3,0x33); \ +buf = _mm_blend_epi16(t1,t0,0xC0); + +#define LOAD_MSG_6_2(buf) \ +t0 = _mm_blend_epi16(m3,m2,0x30); \ +t1 = _mm_srli_si128(m1, 4); \ +t2 = _mm_blend_epi16(t0,t1,0x03); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0)); + +#define LOAD_MSG_6_3(buf) \ +t0 = _mm_unpacklo_epi64(m0,m2); \ +t1 = _mm_srli_si128(m1, 4); \ +buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0)); + +#define LOAD_MSG_6_4(buf) \ +t0 = _mm_unpackhi_epi32(m1,m2); \ +t1 = _mm_unpackhi_epi64(m0,t0); \ +buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); + +#define LOAD_MSG_7_1(buf) \ +t0 = _mm_unpackhi_epi32(m0,m1); \ +t1 = _mm_blend_epi16(t0,m3,0x0F); \ +buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1)); + +#define LOAD_MSG_7_2(buf) \ +t0 = _mm_blend_epi16(m2,m3,0x30); \ +t1 = _mm_srli_si128(m0,4); \ +t2 = _mm_blend_epi16(t0,t1,0x03); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3)); + +#define LOAD_MSG_7_3(buf) \ +t0 = _mm_unpackhi_epi64(m0,m3); \ +t1 = _mm_unpacklo_epi64(m1,m2); \ +t2 = _mm_blend_epi16(t0,t1,0x3C); \ +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1)); + +#define LOAD_MSG_7_4(buf) \ +t0 = _mm_unpacklo_epi32(m0,m1); \ +t1 = _mm_unpackhi_epi32(m1,m2); \ +buf = _mm_unpacklo_epi64(t0,t1); + +#define LOAD_MSG_8_1(buf) \ +t0 = _mm_unpackhi_epi32(m1,m3); \ +t1 = _mm_unpacklo_epi64(t0,m0); \ +t2 = _mm_blend_epi16(t1,m2,0xC0); \ +buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2)); + +#define LOAD_MSG_8_2(buf) \ +t0 = _mm_unpackhi_epi32(m0,m3); \ +t1 = _mm_blend_epi16(m2,t0,0xF0); \ +buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3)); + +#define LOAD_MSG_8_3(buf) \ +t0 = _mm_blend_epi16(m2,m0,0x0C); \ +t1 = _mm_slli_si128(t0,4); \ +buf = _mm_blend_epi16(t1,m3,0x0F); + +#define LOAD_MSG_8_4(buf) \ +t0 = _mm_blend_epi16(m1,m0,0x30); \ +buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2)); + +#define LOAD_MSG_9_1(buf) \ +t0 = _mm_blend_epi16(m0,m2,0x03); \ +t1 = _mm_blend_epi16(m1,m2,0x30); \ +t2 = _mm_blend_epi16(t1,t0,0x0F); \ +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2)); + +#define LOAD_MSG_9_2(buf) \ +t0 = _mm_slli_si128(m0,4); \ +t1 = _mm_blend_epi16(m1,t0,0xC0); \ +buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3)); + +#define LOAD_MSG_9_3(buf) \ +t0 = _mm_unpackhi_epi32(m0,m3); \ +t1 = _mm_unpacklo_epi32(m2,m3); \ +t2 = _mm_unpackhi_epi64(t0,t1); \ +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1)); + +#define LOAD_MSG_9_4(buf) \ +t0 = _mm_blend_epi16(m3,m2,0xC0); \ +t1 = _mm_unpacklo_epi32(m0,m3); \ +t2 = _mm_blend_epi16(t0,t1,0x0F); \ +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3)); + +#endif diff --git a/bundled/cbits/blake2/sse/blake2s-load-xop.h b/bundled/cbits/blake2/sse/blake2s-load-xop.h new file mode 100644 index 0000000..a97ddcc --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2s-load-xop.h @@ -0,0 +1,191 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2S_LOAD_XOP_H +#define BLAKE2S_LOAD_XOP_H + +#define TOB(x) ((x)*4*0x01010101 + 0x03020100) /* ..or not TOB */ + +#if 0 +/* Basic VPPERM emulation, for testing purposes */ +static __m128i _mm_perm_epi8(const __m128i src1, const __m128i src2, const __m128i sel) +{ + const __m128i sixteen = _mm_set1_epi8(16); + const __m128i t0 = _mm_shuffle_epi8(src1, sel); + const __m128i s1 = _mm_shuffle_epi8(src2, _mm_sub_epi8(sel, sixteen)); + const __m128i mask = _mm_or_si128(_mm_cmpeq_epi8(sel, sixteen), + _mm_cmpgt_epi8(sel, sixteen)); /* (>=16) = 0xff : 00 */ + return _mm_blendv_epi8(t0, s1, mask); +} +#endif + +#define LOAD_MSG_0_1(buf) \ +buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) ); + +#define LOAD_MSG_0_2(buf) \ +buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) ); + +#define LOAD_MSG_0_3(buf) \ +buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) ); + +#define LOAD_MSG_0_4(buf) \ +buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) ); + +#define LOAD_MSG_1_1(buf) \ +t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(5),TOB(0),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) ); + +#define LOAD_MSG_1_2(buf) \ +t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(2),TOB(0),TOB(4),TOB(6)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); + +#define LOAD_MSG_1_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(0),TOB(0),TOB(1)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); + +#define LOAD_MSG_1_4(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(7),TOB(2),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) ); + +#define LOAD_MSG_2_1(buf) \ +t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(1),TOB(0),TOB(7)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(4),TOB(0)) ); + +#define LOAD_MSG_2_2(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(2),TOB(0),TOB(4)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_2_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(7),TOB(3),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) ); + +#define LOAD_MSG_2_4(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(1),TOB(6),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) ); + +#define LOAD_MSG_3_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(3),TOB(7)) ); \ +t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(1),TOB(0)) ); + +#define LOAD_MSG_3_2(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(1),TOB(5)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(1),TOB(0)) ); + +#define LOAD_MSG_3_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(5),TOB(2)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_3_4(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \ +buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(6),TOB(0)) ); + +#define LOAD_MSG_4_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(5),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(5)) ); + +#define LOAD_MSG_4_2(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(7),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_4_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(6),TOB(0),TOB(0)) ); \ +t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) ); + +#define LOAD_MSG_4_4(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(4),TOB(0),TOB(1)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(4),TOB(0)) ); + +#define LOAD_MSG_5_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(2)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_5_2(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(6),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) ); + +#define LOAD_MSG_5_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(0),TOB(7),TOB(4)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); + +#define LOAD_MSG_5_4(buf) \ +t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(5),TOB(0),TOB(1),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(5)) ); + +#define LOAD_MSG_6_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(0),TOB(1),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(4)) ); + +#define LOAD_MSG_6_2(buf) \ +t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(6),TOB(0),TOB(0),TOB(1)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(7),TOB(0)) ); + +#define LOAD_MSG_6_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(5),TOB(1),TOB(0)) ); + +#define LOAD_MSG_6_4(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(3),TOB(7)) ); \ +buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_7_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(0),TOB(7),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(5)) ); + +#define LOAD_MSG_7_2(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(5),TOB(1),TOB(0),TOB(7)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) ); + +#define LOAD_MSG_7_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(2),TOB(0),TOB(0),TOB(5)) ); \ +t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); + +#define LOAD_MSG_7_4(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(6),TOB(4),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_8_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \ +t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) ); + +#define LOAD_MSG_8_2(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(4),TOB(3),TOB(5),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(7)) ); + +#define LOAD_MSG_8_3(buf) \ +t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(6),TOB(1),TOB(0),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(5),TOB(4)) ); \ + +#define LOAD_MSG_8_4(buf) \ +buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(4),TOB(7),TOB(2)) ); + +#define LOAD_MSG_9_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(7),TOB(0),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(4),TOB(6)) ); + +#define LOAD_MSG_9_2(buf) \ +buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(6),TOB(4),TOB(2)) ); + +#define LOAD_MSG_9_3(buf) \ +t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(3),TOB(5),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(7)) ); + +#define LOAD_MSG_9_4(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(7)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(6),TOB(0)) ); + +#endif diff --git a/bundled/cbits/blake2/sse/blake2s-round.h b/bundled/cbits/blake2/sse/blake2s-round.h new file mode 100644 index 0000000..44a5574 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2s-round.h @@ -0,0 +1,88 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ +#ifndef BLAKE2S_ROUND_H +#define BLAKE2S_ROUND_H + +#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) + +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +#define LIKELY(x) __builtin_expect((x),1) + + +/* Microarchitecture-specific macros */ +#ifndef HAVE_XOP +#ifdef HAVE_SSSE3 +#define _mm_roti_epi32(r, c) ( \ + (8==-(c)) ? _mm_shuffle_epi8(r,r8) \ + : (16==-(c)) ? _mm_shuffle_epi8(r,r16) \ + : _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) )) ) +#else +#define _mm_roti_epi32(r, c) _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) )) +#endif +#else +/* ... */ +#endif + + +#define G1(row1,row2,row3,row4,buf) \ + row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ + row4 = _mm_xor_si128( row4, row1 ); \ + row4 = _mm_roti_epi32(row4, -16); \ + row3 = _mm_add_epi32( row3, row4 ); \ + row2 = _mm_xor_si128( row2, row3 ); \ + row2 = _mm_roti_epi32(row2, -12); + +#define G2(row1,row2,row3,row4,buf) \ + row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ + row4 = _mm_xor_si128( row4, row1 ); \ + row4 = _mm_roti_epi32(row4, -8); \ + row3 = _mm_add_epi32( row3, row4 ); \ + row2 = _mm_xor_si128( row2, row3 ); \ + row2 = _mm_roti_epi32(row2, -7); + +#define DIAGONALIZE(row1,row2,row3,row4) \ + row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \ + row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ + row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) ); + +#define UNDIAGONALIZE(row1,row2,row3,row4) \ + row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \ + row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ + row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) ); + +#if defined(HAVE_XOP) +#include "blake2s-load-xop.h" +#elif defined(HAVE_SSE41) +#include "blake2s-load-sse41.h" +#else +#include "blake2s-load-sse2.h" +#endif + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(buf1); \ + G1(row1,row2,row3,row4,buf1); \ + LOAD_MSG_ ##r ##_2(buf2); \ + G2(row1,row2,row3,row4,buf2); \ + DIAGONALIZE(row1,row2,row3,row4); \ + LOAD_MSG_ ##r ##_3(buf3); \ + G1(row1,row2,row3,row4,buf3); \ + LOAD_MSG_ ##r ##_4(buf4); \ + G2(row1,row2,row3,row4,buf4); \ + UNDIAGONALIZE(row1,row2,row3,row4); \ + +#endif diff --git a/bundled/cbits/blake2/sse/blake2s.c b/bundled/cbits/blake2/sse/blake2s.c new file mode 100644 index 0000000..b983465 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2s.c @@ -0,0 +1,363 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ + +#include +#include +#include + +#include "blake2.h" +#include "blake2-impl.h" + +#include "blake2-config.h" + + +#include +#if defined(HAVE_SSSE3) +#include +#endif +#if defined(HAVE_SSE41) +#include +#endif +#if defined(HAVE_AVX) +#include +#endif +#if defined(HAVE_XOP) +#include +#endif + +#include "blake2s-round.h" + +static const uint32_t blake2s_IV[8] = +{ + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL +}; + +/* Some helper functions */ +static void blake2s_set_lastnode( blake2s_state *S ) +{ + S->f[1] = (uint32_t)-1; +} + +static int blake2s_is_lastblock( const blake2s_state *S ) +{ + return S->f[0] != 0; +} + +static void blake2s_set_lastblock( blake2s_state *S ) +{ + if( S->last_node ) blake2s_set_lastnode( S ); + + S->f[0] = (uint32_t)-1; +} + +static void blake2s_increment_counter( blake2s_state *S, const uint32_t inc ) +{ + uint64_t t = ( ( uint64_t )S->t[1] << 32 ) | S->t[0]; + t += inc; + S->t[0] = ( uint32_t )( t >> 0 ); + S->t[1] = ( uint32_t )( t >> 32 ); +} + +/* init2 xors IV with input parameter block */ +int _cryptonite_blake2s_init_param( blake2s_state *S, const blake2s_param *P ) +{ + size_t i; + /*blake2s_init0( S ); */ + const uint8_t * v = ( const uint8_t * )( blake2s_IV ); + const uint8_t * p = ( const uint8_t * )( P ); + uint8_t * h = ( uint8_t * )( S->h ); + /* IV XOR ParamBlock */ + memset( S, 0, sizeof( blake2s_state ) ); + + for( i = 0; i < BLAKE2S_OUTBYTES; ++i ) h[i] = v[i] ^ p[i]; + + S->outlen = P->digest_length; + return 0; +} + + +/* Some sort of default parameter block initialization, for sequential blake2s */ +int _cryptonite_blake2s_init( blake2s_state *S, size_t outlen ) +{ + blake2s_param P[1]; + + /* Move interval verification here? */ + if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; + + P->digest_length = (uint8_t)outlen; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + store32( &P->node_offset, 0 ); + store16( &P->xof_length, 0 ); + P->node_depth = 0; + P->inner_length = 0; + /* memset(P->reserved, 0, sizeof(P->reserved) ); */ + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + + return _cryptonite_blake2s_init_param( S, P ); +} + + +int _cryptonite_blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen ) +{ + blake2s_param P[1]; + + /* Move interval verification here? */ + if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; + + if ( ( !key ) || ( !keylen ) || keylen > BLAKE2S_KEYBYTES ) return -1; + + P->digest_length = (uint8_t)outlen; + P->key_length = (uint8_t)keylen; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + store32( &P->node_offset, 0 ); + store16( &P->xof_length, 0 ); + P->node_depth = 0; + P->inner_length = 0; + /* memset(P->reserved, 0, sizeof(P->reserved) ); */ + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + + if( _cryptonite_blake2s_init_param( S, P ) < 0 ) + return -1; + + { + uint8_t block[BLAKE2S_BLOCKBYTES]; + memset( block, 0, BLAKE2S_BLOCKBYTES ); + memcpy( block, key, keylen ); + _cryptonite_blake2s_update( S, block, BLAKE2S_BLOCKBYTES ); + secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */ + } + return 0; +} + + +static void blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ) +{ + __m128i row1, row2, row3, row4; + __m128i buf1, buf2, buf3, buf4; +#if defined(HAVE_SSE41) + __m128i t0, t1; +#if !defined(HAVE_XOP) + __m128i t2; +#endif +#endif + __m128i ff0, ff1; +#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) + const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 ); + const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 ); +#endif +#if defined(HAVE_SSE41) + const __m128i m0 = LOADU( block + 00 ); + const __m128i m1 = LOADU( block + 16 ); + const __m128i m2 = LOADU( block + 32 ); + const __m128i m3 = LOADU( block + 48 ); +#else + const uint32_t m0 = load32(block + 0 * sizeof(uint32_t)); + const uint32_t m1 = load32(block + 1 * sizeof(uint32_t)); + const uint32_t m2 = load32(block + 2 * sizeof(uint32_t)); + const uint32_t m3 = load32(block + 3 * sizeof(uint32_t)); + const uint32_t m4 = load32(block + 4 * sizeof(uint32_t)); + const uint32_t m5 = load32(block + 5 * sizeof(uint32_t)); + const uint32_t m6 = load32(block + 6 * sizeof(uint32_t)); + const uint32_t m7 = load32(block + 7 * sizeof(uint32_t)); + const uint32_t m8 = load32(block + 8 * sizeof(uint32_t)); + const uint32_t m9 = load32(block + 9 * sizeof(uint32_t)); + const uint32_t m10 = load32(block + 10 * sizeof(uint32_t)); + const uint32_t m11 = load32(block + 11 * sizeof(uint32_t)); + const uint32_t m12 = load32(block + 12 * sizeof(uint32_t)); + const uint32_t m13 = load32(block + 13 * sizeof(uint32_t)); + const uint32_t m14 = load32(block + 14 * sizeof(uint32_t)); + const uint32_t m15 = load32(block + 15 * sizeof(uint32_t)); +#endif + row1 = ff0 = LOADU( &S->h[0] ); + row2 = ff1 = LOADU( &S->h[4] ); + row3 = _mm_loadu_si128( (__m128i const *)&blake2s_IV[0] ); + row4 = _mm_xor_si128( _mm_loadu_si128( (__m128i const *)&blake2s_IV[4] ), LOADU( &S->t[0] ) ); + ROUND( 0 ); + ROUND( 1 ); + ROUND( 2 ); + ROUND( 3 ); + ROUND( 4 ); + ROUND( 5 ); + ROUND( 6 ); + ROUND( 7 ); + ROUND( 8 ); + ROUND( 9 ); + STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) ); + STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) ); +} + +int _cryptonite_blake2s_update( blake2s_state *S, const void *pin, size_t inlen ) +{ + const unsigned char * in = (const unsigned char *)pin; + if( inlen > 0 ) + { + size_t left = S->buflen; + size_t fill = BLAKE2S_BLOCKBYTES - left; + if( inlen > fill ) + { + S->buflen = 0; + memcpy( S->buf + left, in, fill ); /* Fill buffer */ + blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); + blake2s_compress( S, S->buf ); /* Compress */ + in += fill; inlen -= fill; + while(inlen > BLAKE2S_BLOCKBYTES) { + blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES); + blake2s_compress( S, in ); + in += BLAKE2S_BLOCKBYTES; + inlen -= BLAKE2S_BLOCKBYTES; + } + } + memcpy( S->buf + S->buflen, in, inlen ); + S->buflen += inlen; + } + return 0; +} + +int _cryptonite_blake2s_final( blake2s_state *S, void *out, size_t outlen ) +{ + uint8_t buffer[BLAKE2S_OUTBYTES] = {0}; + size_t i; + + if( out == NULL || outlen < S->outlen ) + return -1; + + if( blake2s_is_lastblock( S ) ) + return -1; + + blake2s_increment_counter( S, (uint32_t)S->buflen ); + blake2s_set_lastblock( S ); + memset( S->buf + S->buflen, 0, BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */ + blake2s_compress( S, S->buf ); + + for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ + store32( buffer + sizeof( S->h[i] ) * i, S->h[i] ); + + memcpy( out, buffer, S->outlen ); + secure_zero_memory( buffer, sizeof(buffer) ); + return 0; +} + +/* inlen, at least, should be uint64_t. Others can be size_t. */ +int blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) +{ + blake2s_state S[1]; + + /* Verify parameters */ + if ( NULL == in && inlen > 0 ) return -1; + + if ( NULL == out ) return -1; + + if ( NULL == key && keylen > 0) return -1; + + if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1; + + if( keylen > BLAKE2S_KEYBYTES ) return -1; + + if( keylen > 0 ) + { + if( _cryptonite_blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1; + } + else + { + if( _cryptonite_blake2s_init( S, outlen ) < 0 ) return -1; + } + + _cryptonite_blake2s_update( S, ( const uint8_t * )in, inlen ); + _cryptonite_blake2s_final( S, out, outlen ); + return 0; +} + +#if defined(SUPERCOP) +int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen ) +{ + return blake2s( out, BLAKE2S_OUTBYTES, in, inlen, NULL, 0 ); +} +#endif + +#if defined(BLAKE2S_SELFTEST) +#include +#include "blake2-kat.h" +int main( void ) +{ + uint8_t key[BLAKE2S_KEYBYTES]; + uint8_t buf[BLAKE2_KAT_LENGTH]; + size_t i, step; + + for( i = 0; i < BLAKE2S_KEYBYTES; ++i ) + key[i] = ( uint8_t )i; + + for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) + buf[i] = ( uint8_t )i; + + /* Test simple API */ + for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) + { + uint8_t hash[BLAKE2S_OUTBYTES]; + blake2s( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES ); + + if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) ) + { + goto fail; + } + } + + /* Test streaming API */ + for(step = 1; step < BLAKE2S_BLOCKBYTES; ++step) { + for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) { + uint8_t hash[BLAKE2S_OUTBYTES]; + blake2s_state S; + uint8_t * p = buf; + size_t mlen = i; + int err = 0; + + if( (err = _cryptonite_blake2s_init_key(&S, BLAKE2S_OUTBYTES, key, BLAKE2S_KEYBYTES)) < 0 ) { + goto fail; + } + + while (mlen >= step) { + if ( (err = _cryptonite_blake2s_update(&S, p, step)) < 0 ) { + goto fail; + } + mlen -= step; + p += step; + } + if ( (err = _cryptonite_blake2s_update(&S, p, mlen)) < 0) { + goto fail; + } + if ( (err = _cryptonite_blake2s_final(&S, hash, BLAKE2S_OUTBYTES)) < 0) { + goto fail; + } + + if (0 != memcmp(hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES)) { + goto fail; + } + } + } + + puts( "ok" ); + return 0; +fail: + puts("error"); + return -1; +} +#endif diff --git a/bundled/cbits/blake2/sse/blake2sp.c b/bundled/cbits/blake2/sse/blake2sp.c new file mode 100644 index 0000000..550c313 --- /dev/null +++ b/bundled/cbits/blake2/sse/blake2sp.c @@ -0,0 +1,358 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ + +#include +#include +#include + +#if defined(_OPENMP) +#include +#endif + +#include "blake2.h" +#include "blake2-impl.h" + +#define PARALLELISM_DEGREE 8 + +/* + blake2sp_init_param defaults to setting the expecting output length + from the digest_length parameter block field. + + In some cases, however, we do not want this, as the output length + of these instances is given by inner_length instead. +*/ +static int blake2sp_init_leaf_param( blake2s_state *S, const blake2s_param *P ) +{ + int err = _cryptonite_blake2s_init_param(S, P); + S->outlen = P->inner_length; + return err; +} + +static int blake2sp_init_leaf( blake2s_state *S, size_t outlen, size_t keylen, uint64_t offset ) +{ + blake2s_param P[1]; + P->digest_length = (uint8_t)outlen; + P->key_length = (uint8_t)keylen; + P->fanout = PARALLELISM_DEGREE; + P->depth = 2; + P->leaf_length = 0; + P->node_offset = offset; + P->xof_length = 0; + P->node_depth = 0; + P->inner_length = BLAKE2S_OUTBYTES; + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + return blake2sp_init_leaf_param( S, P ); +} + +static int blake2sp_init_root( blake2s_state *S, size_t outlen, size_t keylen ) +{ + blake2s_param P[1]; + P->digest_length = (uint8_t)outlen; + P->key_length = (uint8_t)keylen; + P->fanout = PARALLELISM_DEGREE; + P->depth = 2; + P->leaf_length = 0; + P->node_offset = 0; + P->xof_length = 0; + P->node_depth = 1; + P->inner_length = BLAKE2S_OUTBYTES; + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + return _cryptonite_blake2s_init_param( S, P ); +} + + +int _cryptonite_blake2sp_init( blake2sp_state *S, size_t outlen ) +{ + size_t i; + + if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1; + + memset( S->buf, 0, sizeof( S->buf ) ); + S->buflen = 0; + S->outlen = outlen; + + if( blake2sp_init_root( S->R, outlen, 0 ) < 0 ) + return -1; + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + if( blake2sp_init_leaf( S->S[i], outlen, 0, i ) < 0 ) return -1; + + S->R->last_node = 1; + S->S[PARALLELISM_DEGREE - 1]->last_node = 1; + return 0; +} + +int _cryptonite_blake2sp_init_key( blake2sp_state *S, size_t outlen, const void *key, size_t keylen ) +{ + size_t i; + + if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1; + + if( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1; + + memset( S->buf, 0, sizeof( S->buf ) ); + S->buflen = 0; + S->outlen = outlen; + + if( blake2sp_init_root( S->R, outlen, keylen ) < 0 ) + return -1; + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + if( blake2sp_init_leaf( S->S[i], outlen, keylen, i ) < 0 ) return -1; + + S->R->last_node = 1; + S->S[PARALLELISM_DEGREE - 1]->last_node = 1; + { + uint8_t block[BLAKE2S_BLOCKBYTES]; + memset( block, 0, BLAKE2S_BLOCKBYTES ); + memcpy( block, key, keylen ); + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + _cryptonite_blake2s_update( S->S[i], block, BLAKE2S_BLOCKBYTES ); + + secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */ + } + return 0; +} + + +int _cryptonite_blake2sp_update( blake2sp_state *S, const void *pin, size_t inlen ) +{ + const unsigned char * in = (const unsigned char *)pin; + size_t left = S->buflen; + size_t fill = sizeof( S->buf ) - left; + size_t i; + + if( left && inlen >= fill ) + { + memcpy( S->buf + left, in, fill ); + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + _cryptonite_blake2s_update( S->S[i], S->buf + i * BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); + + in += fill; + inlen -= fill; + left = 0; + } + +#if defined(_OPENMP) + #pragma omp parallel shared(S), num_threads(PARALLELISM_DEGREE) +#else + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) +#endif + { +#if defined(_OPENMP) + size_t i = omp_get_thread_num(); +#endif + size_t inlen__ = inlen; + const unsigned char *in__ = ( const unsigned char * )in; + in__ += i * BLAKE2S_BLOCKBYTES; + + while( inlen__ >= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES ) + { + _cryptonite_blake2s_update( S->S[i], in__, BLAKE2S_BLOCKBYTES ); + in__ += PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES; + inlen__ -= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES; + } + } + + in += inlen - inlen % ( PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES ); + inlen %= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES; + + if( inlen > 0 ) + memcpy( S->buf + left, in, inlen ); + + S->buflen = left + inlen; + return 0; +} + + +int _cryptonite_blake2sp_final( blake2sp_state *S, void *out, size_t outlen ) +{ + uint8_t hash[PARALLELISM_DEGREE][BLAKE2S_OUTBYTES]; + size_t i; + + if(out == NULL || outlen < S->outlen) { + return -1; + } + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + { + if( S->buflen > i * BLAKE2S_BLOCKBYTES ) + { + size_t left = S->buflen - i * BLAKE2S_BLOCKBYTES; + + if( left > BLAKE2S_BLOCKBYTES ) left = BLAKE2S_BLOCKBYTES; + + _cryptonite_blake2s_update( S->S[i], S->buf + i * BLAKE2S_BLOCKBYTES, left ); + } + + _cryptonite_blake2s_final( S->S[i], hash[i], BLAKE2S_OUTBYTES ); + } + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + _cryptonite_blake2s_update( S->R, hash[i], BLAKE2S_OUTBYTES ); + + return _cryptonite_blake2s_final( S->R, out, S->outlen ); +} + + +int _cryptonite_blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) +{ + uint8_t hash[PARALLELISM_DEGREE][BLAKE2S_OUTBYTES]; + blake2s_state S[PARALLELISM_DEGREE][1]; + blake2s_state FS[1]; + size_t i; + + /* Verify parameters */ + if ( NULL == in && inlen > 0 ) return -1; + + if ( NULL == out ) return -1; + + if ( NULL == key && keylen > 0) return -1; + + if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1; + + if( keylen > BLAKE2S_KEYBYTES ) return -1; + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + if( blake2sp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1; + + S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */ + + if( keylen > 0 ) + { + uint8_t block[BLAKE2S_BLOCKBYTES]; + memset( block, 0, BLAKE2S_BLOCKBYTES ); + memcpy( block, key, keylen ); + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + _cryptonite_blake2s_update( S[i], block, BLAKE2S_BLOCKBYTES ); + + secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */ + } + +#if defined(_OPENMP) + #pragma omp parallel shared(S,hash), num_threads(PARALLELISM_DEGREE) +#else + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) +#endif + { +#if defined(_OPENMP) + size_t i = omp_get_thread_num(); +#endif + size_t inlen__ = inlen; + const unsigned char *in__ = ( const unsigned char * )in; + in__ += i * BLAKE2S_BLOCKBYTES; + + while( inlen__ >= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES ) + { + _cryptonite_blake2s_update( S[i], in__, BLAKE2S_BLOCKBYTES ); + in__ += PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES; + inlen__ -= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES; + } + + if( inlen__ > i * BLAKE2S_BLOCKBYTES ) + { + const size_t left = inlen__ - i * BLAKE2S_BLOCKBYTES; + const size_t len = left <= BLAKE2S_BLOCKBYTES ? left : BLAKE2S_BLOCKBYTES; + _cryptonite_blake2s_update( S[i], in__, len ); + } + + _cryptonite_blake2s_final( S[i], hash[i], BLAKE2S_OUTBYTES ); + } + + if( blake2sp_init_root( FS, outlen, keylen ) < 0 ) + return -1; + + FS->last_node = 1; + + for( i = 0; i < PARALLELISM_DEGREE; ++i ) + _cryptonite_blake2s_update( FS, hash[i], BLAKE2S_OUTBYTES ); + + return _cryptonite_blake2s_final( FS, out, outlen ); +} + +#if defined(BLAKE2SP_SELFTEST) +#include +#include "blake2-kat.h" +int main( void ) +{ + uint8_t key[BLAKE2S_KEYBYTES]; + uint8_t buf[BLAKE2_KAT_LENGTH]; + size_t i, step; + + for( i = 0; i < BLAKE2S_KEYBYTES; ++i ) + key[i] = ( uint8_t )i; + + for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) + buf[i] = ( uint8_t )i; + + /* Test simple API */ + for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) + { + uint8_t hash[BLAKE2S_OUTBYTES]; + _cryptonite_blake2sp( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES ); + + if( 0 != memcmp( hash, blake2sp_keyed_kat[i], BLAKE2S_OUTBYTES ) ) + { + goto fail; + } + } + + /* Test streaming API */ + for(step = 1; step < BLAKE2S_BLOCKBYTES; ++step) { + for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) { + uint8_t hash[BLAKE2S_OUTBYTES]; + blake2sp_state S; + uint8_t * p = buf; + size_t mlen = i; + int err = 0; + + if( (err = _cryptonite_blake2sp_init_key(&S, BLAKE2S_OUTBYTES, key, BLAKE2S_KEYBYTES)) < 0 ) { + goto fail; + } + + while (mlen >= step) { + if ( (err = _cryptonite_blake2sp_update(&S, p, step)) < 0 ) { + goto fail; + } + mlen -= step; + p += step; + } + if ( (err = _cryptonite_blake2sp_update(&S, p, mlen)) < 0) { + goto fail; + } + if ( (err = _cryptonite_blake2sp_final(&S, hash, BLAKE2S_OUTBYTES)) < 0) { + goto fail; + } + + if (0 != memcmp(hash, blake2sp_keyed_kat[i], BLAKE2S_OUTBYTES)) { + goto fail; + } + } + } + + puts( "ok" ); + return 0; +fail: + puts("error"); + return -1; +} +#endif diff --git a/src/Encryption.hs b/src/Encryption.hs new file mode 100644 index 0000000..106f4e7 --- /dev/null +++ b/src/Encryption.hs @@ -0,0 +1,74 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Encryption where + +import Control.Monad +import Crypto.Error +import Crypto.KDF.Argon2 +import Crypto.Random.Entropy +import Data.Binary.Get (getWord32be, runGet) +import Data.Binary.Put (putWord32be, runPut) +import Data.Bits (xor) +import Data.ByteString (ByteString) +import qualified Data.ByteString as B +import qualified Data.ByteString.Char8 as BC8 +import qualified Data.ByteString.Lazy as BL +import Data.Word (Word32) +import System.CPUTime +import System.IO + +-- picoseconds +measureIteration = do + start <- getCPUTime + throwaway <- throwCryptoErrorIO $ hash defaultOptions ("666" :: ByteString) ("solasgsdgdsgsd" :: ByteString) 32 :: IO ByteString + end <- seq throwaway getCPUTime + pure $ fromIntegral (end - start) / 1e12 + +getPassword = do + tty <- openFile "/dev/tty" ReadWriteMode + BC8.hPutStr tty "Password: " + hFlush tty + old <- hGetEcho tty + hSetEcho tty False + response <- BC8.hGetLine tty + hSetEcho tty old + BC8.hPutStrLn tty "" + hClose tty + pure response + +xorWithKey :: ByteString -> ByteString -> ByteString +xorWithKey key bs + | B.null key = error "key must not be empty" + | otherwise = B.pack $ B.zipWith xor bs key + +-- format: E[16-byte salt][4-byte big-endian iterations][ciphertext] +encryptData :: ByteString -> IO ByteString +encryptData privkey = do + iterationTime <- measureIteration + print iterationTime + let iterations = ceiling $ 1 / iterationTime + print iterations + salt <- getEntropy 16 :: IO ByteString + --let password = "password" :: ByteString + password <- getPassword + key <- throwCryptoErrorIO $ hash defaultOptions { iterations = iterations } password salt 56 :: IO ByteString + let ciphertext = xorWithKey key privkey + -- encode iterations as Word32 big-endian + let iterWord :: Word32 + iterWord = fromIntegral iterations + iterBytes = BL.toStrict $ runPut (putWord32be iterWord) + pure $ B.concat ["E", salt, iterBytes, ciphertext] + +decryptData :: ByteString -> IO ByteString +decryptData contents = do + password <- getPassword + -- parse salt (16 bytes), iterations (4 bytes BE), then ciphertext + when (B.length contents < 76) $ error "encrypted key is unexpectedly short" + let (salt, rest) = B.splitAt 16 contents + (iterBytes, ciphertext) = B.splitAt 4 rest + iterations = runGet getWord32be (BL.fromStrict iterBytes) + key <- throwCryptoErrorIO $ hash defaultOptions { iterations = iterations } password salt 56 :: IO ByteString + let plaintext = xorWithKey key ciphertext + case B.head plaintext of + 0x53 -> pure plaintext + _ -> error "wrong password" diff --git a/src/Main.hs b/src/Main.hs index 7f24263..4848314 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -7,16 +7,30 @@ import qualified Data.ByteString as B import qualified Data.ByteString.Base64 as B64 import qualified Data.ByteString.Char8 as BC8 import qualified Data.Text as T +import qualified Data.Text.Encoding as T import qualified Data.Text.IO as T import Network.ONCRPC.XDR.Serial import Network.Stellar.TransactionXdr import qualified Stellar.Simple as S import System.Directory +import System.Environment import System.IO +import Encryption import Pretty main = do + args <- getArgs + case args of + ["encrypt"] -> encrypt + _ -> sign + +encrypt = do + encryptedFile <- T.encodeUtf8 <$> getPrivateKey + path <- getPrivateKeyPath + B.writeFile path =<< encryptData encryptedFile + +sign = do transactionB64 <- B.getContents let transaction = case B64.decode $ BC8.strip transactionB64 of Left err -> error err @@ -31,10 +45,17 @@ main = do let sd = S.signWithSecret key parsedTransaction T.putStrLn $ S.xdrSerializeBase64T sd -getPrivateKey = do +getPrivateKeyPath :: IO FilePath +getPrivateKeyPath = do home <- getHomeDirectory - keyFile <- T.readFile $ home ++ "/.stellar-veritas-key" - pure $ T.strip keyFile + pure $ home ++ "/.stellar-veritas-key" + +getPrivateKey = do + file <- B.readFile =<< getPrivateKeyPath + case B.head file of + 0x53 -> pure $ T.strip $ T.decodeUtf8 file -- bare unencrypted secret key + 0x45 -> T.decodeUtf8 <$> decryptData (B.tail file) + _ -> error "garbled private key file" confirm :: IO Bool confirm = do diff --git a/stellar-veritas.cabal b/stellar-veritas.cabal index df90513..6e0af8f 100644 --- a/stellar-veritas.cabal +++ b/stellar-veritas.cabal @@ -13,6 +13,7 @@ tested-with: GHC == 9.12.3 executable stellar-veritas main-is: Main.hs + CPP-options: -DWITH_BYTESTRING_SUPPORT ghc-options: -W -Wcompat -Wno-missing-home-modules -fno-warn-tabs -rtsopts=ignoreAll -optl-Wl,-s other-modules: build-depends: base >= 4.9 && < 5, @@ -46,4 +47,12 @@ executable stellar-veritas TypeOperators UnliftedFFITypes c-sources: bundled/cbits/ref10/ed25519.c - include-dirs: bundled/cbits/ref10 bundled/cbits/ref10/include + bundled/cbits/argon2/argon2.c + bundled/cbits/blake2/sse/blake2s.c + bundled/cbits/blake2/sse/blake2sp.c + bundled/cbits/blake2/sse/blake2b.c + bundled/cbits/blake2/sse/blake2bp.c + include-dirs: bundled/cbits/ref10 + bundled/cbits/ref10/include + bundled/cbits/argon2 + bundled/cbits/blake2/sse