From 559900fd3ac0135ab0e5f453fe4ba4b02d66abe1 Mon Sep 17 00:00:00 2001
From: La Ancapo <>
Date: Mon, 26 Jan 2026 23:09:58 +0100
Subject: [PATCH] basic key encryption

---
 README.md                                     |  14 +
 bundled/Crypto/KDF/Argon2.hs                  | 157 ++++
 bundled/cbits/argon2/argon2.c                 | 142 ++++
 bundled/cbits/argon2/argon2.h                 | 267 +++++++
 bundled/cbits/argon2/blamka-round-opt.h       | 180 +++++
 bundled/cbits/argon2/blamka-round-ref.h       |  56 ++
 bundled/cbits/argon2/core.c                   | 701 ++++++++++++++++++
 bundled/cbits/argon2/core.h                   | 234 ++++++
 bundled/cbits/argon2/opt.c                    | 186 +++++
 bundled/cbits/argon2/opt.h                    |  35 +
 bundled/cbits/argon2/ref.c                    | 185 +++++
 bundled/cbits/argon2/ref.h                    |  35 +
 bundled/cbits/argon2/thread.c                 |  57 ++
 bundled/cbits/argon2/thread.h                 |  67 ++
 bundled/cbits/blake2/sse/blake2-config.h      |  72 ++
 bundled/cbits/blake2/sse/blake2-impl.h        | 160 ++++
 bundled/cbits/blake2/sse/blake2.h             | 195 +++++
 bundled/cbits/blake2/sse/blake2b-load-sse2.h  |  68 ++
 bundled/cbits/blake2/sse/blake2b-load-sse41.h | 402 ++++++++++
 bundled/cbits/blake2/sse/blake2b-round.h      | 157 ++++
 bundled/cbits/blake2/sse/blake2b.c            | 373 ++++++++++
 bundled/cbits/blake2/sse/blake2bp.c           | 361 +++++++++
 bundled/cbits/blake2/sse/blake2s-load-sse2.h  |  60 ++
 bundled/cbits/blake2/sse/blake2s-load-sse41.h | 229 ++++++
 bundled/cbits/blake2/sse/blake2s-load-xop.h   | 191 +++++
 bundled/cbits/blake2/sse/blake2s-round.h      |  88 +++
 bundled/cbits/blake2/sse/blake2s.c            | 363 +++++++++
 bundled/cbits/blake2/sse/blake2sp.c           | 358 +++++++++
 src/Encryption.hs                             |  74 ++
 src/Main.hs                                   |  27 +-
 stellar-veritas.cabal                         |  11 +-
 31 files changed, 5501 insertions(+), 4 deletions(-)
 create mode 100644 bundled/Crypto/KDF/Argon2.hs
 create mode 100644 bundled/cbits/argon2/argon2.c
 create mode 100644 bundled/cbits/argon2/argon2.h
 create mode 100644 bundled/cbits/argon2/blamka-round-opt.h
 create mode 100644 bundled/cbits/argon2/blamka-round-ref.h
 create mode 100644 bundled/cbits/argon2/core.c
 create mode 100644 bundled/cbits/argon2/core.h
 create mode 100644 bundled/cbits/argon2/opt.c
 create mode 100644 bundled/cbits/argon2/opt.h
 create mode 100644 bundled/cbits/argon2/ref.c
 create mode 100644 bundled/cbits/argon2/ref.h
 create mode 100644 bundled/cbits/argon2/thread.c
 create mode 100644 bundled/cbits/argon2/thread.h
 create mode 100644 bundled/cbits/blake2/sse/blake2-config.h
 create mode 100644 bundled/cbits/blake2/sse/blake2-impl.h
 create mode 100644 bundled/cbits/blake2/sse/blake2.h
 create mode 100644 bundled/cbits/blake2/sse/blake2b-load-sse2.h
 create mode 100644 bundled/cbits/blake2/sse/blake2b-load-sse41.h
 create mode 100644 bundled/cbits/blake2/sse/blake2b-round.h
 create mode 100644 bundled/cbits/blake2/sse/blake2b.c
 create mode 100644 bundled/cbits/blake2/sse/blake2bp.c
 create mode 100644 bundled/cbits/blake2/sse/blake2s-load-sse2.h
 create mode 100644 bundled/cbits/blake2/sse/blake2s-load-sse41.h
 create mode 100644 bundled/cbits/blake2/sse/blake2s-load-xop.h
 create mode 100644 bundled/cbits/blake2/sse/blake2s-round.h
 create mode 100644 bundled/cbits/blake2/sse/blake2s.c
 create mode 100644 bundled/cbits/blake2/sse/blake2sp.c
 create mode 100644 src/Encryption.hs

diff --git a/README.md b/README.md
index 8492470..cb693ba 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,20 @@
 
 The aim is to create a trustworthy Stellar transaction signer (and, by necessity, a pretty printer) using only Glasgow Haskell compiler code and Haskell Core libraries, reducing the possible supply chain attack surface.
 
+## Installation and usage
+
 To build and run it, install `cabal-install` and use `cabal run`. The program expects your transaction coming from the standard input, and your private (S...) key residing in `$HOME/~/.stellar-veritas-key`. It will produce the decoded transaction description afterwards, with simple descriptions of the most popular operations. When it is ran from an interactive terminal, it will ask for a confirmation before signing the transaction.
 
+You can put the binary (`cabal list-bin stellar-veritas` to get the path) into your `$PATH` and use it directly.
+
+## Encryption
+
+The project supports key encryption. Encrypting your key protects you from key file theft (cracking a 8-character password is estimated to cost over $1M (in 2026), scales with password complexity and the performance of the computer at the moment of encryption).
+
+To encrypt your key, run `cabal run stellar-veritas -- encrypt` (or just `stellar-veritas encrypt` after installation) and enter your password. Afterwards you'll be prompted your password every time you want to sign anything. `encrypt`ing an already-encrpyted key would result in decryption and re-encryption with new parameters and salt.
+
+## Technical details
+
 The project contains the code of trimmed-down non-core dependencies, mainly cryptographic libraries. To avoid using bundled libraries (to build against the current Hackage), do the same in the `src` directory. To further reduce the amount of code under audit, weeder can be used, although the utility is dubious.
+
+Key encryption is implemented via XOR with Argon2i KDF with an adaptive iteration count.
diff --git a/bundled/Crypto/KDF/Argon2.hs b/bundled/Crypto/KDF/Argon2.hs
new file mode 100644
index 0000000..044ba00
--- /dev/null
+++ b/bundled/Crypto/KDF/Argon2.hs
@@ -0,0 +1,157 @@
+-- |
+-- Module      : Crypto.KDF.Argon2
+-- License     : BSD-style
+-- Maintainer  : Vincent Hanquez <vincent@snarc.org>
+-- Stability   : experimental
+-- Portability : unknown
+--
+-- Argon2 hashing function (P-H-C winner)
+--
+-- Recommended to use this module qualified
+--
+-- File started from Argon2.hs, from Oliver Charles
+-- at https://github.com/ocharles/argon2
+--
+module Crypto.KDF.Argon2
+    (
+      Options(..)
+    , TimeCost
+    , MemoryCost
+    , Parallelism
+    , Variant(..)
+    , Version(..)
+    , defaultOptions
+    -- * Hashing function
+    , hash
+    ) where
+
+import           Crypto.Internal.ByteArray (ByteArray, ByteArrayAccess)
+import qualified Crypto.Internal.ByteArray as B
+import           Crypto.Error
+import           Control.Monad (when)
+import           Data.Word
+import           Foreign.C
+import           Foreign.Ptr
+
+-- | Which variant of Argon2 to use. You should choose the variant that is most
+-- applicable to your intention to hash inputs.
+data Variant =
+      Argon2d  -- ^ Argon2d is faster than Argon2i and uses data-depending memory access,
+               -- which makes it suitable for cryptocurrencies and applications with no
+               -- threats from side-channel timing attacks.
+    | Argon2i  -- ^ Argon2i uses data-independent memory access, which is preferred
+               -- for password hashing and password-based key derivation. Argon2i
+               -- is slower as it makes more passes over the memory to protect from
+               -- tradeoff attacks.
+    | Argon2id -- ^ Argon2id is a hybrid of Argon2i and Argon2d, using a combination
+               -- of data-depending and data-independent memory accesses, which gives
+               -- some of Argon2i's resistance to side-channel cache timing attacks
+               -- and much of Argon2d's resistance to GPU cracking attacks
+    deriving (Eq,Ord,Read,Show,Enum,Bounded)
+
+-- | Which version of Argon2 to use
+data Version = Version10 | Version13
+    deriving (Eq,Ord,Read,Show,Enum,Bounded)
+
+-- | The time cost, which defines the amount of computation realized and therefore the execution time, given in number of iterations.
+--
+-- 'FFI.ARGON2_MIN_TIME' <= 'hashIterations' <= 'FFI.ARGON2_MAX_TIME'
+type TimeCost = Word32
+
+-- | The memory cost, which defines the memory usage, given in kibibytes.
+--
+-- max 'FFI.ARGON2_MIN_MEMORY' (8 * 'hashParallelism') <= 'hashMemory' <= 'FFI.ARGON2_MAX_MEMORY'
+type MemoryCost = Word32
+
+-- | A parallelism degree, which defines the number of parallel threads.
+--
+-- 'FFI.ARGON2_MIN_LANES' <= 'hashParallelism' <= 'FFI.ARGON2_MAX_LANES' && 'FFI.ARGON_MIN_THREADS' <= 'hashParallelism' <= 'FFI.ARGON2_MAX_THREADS'
+type Parallelism = Word32
+
+-- | Parameters that can be adjusted to change the runtime performance of the
+-- hashing.
+data Options = Options
+    { iterations  :: !TimeCost
+    , memory      :: !MemoryCost
+    , parallelism :: !Parallelism
+    , variant     :: !Variant     -- ^ Which variant of Argon2 to use.
+    , version     :: !Version     -- ^ Which version of Argon2 to use.
+    }
+    deriving (Eq,Ord,Read,Show)
+
+saltMinLength :: Int
+saltMinLength = 8
+
+outputMinLength :: Int
+outputMinLength = 4
+
+-- specification allows up to 2^32-1 but this is too big for a signed Int
+-- on a 32-bit architecture, so we limit tag length to 2^31-1 bytes
+outputMaxLength :: Int
+outputMaxLength = 0x7fffffff
+
+defaultOptions :: Options
+defaultOptions =
+    Options { iterations  = 1
+            , memory      = 2 ^ (17 :: Int)
+            , parallelism = 4
+            , variant     = Argon2i
+            , version     = Version13
+            }
+
+hash :: (ByteArrayAccess password, ByteArrayAccess salt, ByteArray out)
+     => Options
+     -> password
+     -> salt
+     -> Int
+     -> CryptoFailable out
+hash options password salt outLen
+    | saltLen < saltMinLength  = CryptoFailed CryptoError_SaltTooSmall
+    | outLen < outputMinLength = CryptoFailed CryptoError_OutputLengthTooSmall
+    | outLen > outputMaxLength = CryptoFailed CryptoError_OutputLengthTooBig
+    | otherwise                = CryptoPassed $ B.allocAndFreeze outLen $ \out -> do
+        res <- B.withByteArray password $ \pPass ->
+               B.withByteArray salt     $ \pSalt ->
+                    argon2_hash (iterations options)
+                                (memory options)
+                                (parallelism options)
+                                pPass
+                                (csizeOfInt passwordLen)
+                                pSalt
+                                (csizeOfInt saltLen)
+                                out
+                                (csizeOfInt outLen)
+                                (cOfVariant $ variant options)
+                                (cOfVersion $ version options)
+        when (res /= 0) $ error "argon2: hash: internal error"
+  where
+    saltLen = B.length salt
+    passwordLen = B.length password
+
+data Pass
+data Salt
+data HashOut
+
+type CVariant = CInt -- valid value is 0 (Argon2d), 1 (Argon2i) and 2 (Argon2id)
+type CVersion = CInt -- valid value is 0x10, 0x13
+
+cOfVersion :: Version -> CVersion
+cOfVersion Version10 = 0x10
+cOfVersion Version13 = 0x13
+
+cOfVariant :: Variant -> CVariant
+cOfVariant Argon2d  = 0
+cOfVariant Argon2i  = 1
+cOfVariant Argon2id = 2
+
+csizeOfInt :: Int -> CSize
+csizeOfInt = fromIntegral
+
+foreign import ccall unsafe "cryptonite_argon2_hash"
+    argon2_hash :: Word32 -> Word32 -> Word32
+                -> Ptr Pass -> CSize
+                -> Ptr Salt -> CSize
+                -> Ptr HashOut -> CSize
+                -> CVariant
+                -> CVersion
+                -> IO CInt
diff --git a/bundled/cbits/argon2/argon2.c b/bundled/cbits/argon2/argon2.c
new file mode 100644
index 0000000..717522e
--- /dev/null
+++ b/bundled/cbits/argon2/argon2.c
@@ -0,0 +1,142 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "argon2.h"
+#include "core.c"
+
+int cryptonite_argon2_ctx(argon2_context *context, argon2_type type) {
+    /* 1. Validate all inputs */
+    int result = validate_inputs(context);
+    uint32_t memory_blocks, segment_length;
+    argon2_instance_t instance;
+
+    if (ARGON2_OK != result) {
+        return result;
+    }
+
+    if (Argon2_d != type && Argon2_i != type && Argon2_id != type) {
+        return ARGON2_INCORRECT_TYPE;
+    }
+
+    /* 2. Align memory size */
+    /* Minimum memory_blocks = 8L blocks, where L is the number of lanes */
+    memory_blocks = context->m_cost;
+
+    if (memory_blocks < 2 * ARGON2_SYNC_POINTS * context->lanes) {
+        memory_blocks = 2 * ARGON2_SYNC_POINTS * context->lanes;
+    }
+
+    segment_length = memory_blocks / (context->lanes * ARGON2_SYNC_POINTS);
+    /* Ensure that all segments have equal length */
+    memory_blocks = segment_length * (context->lanes * ARGON2_SYNC_POINTS);
+
+    instance.version = context->version;
+    instance.memory = NULL;
+    instance.passes = context->t_cost;
+    instance.memory_blocks = memory_blocks;
+    instance.segment_length = segment_length;
+    instance.lane_length = segment_length * ARGON2_SYNC_POINTS;
+    instance.lanes = context->lanes;
+    instance.threads = context->threads;
+    instance.type = type;
+
+    /* 3. Initialization: Hashing inputs, allocating memory, filling first
+     * blocks
+     */
+    result = initialize(&instance, context);
+
+    if (ARGON2_OK != result) {
+        return result;
+    }
+
+    /* 4. Filling memory */
+    result = fill_memory_blocks(&instance);
+
+    if (ARGON2_OK != result) {
+        return result;
+    }
+    /* 5. Finalization */
+    finalize(context, &instance);
+
+    return ARGON2_OK;
+}
+
+int cryptonite_argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
+                const uint32_t parallelism, const void *pwd,
+                const size_t pwdlen, const void *salt, const size_t saltlen,
+                void *hash, const size_t hashlen, argon2_type type,
+                const uint32_t version){
+
+    argon2_context context;
+    int result;
+    uint8_t *out;
+
+    if (hashlen > ARGON2_MAX_OUTLEN) {
+        return ARGON2_OUTPUT_TOO_LONG;
+    }
+
+    if (hashlen < ARGON2_MIN_OUTLEN) {
+        return ARGON2_OUTPUT_TOO_SHORT;
+    }
+
+    out = malloc(hashlen);
+    if (!out) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    context.out = (uint8_t *)out;
+    context.outlen = (uint32_t)hashlen;
+    context.pwd = CONST_CAST(uint8_t *)pwd;
+    context.pwdlen = (uint32_t)pwdlen;
+    context.salt = CONST_CAST(uint8_t *)salt;
+    context.saltlen = (uint32_t)saltlen;
+    context.secret = NULL;
+    context.secretlen = 0;
+    context.ad = NULL;
+    context.adlen = 0;
+    context.t_cost = t_cost;
+    context.m_cost = m_cost;
+    context.lanes = parallelism;
+    context.threads = parallelism;
+    context.allocate_cbk = NULL;
+    context.free_cbk = NULL;
+    context.flags = ARGON2_DEFAULT_FLAGS;
+    context.version = version;
+
+    result = cryptonite_argon2_ctx(&context, type);
+
+    if (result != ARGON2_OK) {
+        clear_internal_memory(out, hashlen);
+        free(out);
+        return result;
+    }
+
+    /* if raw hash requested, write it */
+    if (hash) {
+        memcpy(hash, out, hashlen);
+    }
+
+    clear_internal_memory(out, hashlen);
+    free(out);
+
+    return ARGON2_OK;
+}
+
diff --git a/bundled/cbits/argon2/argon2.h b/bundled/cbits/argon2/argon2.h
new file mode 100644
index 0000000..a4a088f
--- /dev/null
+++ b/bundled/cbits/argon2/argon2.h
@@ -0,0 +1,267 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef ARGON2_H
+#define ARGON2_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <limits.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Symbols visibility control */
+#ifdef A2_VISCTL
+#define ARGON2_PUBLIC __attribute__((visibility("default")))
+#elif _MSC_VER
+#define ARGON2_PUBLIC __declspec(dllexport)
+#else
+#define ARGON2_PUBLIC
+#endif
+
+/*
+ * Argon2 input parameter restrictions
+ */
+
+/* Minimum and maximum number of lanes (degree of parallelism) */
+#define ARGON2_MIN_LANES UINT32_C(1)
+#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF)
+
+/* Minimum and maximum number of threads */
+#define ARGON2_MIN_THREADS UINT32_C(1)
+#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF)
+
+/* Number of synchronization points between lanes per pass */
+#define ARGON2_SYNC_POINTS UINT32_C(4)
+
+/* Minimum and maximum digest size in bytes */
+#define ARGON2_MIN_OUTLEN UINT32_C(4)
+#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */
+#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */
+
+#define ARGON2_MIN(a, b) ((a) < (b) ? (a) : (b))
+/* Max memory size is addressing-space/2, topping at 2^32 blocks (4 TB) */
+#define ARGON2_MAX_MEMORY_BITS                                                 \
+    ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1))
+#define ARGON2_MAX_MEMORY                                                      \
+    ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS)
+
+/* Minimum and maximum number of passes */
+#define ARGON2_MIN_TIME UINT32_C(1)
+#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum password length in bytes */
+#define ARGON2_MIN_PWD_LENGTH UINT32_C(0)
+#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum associated data length in bytes */
+#define ARGON2_MIN_AD_LENGTH UINT32_C(0)
+#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum salt length in bytes */
+#define ARGON2_MIN_SALT_LENGTH UINT32_C(8)
+#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum key length in bytes */
+#define ARGON2_MIN_SECRET UINT32_C(0)
+#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF)
+
+/* Flags to determine which fields are securely wiped (default = no wipe). */
+#define ARGON2_DEFAULT_FLAGS UINT32_C(0)
+#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0)
+#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1)
+
+/* Global flag to determine if we are wiping internal memory buffers. This flag
+ * is defined in core.c and deafults to 1 (wipe internal memory). */
+//extern int FLAG_clear_internal_memory;
+
+/* Error codes */
+typedef enum Argon2_ErrorCodes {
+    ARGON2_OK = 0,
+
+    ARGON2_OUTPUT_PTR_NULL = -1,
+
+    ARGON2_OUTPUT_TOO_SHORT = -2,
+    ARGON2_OUTPUT_TOO_LONG = -3,
+
+    ARGON2_PWD_TOO_SHORT = -4,
+    ARGON2_PWD_TOO_LONG = -5,
+
+    ARGON2_SALT_TOO_SHORT = -6,
+    ARGON2_SALT_TOO_LONG = -7,
+
+    ARGON2_AD_TOO_SHORT = -8,
+    ARGON2_AD_TOO_LONG = -9,
+
+    ARGON2_SECRET_TOO_SHORT = -10,
+    ARGON2_SECRET_TOO_LONG = -11,
+
+    ARGON2_TIME_TOO_SMALL = -12,
+    ARGON2_TIME_TOO_LARGE = -13,
+
+    ARGON2_MEMORY_TOO_LITTLE = -14,
+    ARGON2_MEMORY_TOO_MUCH = -15,
+
+    ARGON2_LANES_TOO_FEW = -16,
+    ARGON2_LANES_TOO_MANY = -17,
+
+    ARGON2_PWD_PTR_MISMATCH = -18,    /* NULL ptr with non-zero length */
+    ARGON2_SALT_PTR_MISMATCH = -19,   /* NULL ptr with non-zero length */
+    ARGON2_SECRET_PTR_MISMATCH = -20, /* NULL ptr with non-zero length */
+    ARGON2_AD_PTR_MISMATCH = -21,     /* NULL ptr with non-zero length */
+
+    ARGON2_MEMORY_ALLOCATION_ERROR = -22,
+
+    ARGON2_FREE_MEMORY_CBK_NULL = -23,
+    ARGON2_ALLOCATE_MEMORY_CBK_NULL = -24,
+
+    ARGON2_INCORRECT_PARAMETER = -25,
+    ARGON2_INCORRECT_TYPE = -26,
+
+    ARGON2_OUT_PTR_MISMATCH = -27,
+
+    ARGON2_THREADS_TOO_FEW = -28,
+    ARGON2_THREADS_TOO_MANY = -29,
+
+    ARGON2_MISSING_ARGS = -30,
+
+    ARGON2_ENCODING_FAIL = -31,
+
+    ARGON2_DECODING_FAIL = -32,
+
+    ARGON2_THREAD_FAIL = -33,
+
+    ARGON2_DECODING_LENGTH_FAIL = -34,
+
+    ARGON2_VERIFY_MISMATCH = -35
+} argon2_error_codes;
+
+/* Memory allocator types --- for external allocation */
+typedef int (*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate);
+typedef void (*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate);
+
+/* Argon2 external data structures */
+
+/*
+ *****
+ * Context: structure to hold Argon2 inputs:
+ *  output array and its length,
+ *  password and its length,
+ *  salt and its length,
+ *  secret and its length,
+ *  associated data and its length,
+ *  number of passes, amount of used memory (in KBytes, can be rounded up a bit)
+ *  number of parallel threads that will be run.
+ * All the parameters above affect the output hash value.
+ * Additionally, two function pointers can be provided to allocate and
+ * deallocate the memory (if NULL, memory will be allocated internally).
+ * Also, three flags indicate whether to erase password, secret as soon as they
+ * are pre-hashed (and thus not needed anymore), and the entire memory
+ *****
+ * Simplest situation: you have output array out[8], password is stored in
+ * pwd[32], salt is stored in salt[16], you do not have keys nor associated
+ * data. You need to spend 1 GB of RAM and you run 5 passes of Argon2d with
+ * 4 parallel lanes.
+ * You want to erase the password, but you're OK with last pass not being
+ * erased. You want to use the default memory allocator.
+ * Then you initialize:
+ Argon2_Context(out,8,pwd,32,salt,16,NULL,0,NULL,0,5,1<<20,4,4,NULL,NULL,true,false,false,false)
+ */
+typedef struct Argon2_Context {
+    uint8_t *out;    /* output array */
+    uint32_t outlen; /* digest length */
+
+    uint8_t *pwd;    /* password array */
+    uint32_t pwdlen; /* password length */
+
+    uint8_t *salt;    /* salt array */
+    uint32_t saltlen; /* salt length */
+
+    uint8_t *secret;    /* key array */
+    uint32_t secretlen; /* key length */
+
+    uint8_t *ad;    /* associated data array */
+    uint32_t adlen; /* associated data length */
+
+    uint32_t t_cost;  /* number of passes */
+    uint32_t m_cost;  /* amount of memory requested (KB) */
+    uint32_t lanes;   /* number of lanes */
+    uint32_t threads; /* maximum number of threads */
+
+    uint32_t version; /* version number */
+
+    allocate_fptr allocate_cbk; /* pointer to memory allocator */
+    deallocate_fptr free_cbk;   /* pointer to memory deallocator */
+
+    uint32_t flags; /* array of bool options */
+} argon2_context;
+
+/* Argon2 primitive type */
+typedef enum Argon2_type {
+  Argon2_d = 0,
+  Argon2_i = 1,
+  Argon2_id = 2
+} argon2_type;
+
+/* Version of the algorithm */
+typedef enum Argon2_version {
+    ARGON2_VERSION_10 = 0x10,
+    ARGON2_VERSION_13 = 0x13,
+    ARGON2_VERSION_NUMBER = ARGON2_VERSION_13
+} argon2_version;
+
+/*
+ * Function that performs memory-hard hashing with certain degree of parallelism
+ * @param  context  Pointer to the Argon2 internal structure
+ * @return Error code if smth is wrong, ARGON2_OK otherwise
+ */
+ARGON2_PUBLIC int cryptonite_argon2_ctx(argon2_context *context, argon2_type type);
+
+/**
+ * Hashes a password with Argon2i, producing a raw hash by allocating memory at
+ * @hash
+ * @param t_cost Number of iterations
+ * @param m_cost Sets memory usage to m_cost kibibytes
+ * @param parallelism Number of threads and compute lanes
+ * @param pwd Pointer to password
+ * @param pwdlen Password size in bytes
+ * @param salt Pointer to salt
+ * @param saltlen Salt size in bytes
+ * @param hash Buffer where to write the raw hash - updated by the function
+ * @param hashlen Desired length of the hash in bytes
+ * @pre   Different parallelism levels will give different results
+ * @pre   Returns ARGON2_OK if successful
+ */
+
+/* generic function underlying the above ones */
+ARGON2_PUBLIC int cryptonite_argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
+                              const uint32_t parallelism, const void *pwd,
+                              const size_t pwdlen, const void *salt,
+                              const size_t saltlen, void *hash,
+                              const size_t hashlen, argon2_type type,
+                              const uint32_t version);
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/bundled/cbits/argon2/blamka-round-opt.h b/bundled/cbits/argon2/blamka-round-opt.h
new file mode 100644
index 0000000..577e1d0
--- /dev/null
+++ b/bundled/cbits/argon2/blamka-round-opt.h
@@ -0,0 +1,180 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef BLAKE_ROUND_MKA_OPT_H
+#define BLAKE_ROUND_MKA_OPT_H
+
+#include "blake2-impl.h"
+
+#include <emmintrin.h>
+#if defined(__SSSE3__)
+#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
+#endif
+
+#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
+#include <x86intrin.h>
+#endif
+
+#if !defined(__XOP__)
+#if defined(__SSSE3__)
+#define r16                                                                    \
+    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define r24                                                                    \
+    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define _mm_roti_epi64(x, c)                                                   \
+    (-(c) == 32)                                                               \
+        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))                      \
+        : (-(c) == 24)                                                         \
+              ? _mm_shuffle_epi8((x), r24)                                     \
+              : (-(c) == 16)                                                   \
+                    ? _mm_shuffle_epi8((x), r16)                               \
+                    : (-(c) == 63)                                             \
+                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \
+                                          _mm_add_epi64((x), (x)))             \
+                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \
+                                          _mm_slli_epi64((x), 64 - (-(c))))
+#else /* defined(__SSE2__) */
+#define _mm_roti_epi64(r, c)                                                   \
+    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
+#endif
+#else
+#endif
+
+static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
+    const __m128i z = _mm_mul_epu32(x, y);
+    return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1)                                     \
+    do {                                                                       \
+        A0 = fBlaMka(A0, B0);                                                  \
+        A1 = fBlaMka(A1, B1);                                                  \
+                                                                               \
+        D0 = _mm_xor_si128(D0, A0);                                            \
+        D1 = _mm_xor_si128(D1, A1);                                            \
+                                                                               \
+        D0 = _mm_roti_epi64(D0, -32);                                          \
+        D1 = _mm_roti_epi64(D1, -32);                                          \
+                                                                               \
+        C0 = fBlaMka(C0, D0);                                                  \
+        C1 = fBlaMka(C1, D1);                                                  \
+                                                                               \
+        B0 = _mm_xor_si128(B0, C0);                                            \
+        B1 = _mm_xor_si128(B1, C1);                                            \
+                                                                               \
+        B0 = _mm_roti_epi64(B0, -24);                                          \
+        B1 = _mm_roti_epi64(B1, -24);                                          \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1)                                     \
+    do {                                                                       \
+        A0 = fBlaMka(A0, B0);                                                  \
+        A1 = fBlaMka(A1, B1);                                                  \
+                                                                               \
+        D0 = _mm_xor_si128(D0, A0);                                            \
+        D1 = _mm_xor_si128(D1, A1);                                            \
+                                                                               \
+        D0 = _mm_roti_epi64(D0, -16);                                          \
+        D1 = _mm_roti_epi64(D1, -16);                                          \
+                                                                               \
+        C0 = fBlaMka(C0, D0);                                                  \
+        C1 = fBlaMka(C1, D1);                                                  \
+                                                                               \
+        B0 = _mm_xor_si128(B0, C0);                                            \
+        B1 = _mm_xor_si128(B1, C1);                                            \
+                                                                               \
+        B0 = _mm_roti_epi64(B0, -63);                                          \
+        B1 = _mm_roti_epi64(B1, -63);                                          \
+    } while ((void)0, 0)
+
+#if defined(__SSSE3__)
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \
+    do {                                                                       \
+        __m128i t0 = _mm_alignr_epi8(B1, B0, 8);                               \
+        __m128i t1 = _mm_alignr_epi8(B0, B1, 8);                               \
+        B0 = t0;                                                               \
+        B1 = t1;                                                               \
+                                                                               \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+                                                                               \
+        t0 = _mm_alignr_epi8(D1, D0, 8);                                       \
+        t1 = _mm_alignr_epi8(D0, D1, 8);                                       \
+        D0 = t1;                                                               \
+        D1 = t0;                                                               \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \
+    do {                                                                       \
+        __m128i t0 = _mm_alignr_epi8(B0, B1, 8);                               \
+        __m128i t1 = _mm_alignr_epi8(B1, B0, 8);                               \
+        B0 = t0;                                                               \
+        B1 = t1;                                                               \
+                                                                               \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+                                                                               \
+        t0 = _mm_alignr_epi8(D0, D1, 8);                                       \
+        t1 = _mm_alignr_epi8(D1, D0, 8);                                       \
+        D0 = t1;                                                               \
+        D1 = t0;                                                               \
+    } while ((void)0, 0)
+#else /* SSE2 */
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \
+    do {                                                                       \
+        __m128i t0 = D0;                                                       \
+        __m128i t1 = B0;                                                       \
+        D0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = D0;                                                               \
+        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0));               \
+        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1));               \
+        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1));               \
+        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1));               \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \
+    do {                                                                       \
+        __m128i t0, t1;                                                        \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+        t0 = B0;                                                               \
+        t1 = D0;                                                               \
+        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0));               \
+        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1));               \
+        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1));               \
+        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1));               \
+    } while ((void)0, 0)
+#endif
+
+#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1)                           \
+    do {                                                                       \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+                                                                               \
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                           \
+                                                                               \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+                                                                               \
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                         \
+    } while ((void)0, 0)
+
+#endif
diff --git a/bundled/cbits/argon2/blamka-round-ref.h b/bundled/cbits/argon2/blamka-round-ref.h
new file mode 100644
index 0000000..db9864d
--- /dev/null
+++ b/bundled/cbits/argon2/blamka-round-ref.h
@@ -0,0 +1,56 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef BLAKE_ROUND_MKA_H
+#define BLAKE_ROUND_MKA_H
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+/*designed by the Lyra PHC team */
+static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
+    const uint64_t m = UINT64_C(0xFFFFFFFF);
+    const uint64_t xy = (x & m) * (y & m);
+    return x + y + 2 * xy;
+}
+
+#define G(a, b, c, d)                                                          \
+    do {                                                                       \
+        a = fBlaMka(a, b);                                                     \
+        d = rotr64(d ^ a, 32);                                                 \
+        c = fBlaMka(c, d);                                                     \
+        b = rotr64(b ^ c, 24);                                                 \
+        a = fBlaMka(a, b);                                                     \
+        d = rotr64(d ^ a, 16);                                                 \
+        c = fBlaMka(c, d);                                                     \
+        b = rotr64(b ^ c, 63);                                                 \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,   \
+                           v12, v13, v14, v15)                                 \
+    do {                                                                       \
+        G(v0, v4, v8, v12);                                                    \
+        G(v1, v5, v9, v13);                                                    \
+        G(v2, v6, v10, v14);                                                   \
+        G(v3, v7, v11, v15);                                                   \
+        G(v0, v5, v10, v15);                                                   \
+        G(v1, v6, v11, v12);                                                   \
+        G(v2, v7, v8, v13);                                                    \
+        G(v3, v4, v9, v14);                                                    \
+    } while ((void)0, 0)
+
+#endif
diff --git a/bundled/cbits/argon2/core.c b/bundled/cbits/argon2/core.c
new file mode 100644
index 0000000..cc22a74
--- /dev/null
+++ b/bundled/cbits/argon2/core.c
@@ -0,0 +1,701 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+/*For memory wiping*/
+#ifdef _MSC_VER
+#include <windows.h>
+#include <winbase.h> /* For SecureZeroMemory */
+#endif
+#if defined __STDC_LIB_EXT1__
+#define __STDC_WANT_LIB_EXT1__ 1
+#endif
+#define VC_GE_2005(version) (version >= 1400)
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "core.h"
+#include "thread.c"
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#ifdef GENKAT
+#include "genkat.h"
+#endif
+
+#ifdef SUPPORT_SSE
+#include "opt.c"
+#else
+#include "ref.c"
+#endif
+
+#if defined(__clang__)
+#if __has_attribute(optnone)
+#define NOT_OPTIMIZED __attribute__((optnone))
+#endif
+#elif defined(__GNUC__)
+#define GCC_VERSION                                                            \
+    (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#if GCC_VERSION >= 40400
+#define NOT_OPTIMIZED __attribute__((optimize("O0")))
+#endif
+#endif
+#ifndef NOT_OPTIMIZED
+#define NOT_OPTIMIZED
+#endif
+
+/* Argon2 Team - Begin Code */
+static int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) {
+    uint8_t *out = (uint8_t *)pout;
+    blake2b_state blake_state;
+    uint8_t outlen_bytes[sizeof(uint32_t)] = {0};
+    int ret = -1;
+
+    if (outlen > UINT32_MAX) {
+        goto fail;
+    }
+
+    /* Ensure little-endian byte order! */
+    store32(outlen_bytes, (uint32_t)outlen);
+
+#define TRY(statement)                                                         \
+    do {                                                                       \
+        ret = statement;                                                       \
+        if (ret < 0) {                                                         \
+            goto fail;                                                         \
+        }                                                                      \
+    } while ((void)0, 0)
+
+    if (outlen <= BLAKE2B_OUTBYTES) {
+        TRY(_cryptonite_blake2b_init(&blake_state, outlen));
+        TRY(_cryptonite_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+        TRY(_cryptonite_blake2b_update(&blake_state, in, inlen));
+        TRY(_cryptonite_blake2b_final(&blake_state, out, outlen));
+    } else {
+        uint32_t toproduce;
+        uint8_t out_buffer[BLAKE2B_OUTBYTES];
+        uint8_t in_buffer[BLAKE2B_OUTBYTES];
+        TRY(_cryptonite_blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
+        TRY(_cryptonite_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+        TRY(_cryptonite_blake2b_update(&blake_state, in, inlen));
+        TRY(_cryptonite_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
+        memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+        out += BLAKE2B_OUTBYTES / 2;
+        toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;
+
+        while (toproduce > BLAKE2B_OUTBYTES) {
+            memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+            TRY(_cryptonite_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
+                        BLAKE2B_OUTBYTES, NULL, 0));
+            memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+            out += BLAKE2B_OUTBYTES / 2;
+            toproduce -= BLAKE2B_OUTBYTES / 2;
+        }
+
+        memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+        TRY(_cryptonite_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
+                    0));
+        memcpy(out, out_buffer, toproduce);
+    }
+fail:
+    clear_internal_memory(&blake_state, sizeof(blake_state));
+    return ret;
+#undef TRY
+}
+/* Argon2 Team - End Code */
+
+/***************Instance and Position constructors**********/
+static void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }
+
+static void copy_block(block *dst, const block *src) {
+    memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK);
+}
+
+static void xor_block(block *dst, const block *src) {
+    int i;
+    for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
+        dst->v[i] ^= src->v[i];
+    }
+}
+
+static void load_block(block *dst, const void *input) {
+    unsigned i;
+    for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
+        dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i]));
+    }
+}
+
+static void store_block(void *output, const block *src) {
+    unsigned i;
+    for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
+        store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]);
+    }
+}
+
+/***************Memory functions*****************/
+static
+int allocate_memory(const argon2_context *context, uint8_t **memory,
+                    size_t num, size_t size) {
+    size_t memory_size = num*size;
+    if (memory == NULL) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    /* 1. Check for multiplication overflow */
+    if (size != 0 && memory_size / size != num) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    /* 2. Try to allocate with appropriate allocator */
+    if (context->allocate_cbk) {
+        (context->allocate_cbk)(memory, memory_size);
+    } else {
+        *memory = malloc(memory_size);
+    }
+
+    if (*memory == NULL) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    return ARGON2_OK;
+}
+static
+void free_memory(const argon2_context *context, uint8_t *memory,
+                 size_t num, size_t size) {
+    size_t memory_size = num*size;
+    clear_internal_memory(memory, memory_size);
+    if (context->free_cbk) {
+        (context->free_cbk)(memory, memory_size);
+    } else {
+        free(memory);
+    }
+}
+
+void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
+#if defined(_MSC_VER) && VC_GE_2005(_MSC_VER)
+    SecureZeroMemory(v, n);
+#elif defined memset_s
+    memset_s(v, n, 0, n);
+#elif defined(__OpenBSD__)
+    explicit_bzero(v, n);
+#else
+    static void *(*const volatile memset_sec)(void *, int, size_t) = &memset;
+    memset_sec(v, 0, n);
+#endif
+}
+
+/* Memory clear flag defaults to true. */
+static int FLAG_clear_internal_memory = 1;
+static void clear_internal_memory(void *v, size_t n) {
+  if (FLAG_clear_internal_memory && v) {
+    secure_wipe_memory(v, n);
+  }
+}
+
+static void finalize(const argon2_context *context, argon2_instance_t *instance) {
+    if (context != NULL && instance != NULL) {
+        block blockhash;
+        uint32_t l;
+
+        copy_block(&blockhash, instance->memory + instance->lane_length - 1);
+
+        /* XOR the last blocks */
+        for (l = 1; l < instance->lanes; ++l) {
+            uint32_t last_block_in_lane =
+                l * instance->lane_length + (instance->lane_length - 1);
+            xor_block(&blockhash, instance->memory + last_block_in_lane);
+        }
+
+        /* Hash the result */
+        {
+            uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
+            store_block(blockhash_bytes, &blockhash);
+            blake2b_long(context->out, context->outlen, blockhash_bytes,
+                         ARGON2_BLOCK_SIZE);
+            /* clear blockhash and blockhash_bytes */
+            clear_internal_memory(blockhash.v, ARGON2_BLOCK_SIZE);
+            clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
+        }
+
+#ifdef GENKAT
+        print_tag(context->out, context->outlen);
+#endif
+
+        free_memory(context, (uint8_t *)instance->memory,
+                    instance->memory_blocks, sizeof(block));
+    }
+}
+
+static uint32_t index_alpha(const argon2_instance_t *instance,
+                     const argon2_position_t *position, uint32_t pseudo_rand,
+                     int same_lane) {
+    /*
+     * Pass 0:
+     *      This lane : all already finished segments plus already constructed
+     * blocks in this segment
+     *      Other lanes : all already finished segments
+     * Pass 1+:
+     *      This lane : (SYNC_POINTS - 1) last segments plus already constructed
+     * blocks in this segment
+     *      Other lanes : (SYNC_POINTS - 1) last segments
+     */
+    uint32_t reference_area_size;
+    uint64_t relative_position;
+    uint32_t start_position, absolute_position;
+
+    if (0 == position->pass) {
+        /* First pass */
+        if (0 == position->slice) {
+            /* First slice */
+            reference_area_size =
+                position->index - 1; /* all but the previous */
+        } else {
+            if (same_lane) {
+                /* The same lane => add current segment */
+                reference_area_size =
+                    position->slice * instance->segment_length +
+                    position->index - 1;
+            } else {
+                reference_area_size =
+                    position->slice * instance->segment_length +
+                    ((position->index == 0) ? (-1) : 0);
+            }
+        }
+    } else {
+        /* Second pass */
+        if (same_lane) {
+            reference_area_size = instance->lane_length -
+                                  instance->segment_length + position->index -
+                                  1;
+        } else {
+            reference_area_size = instance->lane_length -
+                                  instance->segment_length +
+                                  ((position->index == 0) ? (-1) : 0);
+        }
+    }
+
+    /* 1.2.4. Mapping pseudo_rand to 0..<reference_area_size-1> and produce
+     * relative position */
+    relative_position = pseudo_rand;
+    relative_position = relative_position * relative_position >> 32;
+    relative_position = reference_area_size - 1 -
+                        (reference_area_size * relative_position >> 32);
+
+    /* 1.2.5 Computing starting position */
+    start_position = 0;
+
+    if (0 != position->pass) {
+        start_position = (position->slice == ARGON2_SYNC_POINTS - 1)
+                             ? 0
+                             : (position->slice + 1) * instance->segment_length;
+    }
+
+    /* 1.2.6. Computing absolute position */
+    absolute_position = (start_position + relative_position) %
+                        instance->lane_length; /* absolute position */
+    return absolute_position;
+}
+
+/* Single-threaded version for p=1 case */
+static int fill_memory_blocks_st(argon2_instance_t *instance) {
+    uint32_t r, s, l;
+
+    for (r = 0; r < instance->passes; ++r) {
+        for (s = 0; s < ARGON2_SYNC_POINTS; ++s) {
+            for (l = 0; l < instance->lanes; ++l) {
+                argon2_position_t position = {r, l, (uint8_t)s, 0};
+                fill_segment(instance, position);
+            }
+        }
+#ifdef GENKAT
+        internal_kat(instance, r); /* Print all memory blocks */
+#endif
+    }
+    return ARGON2_OK;
+}
+
+#if !defined(ARGON2_NO_THREADS)
+
+#ifdef _WIN32
+static unsigned __stdcall fill_segment_thr(void *thread_data)
+#else
+static void *fill_segment_thr(void *thread_data)
+#endif
+{
+    argon2_thread_data *my_data = thread_data;
+    fill_segment(my_data->instance_ptr, my_data->pos);
+    argon2_thread_exit();
+    return 0;
+}
+
+/* Multi-threaded version for p > 1 case */
+static int fill_memory_blocks_mt(argon2_instance_t *instance) {
+    uint32_t r, s;
+    argon2_thread_handle_t *thread = NULL;
+    argon2_thread_data *thr_data = NULL;
+    int rc = ARGON2_OK;
+
+    /* 1. Allocating space for threads */
+    thread = calloc(instance->lanes, sizeof(argon2_thread_handle_t));
+    if (thread == NULL) {
+        rc = ARGON2_MEMORY_ALLOCATION_ERROR;
+        goto fail;
+    }
+
+    thr_data = calloc(instance->lanes, sizeof(argon2_thread_data));
+    if (thr_data == NULL) {
+        rc = ARGON2_MEMORY_ALLOCATION_ERROR;
+        goto fail;
+    }
+
+    for (r = 0; r < instance->passes; ++r) {
+        for (s = 0; s < ARGON2_SYNC_POINTS; ++s) {
+            uint32_t l;
+
+            /* 2. Calling threads */
+            for (l = 0; l < instance->lanes; ++l) {
+                argon2_position_t position;
+
+                /* 2.1 Join a thread if limit is exceeded */
+                if (l >= instance->threads) {
+                    if (argon2_thread_join(thread[l - instance->threads])) {
+                        rc = ARGON2_THREAD_FAIL;
+                        goto fail;
+                    }
+                }
+
+                /* 2.2 Create thread */
+                position.pass = r;
+                position.lane = l;
+                position.slice = (uint8_t)s;
+                position.index = 0;
+                thr_data[l].instance_ptr =
+                    instance; /* preparing the thread input */
+                memcpy(&(thr_data[l].pos), &position,
+                       sizeof(argon2_position_t));
+                if (argon2_thread_create(&thread[l], &fill_segment_thr,
+                                         (void *)&thr_data[l])) {
+                    rc = ARGON2_THREAD_FAIL;
+                    goto fail;
+                }
+
+                /* fill_segment(instance, position); */
+                /*Non-thread equivalent of the lines above */
+            }
+
+            /* 3. Joining remaining threads */
+            for (l = instance->lanes - instance->threads; l < instance->lanes;
+                 ++l) {
+                if (argon2_thread_join(thread[l])) {
+                    rc = ARGON2_THREAD_FAIL;
+                    goto fail;
+                }
+            }
+        }
+
+#ifdef GENKAT
+        internal_kat(instance, r); /* Print all memory blocks */
+#endif
+    }
+
+fail:
+    if (thread != NULL) {
+        free(thread);
+    }
+    if (thr_data != NULL) {
+        free(thr_data);
+    }
+    return rc;
+}
+
+#endif /* ARGON2_NO_THREADS */
+static
+int fill_memory_blocks(argon2_instance_t *instance) {
+	if (instance == NULL || instance->lanes == 0) {
+	    return ARGON2_INCORRECT_PARAMETER;
+    }
+#if defined(ARGON2_NO_THREADS)
+    return fill_memory_blocks_st(instance);
+#else
+    return instance->threads == 1 ?
+			fill_memory_blocks_st(instance) : fill_memory_blocks_mt(instance);
+#endif
+}
+static
+int validate_inputs(const argon2_context *context) {
+    if (NULL == context) {
+        return ARGON2_INCORRECT_PARAMETER;
+    }
+
+    if (NULL == context->out) {
+        return ARGON2_OUTPUT_PTR_NULL;
+    }
+
+    /* Validate output length */
+    if (ARGON2_MIN_OUTLEN > context->outlen) {
+        return ARGON2_OUTPUT_TOO_SHORT;
+    }
+
+    if (ARGON2_MAX_OUTLEN < context->outlen) {
+        return ARGON2_OUTPUT_TOO_LONG;
+    }
+
+    /* Validate password (required param) */
+    if (NULL == context->pwd) {
+        if (0 != context->pwdlen) {
+            return ARGON2_PWD_PTR_MISMATCH;
+        }
+    }
+
+    if (ARGON2_MIN_PWD_LENGTH > context->pwdlen) {
+      return ARGON2_PWD_TOO_SHORT;
+    }
+
+    if (ARGON2_MAX_PWD_LENGTH < context->pwdlen) {
+        return ARGON2_PWD_TOO_LONG;
+    }
+
+    /* Validate salt (required param) */
+    if (NULL == context->salt) {
+        if (0 != context->saltlen) {
+            return ARGON2_SALT_PTR_MISMATCH;
+        }
+    }
+
+    if (ARGON2_MIN_SALT_LENGTH > context->saltlen) {
+        return ARGON2_SALT_TOO_SHORT;
+    }
+
+    if (ARGON2_MAX_SALT_LENGTH < context->saltlen) {
+        return ARGON2_SALT_TOO_LONG;
+    }
+
+    /* Validate secret (optional param) */
+    if (NULL == context->secret) {
+        if (0 != context->secretlen) {
+            return ARGON2_SECRET_PTR_MISMATCH;
+        }
+    } else {
+        if (ARGON2_MIN_SECRET > context->secretlen) {
+            return ARGON2_SECRET_TOO_SHORT;
+        }
+        if (ARGON2_MAX_SECRET < context->secretlen) {
+            return ARGON2_SECRET_TOO_LONG;
+        }
+    }
+
+    /* Validate associated data (optional param) */
+    if (NULL == context->ad) {
+        if (0 != context->adlen) {
+            return ARGON2_AD_PTR_MISMATCH;
+        }
+    } else {
+        if (ARGON2_MIN_AD_LENGTH > context->adlen) {
+            return ARGON2_AD_TOO_SHORT;
+        }
+        if (ARGON2_MAX_AD_LENGTH < context->adlen) {
+            return ARGON2_AD_TOO_LONG;
+        }
+    }
+
+    /* Validate memory cost */
+    if (ARGON2_MIN_MEMORY > context->m_cost) {
+        return ARGON2_MEMORY_TOO_LITTLE;
+    }
+
+    if (ARGON2_MAX_MEMORY < context->m_cost) {
+        return ARGON2_MEMORY_TOO_MUCH;
+    }
+
+    if (context->m_cost < 8 * context->lanes) {
+        return ARGON2_MEMORY_TOO_LITTLE;
+    }
+
+    /* Validate time cost */
+    if (ARGON2_MIN_TIME > context->t_cost) {
+        return ARGON2_TIME_TOO_SMALL;
+    }
+
+    if (ARGON2_MAX_TIME < context->t_cost) {
+        return ARGON2_TIME_TOO_LARGE;
+    }
+
+    /* Validate lanes */
+    if (ARGON2_MIN_LANES > context->lanes) {
+        return ARGON2_LANES_TOO_FEW;
+    }
+
+    if (ARGON2_MAX_LANES < context->lanes) {
+        return ARGON2_LANES_TOO_MANY;
+    }
+
+    /* Validate threads */
+    if (ARGON2_MIN_THREADS > context->threads) {
+        return ARGON2_THREADS_TOO_FEW;
+    }
+
+    if (ARGON2_MAX_THREADS < context->threads) {
+        return ARGON2_THREADS_TOO_MANY;
+    }
+
+    if (NULL != context->allocate_cbk && NULL == context->free_cbk) {
+        return ARGON2_FREE_MEMORY_CBK_NULL;
+    }
+
+    if (NULL == context->allocate_cbk && NULL != context->free_cbk) {
+        return ARGON2_ALLOCATE_MEMORY_CBK_NULL;
+    }
+
+    return ARGON2_OK;
+}
+static
+void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) {
+    uint32_t l;
+    /* Make the first and second block in each lane as G(H0||i||0) or
+       G(H0||i||1) */
+    uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
+    for (l = 0; l < instance->lanes; ++l) {
+
+        store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0);
+        store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l);
+        blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash,
+                     ARGON2_PREHASH_SEED_LENGTH);
+        load_block(&instance->memory[l * instance->lane_length + 0],
+                   blockhash_bytes);
+
+        store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1);
+        blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash,
+                     ARGON2_PREHASH_SEED_LENGTH);
+        load_block(&instance->memory[l * instance->lane_length + 1],
+                   blockhash_bytes);
+    }
+    clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
+}
+static
+void initial_hash(uint8_t *blockhash, argon2_context *context,
+                  argon2_type type) {
+    blake2b_state BlakeHash;
+    uint8_t value[sizeof(uint32_t)];
+
+    if (NULL == context || NULL == blockhash) {
+        return;
+    }
+
+    _cryptonite_blake2b_init(&BlakeHash, ARGON2_PREHASH_DIGEST_LENGTH);
+
+    store32(&value, context->lanes);
+    _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
+
+    store32(&value, context->outlen);
+    _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
+
+    store32(&value, context->m_cost);
+    _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
+
+    store32(&value, context->t_cost);
+    _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
+
+    store32(&value, context->version);
+    _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
+
+    store32(&value, (uint32_t)type);
+    _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
+
+    store32(&value, context->pwdlen);
+    _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
+
+    if (context->pwd != NULL) {
+        _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)context->pwd,
+                       context->pwdlen);
+
+        if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
+            secure_wipe_memory(context->pwd, context->pwdlen);
+            context->pwdlen = 0;
+        }
+    }
+
+    store32(&value, context->saltlen);
+    _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
+
+    if (context->salt != NULL) {
+        _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)context->salt,
+                       context->saltlen);
+    }
+
+    store32(&value, context->secretlen);
+    _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
+
+    if (context->secret != NULL) {
+        _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)context->secret,
+                       context->secretlen);
+
+        if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
+            secure_wipe_memory(context->secret, context->secretlen);
+            context->secretlen = 0;
+        }
+    }
+
+    store32(&value, context->adlen);
+    _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
+
+    if (context->ad != NULL) {
+        _cryptonite_blake2b_update(&BlakeHash, (const uint8_t *)context->ad,
+                       context->adlen);
+    }
+
+    _cryptonite_blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
+}
+static
+int initialize(argon2_instance_t *instance, argon2_context *context) {
+    uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH];
+    int result = ARGON2_OK;
+
+    if (instance == NULL || context == NULL)
+        return ARGON2_INCORRECT_PARAMETER;
+    instance->context_ptr = context;
+
+    /* 1. Memory allocation */
+    result = allocate_memory(context, (uint8_t **)&(instance->memory),
+                             instance->memory_blocks, sizeof(block));
+    if (result != ARGON2_OK) {
+        return result;
+    }
+
+    /* 2. Initial hashing */
+    /* H_0 + 8 extra bytes to produce the first blocks */
+    /* uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; */
+    /* Hashing all inputs */
+    initial_hash(blockhash, context, instance->type);
+    /* Zeroing 8 extra bytes */
+    clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
+                          ARGON2_PREHASH_SEED_LENGTH -
+                              ARGON2_PREHASH_DIGEST_LENGTH);
+
+#ifdef GENKAT
+    initial_kat(blockhash, context, instance->type);
+#endif
+
+    /* 3. Creating first blocks, we always have at least two blocks in a slice
+     */
+    fill_first_blocks(blockhash, instance);
+    /* Clearing the hash */
+    clear_internal_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH);
+
+    return ARGON2_OK;
+}
diff --git a/bundled/cbits/argon2/core.h b/bundled/cbits/argon2/core.h
new file mode 100644
index 0000000..888cc3c
--- /dev/null
+++ b/bundled/cbits/argon2/core.h
@@ -0,0 +1,234 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef ARGON2_CORE_H
+#define ARGON2_CORE_H
+
+#include "argon2.h"
+
+#if defined(_MSC_VER)
+#define ALIGN(n) __declspec(align(16))
+#elif defined(__GNUC__) || defined(__clang)
+#define ALIGN(x) __attribute__((__aligned__(x)))
+#else
+#define ALIGN(x)
+#endif
+
+#define CONST_CAST(x) (x)(uintptr_t)
+
+/**********************Argon2 internal constants*******************************/
+
+enum argon2_core_constants {
+    /* Memory block size in bytes */
+    ARGON2_BLOCK_SIZE = 1024,
+    ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8,
+    ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16,
+
+    /* Number of pseudo-random values generated by one call to Blake in Argon2i
+       to
+       generate reference block positions */
+    ARGON2_ADDRESSES_IN_BLOCK = 128,
+
+    /* Pre-hashing digest length and its extension*/
+    ARGON2_PREHASH_DIGEST_LENGTH = 64,
+    ARGON2_PREHASH_SEED_LENGTH = 72
+};
+
+/*************************Argon2 internal data types***********************/
+
+/*
+ * Structure for the (1KB) memory block implemented as 128 64-bit words.
+ * Memory blocks can be copied, XORed. Internal words can be accessed by [] (no
+ * bounds checking).
+ */
+typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block;
+
+/*****************Functions that work with the block******************/
+
+/* Initialize each byte of the block with @in */
+static void init_block_value(block *b, uint8_t in);
+
+/* Copy block @src to block @dst */
+static void copy_block(block *dst, const block *src);
+
+/* XOR @src onto @dst bytewise */
+static void xor_block(block *dst, const block *src);
+
+/*
+ * Argon2 instance: memory pointer, number of passes, amount of memory, type,
+ * and derived values.
+ * Used to evaluate the number and location of blocks to construct in each
+ * thread
+ */
+typedef struct Argon2_instance_t {
+    block *memory;          /* Memory pointer */
+    uint32_t version;
+    uint32_t passes;        /* Number of passes */
+    uint32_t memory_blocks; /* Number of blocks in memory */
+    uint32_t segment_length;
+    uint32_t lane_length;
+    uint32_t lanes;
+    uint32_t threads;
+    argon2_type type;
+    int print_internals; /* whether to print the memory blocks */
+    argon2_context *context_ptr; /* points back to original context */
+} argon2_instance_t;
+
+/*
+ * Argon2 position: where we construct the block right now. Used to distribute
+ * work between threads.
+ */
+typedef struct Argon2_position_t {
+    uint32_t pass;
+    uint32_t lane;
+    uint8_t slice;
+    uint32_t index;
+} argon2_position_t;
+
+/*Struct that holds the inputs for thread handling FillSegment*/
+typedef struct Argon2_thread_data {
+    argon2_instance_t *instance_ptr;
+    argon2_position_t pos;
+} argon2_thread_data;
+
+/*************************Argon2 core functions********************************/
+
+/* Allocates memory to the given pointer, uses the appropriate allocator as
+ * specified in the context. Total allocated memory is num*size.
+ * @param context argon2_context which specifies the allocator
+ * @param memory pointer to the pointer to the memory
+ * @param size the size in bytes for each element to be allocated
+ * @param num the number of elements to be allocated
+ * @return ARGON2_OK if @memory is a valid pointer and memory is allocated
+ */
+static int allocate_memory(const argon2_context *context, uint8_t **memory,
+                    size_t num, size_t size);
+
+/*
+ * Frees memory at the given pointer, uses the appropriate deallocator as
+ * specified in the context. Also cleans the memory using clear_internal_memory.
+ * @param context argon2_context which specifies the deallocator
+ * @param memory pointer to buffer to be freed
+ * @param size the size in bytes for each element to be deallocated
+ * @param num the number of elements to be deallocated
+ */
+static void free_memory(const argon2_context *context, uint8_t *memory,
+                 size_t num, size_t size);
+
+/* Function that securely cleans the memory. This ignores any flags set
+ * regarding clearing memory. Usually one just calls clear_internal_memory.
+ * @param mem Pointer to the memory
+ * @param s Memory size in bytes
+ */
+static void secure_wipe_memory(void *v, size_t n);
+
+/* Function that securely clears the memory if FLAG_clear_internal_memory is
+ * set. If the flag isn't set, this function does nothing.
+ * @param mem Pointer to the memory
+ * @param s Memory size in bytes
+ */
+static void clear_internal_memory(void *v, size_t n);
+
+/*
+ * Computes absolute position of reference block in the lane following a skewed
+ * distribution and using a pseudo-random value as input
+ * @param instance Pointer to the current instance
+ * @param position Pointer to the current position
+ * @param pseudo_rand 32-bit pseudo-random value used to determine the position
+ * @param same_lane Indicates if the block will be taken from the current lane.
+ * If so we can reference the current segment
+ * @pre All pointers must be valid
+ */
+static uint32_t index_alpha(const argon2_instance_t *instance,
+                     const argon2_position_t *position, uint32_t pseudo_rand,
+                     int same_lane);
+
+/*
+ * Function that validates all inputs against predefined restrictions and return
+ * an error code
+ * @param context Pointer to current Argon2 context
+ * @return ARGON2_OK if everything is all right, otherwise one of error codes
+ * (all defined in <argon2.h>
+ */
+static int validate_inputs(const argon2_context *context);
+
+/*
+ * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears
+ * password and secret if needed
+ * @param  context  Pointer to the Argon2 internal structure containing memory
+ * pointer, and parameters for time and space requirements.
+ * @param  blockhash Buffer for pre-hashing digest
+ * @param  type Argon2 type
+ * @pre    @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes
+ * allocated
+ */
+static void initial_hash(uint8_t *blockhash, argon2_context *context,
+                  argon2_type type);
+
+/*
+ * Function creates first 2 blocks per lane
+ * @param instance Pointer to the current instance
+ * @param blockhash Pointer to the pre-hashing digest
+ * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values
+ */
+static void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
+
+/*
+ * Function allocates memory, hashes the inputs with Blake,  and creates first
+ * two blocks. Returns the pointer to the main memory with 2 blocks per lane
+ * initialized
+ * @param  context  Pointer to the Argon2 internal structure containing memory
+ * pointer, and parameters for time and space requirements.
+ * @param  instance Current Argon2 instance
+ * @return Zero if successful, -1 if memory failed to allocate. @context->state
+ * will be modified if successful.
+ */
+static int initialize(argon2_instance_t *instance, argon2_context *context);
+
+/*
+ * XORing the last block of each lane, hashing it, making the tag. Deallocates
+ * the memory.
+ * @param context Pointer to current Argon2 context (use only the out parameters
+ * from it)
+ * @param instance Pointer to current instance of Argon2
+ * @pre instance->state must point to necessary amount of memory
+ * @pre context->out must point to outlen bytes of memory
+ * @pre if context->free_cbk is not NULL, it should point to a function that
+ * deallocates memory
+ */
+static void finalize(const argon2_context *context, argon2_instance_t *instance);
+
+/*
+ * Function that fills the segment using previous segments also from other
+ * threads
+ * @param context current context
+ * @param instance Pointer to the current instance
+ * @param position Current position
+ * @pre all block pointers must be valid
+ */
+static void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position);
+
+/*
+ * Function that fills the entire memory t_cost times based on the first two
+ * blocks in each lane
+ * @param instance Pointer to the current instance
+ * @return ARGON2_OK if successful, @context->state
+ */
+static int fill_memory_blocks(argon2_instance_t *instance);
+
+#endif
diff --git a/bundled/cbits/argon2/opt.c b/bundled/cbits/argon2/opt.c
new file mode 100644
index 0000000..0102bdc
--- /dev/null
+++ b/bundled/cbits/argon2/opt.c
@@ -0,0 +1,186 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "argon2.h"
+#include "opt.h"
+
+#include "blake2/blake2.h"
+#include "blake2/blamka-round-opt.h"
+
+static void fill_block(__m128i *state, const block *ref_block, block *next_block,
+                int with_xor) {
+    __m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) {
+        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+            state[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+            block_XY[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
+        }
+    } else {
+        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+        }
+    }
+
+    for (i = 0; i < 8; ++i) {
+        BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
+            state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
+            state[8 * i + 6], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 8; ++i) {
+        BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
+            state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
+            state[8 * 6 + i], state[8 * 7 + i]);
+    }
+
+    for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+        state[i] = _mm_xor_si128(state[i], block_XY[i]);
+        _mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
+    }
+}
+
+static void next_addresses(block *address_block, block *input_block) {
+    /*Temporary zero-initialized blocks*/
+    __m128i zero_block[ARGON2_OWORDS_IN_BLOCK];
+    __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK];
+
+    memset(zero_block, 0, sizeof(zero_block));
+    memset(zero2_block, 0, sizeof(zero2_block));
+
+    /*Increasing index counter*/
+    input_block->v[6]++;
+
+    /*First iteration of G*/
+    fill_block(zero_block, input_block, address_block, 0);
+
+    /*Second iteration of G*/
+    fill_block(zero2_block, address_block, address_block, 0);
+}
+
+static void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position) {
+    block *ref_block = NULL, *curr_block = NULL;
+    block address_block, input_block;
+    uint64_t pseudo_rand, ref_index, ref_lane;
+    uint32_t prev_offset, curr_offset;
+    uint32_t starting_index, i;
+    __m128i state[64];
+    int data_independent_addressing;
+
+    if (instance == NULL) {
+        return;
+    }
+
+    data_independent_addressing =
+        (instance->type == Argon2_i) ||
+        (instance->type == Argon2_id && (position.pass == 0) &&
+         (position.slice < ARGON2_SYNC_POINTS / 2));
+
+    if (data_independent_addressing) {
+        init_block_value(&input_block, 0);
+
+        input_block.v[0] = position.pass;
+        input_block.v[1] = position.lane;
+        input_block.v[2] = position.slice;
+        input_block.v[3] = instance->memory_blocks;
+        input_block.v[4] = instance->passes;
+        input_block.v[5] = instance->type;
+    }
+
+    starting_index = 0;
+
+    if ((0 == position.pass) && (0 == position.slice)) {
+        starting_index = 2; /* we have already generated the first two blocks */
+
+        /* Don't forget to generate the first block of addresses: */
+        if (data_independent_addressing) {
+            next_addresses(&address_block, &input_block);
+        }
+    }
+
+    /* Offset of the current block */
+    curr_offset = position.lane * instance->lane_length +
+                  position.slice * instance->segment_length + starting_index;
+
+    if (0 == curr_offset % instance->lane_length) {
+        /* Last block in this lane */
+        prev_offset = curr_offset + instance->lane_length - 1;
+    } else {
+        /* Previous block */
+        prev_offset = curr_offset - 1;
+    }
+
+    memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
+
+    for (i = starting_index; i < instance->segment_length;
+         ++i, ++curr_offset, ++prev_offset) {
+        /*1.1 Rotating prev_offset if needed */
+        if (curr_offset % instance->lane_length == 1) {
+            prev_offset = curr_offset - 1;
+        }
+
+        /* 1.2 Computing the index of the reference block */
+        /* 1.2.1 Taking pseudo-random value from the previous block */
+        if (data_independent_addressing) {
+            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
+                next_addresses(&address_block, &input_block);
+            }
+            pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
+        } else {
+            pseudo_rand = instance->memory[prev_offset].v[0];
+        }
+
+        /* 1.2.2 Computing the lane of the reference block */
+        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
+
+        if ((position.pass == 0) && (position.slice == 0)) {
+            /* Can not reference other lanes yet */
+            ref_lane = position.lane;
+        }
+
+        /* 1.2.3 Computing the number of possible reference block within the
+         * lane.
+         */
+        position.index = i;
+        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
+                                ref_lane == position.lane);
+
+        /* 2 Creating a new block */
+        ref_block =
+            instance->memory + instance->lane_length * ref_lane + ref_index;
+        curr_block = instance->memory + curr_offset;
+        if (ARGON2_VERSION_10 == instance->version) {
+            /* version 1.2.1 and earlier: overwrite, not XOR */
+            fill_block(state, ref_block, curr_block, 0);
+        } else {
+            if(0 == position.pass) {
+                fill_block(state, ref_block, curr_block, 0);
+            } else {
+                fill_block(state, ref_block, curr_block, 1);
+            }
+        }
+    }
+}
diff --git a/bundled/cbits/argon2/opt.h b/bundled/cbits/argon2/opt.h
new file mode 100644
index 0000000..825bbfc
--- /dev/null
+++ b/bundled/cbits/argon2/opt.h
@@ -0,0 +1,35 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef ARGON2_OPT_H
+#define ARGON2_OPT_H
+
+#include "core.h"
+#include <emmintrin.h>
+
+/*
+ * Function fills a new memory block and optionally XORs the old block over the new one.
+ * Memory must be initialized.
+ * @param state Pointer to the just produced block. Content will be updated(!)
+ * @param ref_block Pointer to the reference block
+ * @param next_block Pointer to the block to be XORed over. May coincide with @ref_block
+ * @param with_xor Whether to XOR into the new block (1) or just overwrite (0)
+ * @pre all block pointers must be valid
+ */
+static void fill_block(__m128i *s, const block *ref_block, block *next_block, int with_xor);
+
+#endif /* ARGON2_OPT_H */
diff --git a/bundled/cbits/argon2/ref.c b/bundled/cbits/argon2/ref.c
new file mode 100644
index 0000000..f30e294
--- /dev/null
+++ b/bundled/cbits/argon2/ref.c
@@ -0,0 +1,185 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "argon2.h"
+#include "ref.h"
+
+#include "blamka-round-ref.h"
+#include "blake2-impl.h"
+#include "blake2.h"
+
+
+static void fill_block(const block *prev_block, const block *ref_block,
+                block *next_block, int with_xor) {
+    block blockR, block_tmp;
+    unsigned i;
+
+    copy_block(&blockR, ref_block);
+    xor_block(&blockR, prev_block);
+    copy_block(&block_tmp, &blockR);
+    /* Now blockR = ref_block + prev_block and block_tmp = ref_block + prev_block */
+    if (with_xor) {
+        /* Saving the next block contents for XOR over: */
+        xor_block(&block_tmp, next_block);
+        /* Now blockR = ref_block + prev_block and
+           block_tmp = ref_block + prev_block + next_block */
+    }
+
+    /* Apply Blake2 on columns of 64-bit words: (0,1,...,15) , then
+       (16,17,..31)... finally (112,113,...127) */
+    for (i = 0; i < 8; ++i) {
+        BLAKE2_ROUND_NOMSG(
+            blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2],
+            blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5],
+            blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8],
+            blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11],
+            blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14],
+            blockR.v[16 * i + 15]);
+    }
+
+    /* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...112,113), then
+       (2,3,18,19,...,114,115).. finally (14,15,30,31,...,126,127) */
+    for (i = 0; i < 8; i++) {
+        BLAKE2_ROUND_NOMSG(
+            blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16],
+            blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33],
+            blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64],
+            blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81],
+            blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112],
+            blockR.v[2 * i + 113]);
+    }
+
+    copy_block(next_block, &block_tmp);
+    xor_block(next_block, &blockR);
+}
+
+static void next_addresses(block *address_block, block *input_block,
+                           const block *zero_block) {
+    input_block->v[6]++;
+    fill_block(zero_block, input_block, address_block, 0);
+    fill_block(zero_block, address_block, address_block, 0);
+}
+
+static void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position) {
+    block *ref_block = NULL, *curr_block = NULL;
+    block address_block, input_block, zero_block;
+    uint64_t pseudo_rand, ref_index, ref_lane;
+    uint32_t prev_offset, curr_offset;
+    uint32_t starting_index;
+    uint32_t i;
+    int data_independent_addressing;
+
+    if (instance == NULL) {
+        return;
+    }
+
+    data_independent_addressing =
+        (instance->type == Argon2_i) ||
+        (instance->type == Argon2_id && (position.pass == 0) &&
+         (position.slice < ARGON2_SYNC_POINTS / 2));
+
+    if (data_independent_addressing) {
+        init_block_value(&zero_block, 0);
+        init_block_value(&input_block, 0);
+
+        input_block.v[0] = position.pass;
+        input_block.v[1] = position.lane;
+        input_block.v[2] = position.slice;
+        input_block.v[3] = instance->memory_blocks;
+        input_block.v[4] = instance->passes;
+        input_block.v[5] = instance->type;
+    }
+
+    starting_index = 0;
+
+    if ((0 == position.pass) && (0 == position.slice)) {
+        starting_index = 2; /* we have already generated the first two blocks */
+
+        /* Don't forget to generate the first block of addresses: */
+        if (data_independent_addressing) {
+            next_addresses(&address_block, &input_block, &zero_block);
+        }
+    }
+
+    /* Offset of the current block */
+    curr_offset = position.lane * instance->lane_length +
+                  position.slice * instance->segment_length + starting_index;
+
+    if (0 == curr_offset % instance->lane_length) {
+        /* Last block in this lane */
+        prev_offset = curr_offset + instance->lane_length - 1;
+    } else {
+        /* Previous block */
+        prev_offset = curr_offset - 1;
+    }
+
+    for (i = starting_index; i < instance->segment_length;
+         ++i, ++curr_offset, ++prev_offset) {
+        /*1.1 Rotating prev_offset if needed */
+        if (curr_offset % instance->lane_length == 1) {
+            prev_offset = curr_offset - 1;
+        }
+
+        /* 1.2 Computing the index of the reference block */
+        /* 1.2.1 Taking pseudo-random value from the previous block */
+        if (data_independent_addressing) {
+            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
+                next_addresses(&address_block, &input_block, &zero_block);
+            }
+            pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
+        } else {
+            pseudo_rand = instance->memory[prev_offset].v[0];
+        }
+
+        /* 1.2.2 Computing the lane of the reference block */
+        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
+
+        if ((position.pass == 0) && (position.slice == 0)) {
+            /* Can not reference other lanes yet */
+            ref_lane = position.lane;
+        }
+
+        /* 1.2.3 Computing the number of possible reference block within the
+         * lane.
+         */
+        position.index = i;
+        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
+                                ref_lane == position.lane);
+
+        /* 2 Creating a new block */
+        ref_block =
+            instance->memory + instance->lane_length * ref_lane + ref_index;
+        curr_block = instance->memory + curr_offset;
+        if (ARGON2_VERSION_10 == instance->version) {
+            /* version 1.2.1 and earlier: overwrite, not XOR */
+            fill_block(instance->memory + prev_offset, ref_block, curr_block, 0);
+        } else {
+            if(0 == position.pass) {
+                fill_block(instance->memory + prev_offset, ref_block,
+                           curr_block, 0);
+            } else {
+                fill_block(instance->memory + prev_offset, ref_block,
+                           curr_block, 1);
+            }
+        }
+    }
+}
diff --git a/bundled/cbits/argon2/ref.h b/bundled/cbits/argon2/ref.h
new file mode 100644
index 0000000..676b454
--- /dev/null
+++ b/bundled/cbits/argon2/ref.h
@@ -0,0 +1,35 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef ARGON2_REF_H
+#define ARGON2_REF_H
+
+#include "core.h"
+
+/*
+ * Function fills a new memory block and optionally XORs the old block over the new one.
+ * @next_block must be initialized.
+ * @param prev_block Pointer to the previous block
+ * @param ref_block Pointer to the reference block
+ * @param next_block Pointer to the block to be constructed
+ * @param with_xor Whether to XOR into the new block (1) or just overwrite (0)
+ * @pre all block pointers must be valid
+ */
+static void fill_block(const block *prev_block, const block *ref_block,
+                block *next_block, int with_xor);
+
+#endif /* ARGON2_REF_H */
diff --git a/bundled/cbits/argon2/thread.c b/bundled/cbits/argon2/thread.c
new file mode 100644
index 0000000..1686481
--- /dev/null
+++ b/bundled/cbits/argon2/thread.c
@@ -0,0 +1,57 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#if !defined(ARGON2_NO_THREADS)
+
+#include "thread.h"
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+static int argon2_thread_create(argon2_thread_handle_t *handle,
+                         argon2_thread_func_t func, void *args) {
+    if (NULL == handle || func == NULL) {
+        return -1;
+    }
+#if defined(_WIN32)
+    *handle = _beginthreadex(NULL, 0, func, args, 0, NULL);
+    return *handle != 0 ? 0 : -1;
+#else
+    return pthread_create(handle, NULL, func, args);
+#endif
+}
+
+static int argon2_thread_join(argon2_thread_handle_t handle) {
+#if defined(_WIN32)
+    if (WaitForSingleObject((HANDLE)handle, INFINITE) == WAIT_OBJECT_0) {
+        return CloseHandle((HANDLE)handle) != 0 ? 0 : -1;
+    }
+    return -1;
+#else
+    return pthread_join(handle, NULL);
+#endif
+}
+
+static void argon2_thread_exit(void) {
+#if defined(_WIN32)
+    _endthreadex(0);
+#else
+    pthread_exit(NULL);
+#endif
+}
+
+#endif /* ARGON2_NO_THREADS */
diff --git a/bundled/cbits/argon2/thread.h b/bundled/cbits/argon2/thread.h
new file mode 100644
index 0000000..d91882c
--- /dev/null
+++ b/bundled/cbits/argon2/thread.h
@@ -0,0 +1,67 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef ARGON2_THREAD_H
+#define ARGON2_THREAD_H
+
+#if !defined(ARGON2_NO_THREADS)
+
+/*
+        Here we implement an abstraction layer for the simpĺe requirements
+        of the Argon2 code. We only require 3 primitives---thread creation,
+        joining, and termination---so full emulation of the pthreads API
+        is unwarranted. Currently we wrap pthreads and Win32 threads.
+
+        The API defines 2 types: the function pointer type,
+   argon2_thread_func_t,
+        and the type of the thread handle---argon2_thread_handle_t.
+*/
+#if defined(_WIN32)
+#include <process.h>
+typedef unsigned(__stdcall *argon2_thread_func_t)(void *);
+typedef uintptr_t argon2_thread_handle_t;
+#else
+#include <pthread.h>
+typedef void *(*argon2_thread_func_t)(void *);
+typedef pthread_t argon2_thread_handle_t;
+#endif
+
+/* Creates a thread
+ * @param handle pointer to a thread handle, which is the output of this
+ * function. Must not be NULL.
+ * @param func A function pointer for the thread's entry point. Must not be
+ * NULL.
+ * @param args Pointer that is passed as an argument to @func. May be NULL.
+ * @return 0 if @handle and @func are valid pointers and a thread is successfuly
+ * created.
+ */
+static int argon2_thread_create(argon2_thread_handle_t *handle,
+                         argon2_thread_func_t func, void *args);
+
+/* Waits for a thread to terminate
+ * @param handle Handle to a thread created with argon2_thread_create.
+ * @return 0 if @handle is a valid handle, and joining completed successfully.
+*/
+static int argon2_thread_join(argon2_thread_handle_t handle);
+
+/* Terminate the current thread. Must be run inside a thread created by
+ * argon2_thread_create.
+*/
+static void argon2_thread_exit(void);
+
+#endif /* ARGON2_NO_THREADS */
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2-config.h b/bundled/cbits/blake2/sse/blake2-config.h
new file mode 100644
index 0000000..a524aa9
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2-config.h
@@ -0,0 +1,72 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2_CONFIG_H
+#define BLAKE2_CONFIG_H
+
+/* These don't work everywhere */
+#if defined(__SSE2__) || defined(__x86_64__) || defined(__amd64__)
+#define HAVE_SSE2
+#endif
+
+#if defined(__SSSE3__)
+#define HAVE_SSSE3
+#endif
+
+#if defined(__SSE4_1__)
+#define HAVE_SSE41
+#endif
+
+#if defined(__AVX__)
+#define HAVE_AVX
+#endif
+
+#if defined(__XOP__)
+#define HAVE_XOP
+#endif
+
+
+#ifdef HAVE_AVX2
+#ifndef HAVE_AVX
+#define HAVE_AVX
+#endif
+#endif
+
+#ifdef HAVE_XOP
+#ifndef HAVE_AVX
+#define HAVE_AVX
+#endif
+#endif
+
+#ifdef HAVE_AVX
+#ifndef HAVE_SSE41
+#define HAVE_SSE41
+#endif
+#endif
+
+#ifdef HAVE_SSE41
+#ifndef HAVE_SSSE3
+#define HAVE_SSSE3
+#endif
+#endif
+
+#ifdef HAVE_SSSE3
+#define HAVE_SSE2
+#endif
+
+#if !defined(HAVE_SSE2)
+#error "This code requires at least SSE2."
+#endif
+
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2-impl.h b/bundled/cbits/blake2/sse/blake2-impl.h
new file mode 100644
index 0000000..c1df82e
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2-impl.h
@@ -0,0 +1,160 @@
+/*
+   BLAKE2 reference source code package - reference C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2_IMPL_H
+#define BLAKE2_IMPL_H
+
+#include <stdint.h>
+#include <string.h>
+
+#if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L)
+  #if   defined(_MSC_VER)
+    #define BLAKE2_INLINE __inline
+  #elif defined(__GNUC__)
+    #define BLAKE2_INLINE __inline__
+  #else
+    #define BLAKE2_INLINE
+  #endif
+#else
+  #define BLAKE2_INLINE inline
+#endif
+
+static BLAKE2_INLINE uint32_t load32( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  uint32_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
+#else
+  const uint8_t *p = ( const uint8_t * )src;
+  return (( uint32_t )( p[0] ) <<  0) |
+         (( uint32_t )( p[1] ) <<  8) |
+         (( uint32_t )( p[2] ) << 16) |
+         (( uint32_t )( p[3] ) << 24) ;
+#endif
+}
+
+static BLAKE2_INLINE uint64_t load64( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  uint64_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
+#else
+  const uint8_t *p = ( const uint8_t * )src;
+  return (( uint64_t )( p[0] ) <<  0) |
+         (( uint64_t )( p[1] ) <<  8) |
+         (( uint64_t )( p[2] ) << 16) |
+         (( uint64_t )( p[3] ) << 24) |
+         (( uint64_t )( p[4] ) << 32) |
+         (( uint64_t )( p[5] ) << 40) |
+         (( uint64_t )( p[6] ) << 48) |
+         (( uint64_t )( p[7] ) << 56) ;
+#endif
+}
+
+static BLAKE2_INLINE uint16_t load16( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  uint16_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
+#else
+  const uint8_t *p = ( const uint8_t * )src;
+  return ( uint16_t )((( uint32_t )( p[0] ) <<  0) |
+                      (( uint32_t )( p[1] ) <<  8));
+#endif
+}
+
+static BLAKE2_INLINE void store16( void *dst, uint16_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  memcpy(dst, &w, sizeof w);
+#else
+  uint8_t *p = ( uint8_t * )dst;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w;
+#endif
+}
+
+static BLAKE2_INLINE void store32( void *dst, uint32_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  memcpy(dst, &w, sizeof w);
+#else
+  uint8_t *p = ( uint8_t * )dst;
+  p[0] = (uint8_t)(w >>  0);
+  p[1] = (uint8_t)(w >>  8);
+  p[2] = (uint8_t)(w >> 16);
+  p[3] = (uint8_t)(w >> 24);
+#endif
+}
+
+static BLAKE2_INLINE void store64( void *dst, uint64_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  memcpy(dst, &w, sizeof w);
+#else
+  uint8_t *p = ( uint8_t * )dst;
+  p[0] = (uint8_t)(w >>  0);
+  p[1] = (uint8_t)(w >>  8);
+  p[2] = (uint8_t)(w >> 16);
+  p[3] = (uint8_t)(w >> 24);
+  p[4] = (uint8_t)(w >> 32);
+  p[5] = (uint8_t)(w >> 40);
+  p[6] = (uint8_t)(w >> 48);
+  p[7] = (uint8_t)(w >> 56);
+#endif
+}
+
+static BLAKE2_INLINE uint64_t load48( const void *src )
+{
+  const uint8_t *p = ( const uint8_t * )src;
+  return (( uint64_t )( p[0] ) <<  0) |
+         (( uint64_t )( p[1] ) <<  8) |
+         (( uint64_t )( p[2] ) << 16) |
+         (( uint64_t )( p[3] ) << 24) |
+         (( uint64_t )( p[4] ) << 32) |
+         (( uint64_t )( p[5] ) << 40) ;
+}
+
+static BLAKE2_INLINE void store48( void *dst, uint64_t w )
+{
+  uint8_t *p = ( uint8_t * )dst;
+  p[0] = (uint8_t)(w >>  0);
+  p[1] = (uint8_t)(w >>  8);
+  p[2] = (uint8_t)(w >> 16);
+  p[3] = (uint8_t)(w >> 24);
+  p[4] = (uint8_t)(w >> 32);
+  p[5] = (uint8_t)(w >> 40);
+}
+
+static BLAKE2_INLINE uint32_t rotr32( const uint32_t w, const unsigned c )
+{
+  return ( w >> c ) | ( w << ( 32 - c ) );
+}
+
+static BLAKE2_INLINE uint64_t rotr64( const uint64_t w, const unsigned c )
+{
+  return ( w >> c ) | ( w << ( 64 - c ) );
+}
+
+/* prevents compiler optimizing out memset() */
+static BLAKE2_INLINE void secure_zero_memory(void *v, size_t n)
+{
+  static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
+  memset_v(v, 0, n);
+}
+
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2.h b/bundled/cbits/blake2/sse/blake2.h
new file mode 100644
index 0000000..d69b087
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2.h
@@ -0,0 +1,195 @@
+/*
+   BLAKE2 reference source code package - reference C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2_H
+#define BLAKE2_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#define BLAKE2_PACKED(x) __pragma(pack(push, 1)) x __pragma(pack(pop))
+#else
+#define BLAKE2_PACKED(x) x __attribute__((packed))
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+  enum blake2s_constant
+  {
+    BLAKE2S_BLOCKBYTES = 64,
+    BLAKE2S_OUTBYTES   = 32,
+    BLAKE2S_KEYBYTES   = 32,
+    BLAKE2S_SALTBYTES  = 8,
+    BLAKE2S_PERSONALBYTES = 8
+  };
+
+  enum blake2b_constant
+  {
+    BLAKE2B_BLOCKBYTES = 128,
+    BLAKE2B_OUTBYTES   = 64,
+    BLAKE2B_KEYBYTES   = 64,
+    BLAKE2B_SALTBYTES  = 16,
+    BLAKE2B_PERSONALBYTES = 16
+  };
+
+  typedef struct blake2s_state__
+  {
+    uint32_t h[8];
+    uint32_t t[2];
+    uint32_t f[2];
+    uint8_t  buf[BLAKE2S_BLOCKBYTES];
+    size_t   buflen;
+    size_t   outlen;
+    uint8_t  last_node;
+  } blake2s_state;
+
+  typedef struct blake2b_state__
+  {
+    uint64_t h[8];
+    uint64_t t[2];
+    uint64_t f[2];
+    uint8_t  buf[BLAKE2B_BLOCKBYTES];
+    size_t   buflen;
+    size_t   outlen;
+    uint8_t  last_node;
+  } blake2b_state;
+
+  typedef struct blake2sp_state__
+  {
+    blake2s_state S[8][1];
+    blake2s_state R[1];
+    uint8_t       buf[8 * BLAKE2S_BLOCKBYTES];
+    size_t        buflen;
+    size_t        outlen;
+  } blake2sp_state;
+
+  typedef struct blake2bp_state__
+  {
+    blake2b_state S[4][1];
+    blake2b_state R[1];
+    uint8_t       buf[4 * BLAKE2B_BLOCKBYTES];
+    size_t        buflen;
+    size_t        outlen;
+  } blake2bp_state;
+
+
+  BLAKE2_PACKED(struct blake2s_param__
+  {
+    uint8_t  digest_length; /* 1 */
+    uint8_t  key_length;    /* 2 */
+    uint8_t  fanout;        /* 3 */
+    uint8_t  depth;         /* 4 */
+    uint32_t leaf_length;   /* 8 */
+    uint32_t node_offset;  /* 12 */
+    uint16_t xof_length;    /* 14 */
+    uint8_t  node_depth;    /* 15 */
+    uint8_t  inner_length;  /* 16 */
+    /* uint8_t  reserved[0]; */
+    uint8_t  salt[BLAKE2S_SALTBYTES]; /* 24 */
+    uint8_t  personal[BLAKE2S_PERSONALBYTES];  /* 32 */
+  });
+
+  typedef struct blake2s_param__ blake2s_param;
+
+  BLAKE2_PACKED(struct blake2b_param__
+  {
+    uint8_t  digest_length; /* 1 */
+    uint8_t  key_length;    /* 2 */
+    uint8_t  fanout;        /* 3 */
+    uint8_t  depth;         /* 4 */
+    uint32_t leaf_length;   /* 8 */
+    uint32_t node_offset;   /* 12 */
+    uint32_t xof_length;    /* 16 */
+    uint8_t  node_depth;    /* 17 */
+    uint8_t  inner_length;  /* 18 */
+    uint8_t  reserved[14];  /* 32 */
+    uint8_t  salt[BLAKE2B_SALTBYTES]; /* 48 */
+    uint8_t  personal[BLAKE2B_PERSONALBYTES];  /* 64 */
+  });
+
+  typedef struct blake2b_param__ blake2b_param;
+
+  typedef struct blake2xs_state__
+  {
+    blake2s_state S[1];
+    blake2s_param P[1];
+  } blake2xs_state;
+
+  typedef struct blake2xb_state__
+  {
+    blake2b_state S[1];
+    blake2b_param P[1];
+  } blake2xb_state;
+
+  /* Padded structs result in a compile-time error */
+  enum {
+    BLAKE2_DUMMY_1 = 1/(sizeof(blake2s_param) == BLAKE2S_OUTBYTES),
+    BLAKE2_DUMMY_2 = 1/(sizeof(blake2b_param) == BLAKE2B_OUTBYTES)
+  };
+
+  /* Streaming API */
+  int _cryptonite_blake2s_init( blake2s_state *S, size_t outlen );
+  int _cryptonite_blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen );
+  int _cryptonite_blake2s_init_param( blake2s_state *S, const blake2s_param *P );
+  int _cryptonite_blake2s_update( blake2s_state *S, const void *in, size_t inlen );
+  int _cryptonite_blake2s_final( blake2s_state *S, void *out, size_t outlen );
+
+  int _cryptonite_blake2b_init( blake2b_state *S, size_t outlen );
+  int _cryptonite_blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen );
+  int _cryptonite_blake2b_init_param( blake2b_state *S, const blake2b_param *P );
+  int _cryptonite_blake2b_update( blake2b_state *S, const void *in, size_t inlen );
+  int _cryptonite_blake2b_final( blake2b_state *S, void *out, size_t outlen );
+
+  int _cryptonite_blake2sp_init( blake2sp_state *S, size_t outlen );
+  int _cryptonite_blake2sp_init_key( blake2sp_state *S, size_t outlen, const void *key, size_t keylen );
+  int _cryptonite_blake2sp_update( blake2sp_state *S, const void *in, size_t inlen );
+  int _cryptonite_blake2sp_final( blake2sp_state *S, void *out, size_t outlen );
+
+  int _cryptonite_blake2bp_init( blake2bp_state *S, size_t outlen );
+  int _cryptonite_blake2bp_init_key( blake2bp_state *S, size_t outlen, const void *key, size_t keylen );
+  int _cryptonite_blake2bp_update( blake2bp_state *S, const void *in, size_t inlen );
+  int _cryptonite_blake2bp_final( blake2bp_state *S, void *out, size_t outlen );
+
+  /* Variable output length API */
+  int _cryptonite_blake2xs_init( blake2xs_state *S, const size_t outlen );
+  int _cryptonite_blake2xs_init_key( blake2xs_state *S, const size_t outlen, const void *key, size_t keylen );
+  int _cryptonite_blake2xs_update( blake2xs_state *S, const void *in, size_t inlen );
+  int _cryptonite_blake2xs_final(blake2xs_state *S, void *out, size_t outlen);
+
+  int _cryptonite_blake2xb_init( blake2xb_state *S, const size_t outlen );
+  int _cryptonite_blake2xb_init_key( blake2xb_state *S, const size_t outlen, const void *key, size_t keylen );
+  int _cryptonite_blake2xb_update( blake2xb_state *S, const void *in, size_t inlen );
+  int _cryptonite_blake2xb_final(blake2xb_state *S, void *out, size_t outlen);
+
+  /* Simple API */
+  int _cryptonite_blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+  int _cryptonite_blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+
+  int _cryptonite_blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+  int _cryptonite_blake2bp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+
+  int _cryptonite_blake2xs( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+  int _cryptonite_blake2xb( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+
+  /* This is simply an alias for blake2b */
+  int _cryptonite_blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2b-load-sse2.h b/bundled/cbits/blake2/sse/blake2b-load-sse2.h
new file mode 100644
index 0000000..23a8d40
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2b-load-sse2.h
@@ -0,0 +1,68 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2B_LOAD_SSE2_H
+#define BLAKE2B_LOAD_SSE2_H
+
+#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
+#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
+#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
+#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
+#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
+#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
+#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
+#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5)
+#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2)
+#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7)
+#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1)
+#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13)
+#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4)
+#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0)
+#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2)
+#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4)
+#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6)
+#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8)
+#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0)
+#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11)
+#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15)
+#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14)
+#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14)
+#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13)
+#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9)
+#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2)
+#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12)
+#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1)
+#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8)
+#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6)
+#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11)
+#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3)
+#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1)
+#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4)
+#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7)
+#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6)
+#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3)
+#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12)
+#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
+#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
+#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
+#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
+#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
+#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
+#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
+
+
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2b-load-sse41.h b/bundled/cbits/blake2/sse/blake2b-load-sse41.h
new file mode 100644
index 0000000..0eca865
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2b-load-sse41.h
@@ -0,0 +1,402 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2B_LOAD_SSE41_H
+#define BLAKE2B_LOAD_SSE41_H
+
+#define LOAD_MSG_0_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m1); \
+b1 = _mm_unpacklo_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_0_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m0, m1); \
+b1 = _mm_unpackhi_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_0_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m5); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_0_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m5); \
+b1 = _mm_unpackhi_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_1_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m2); \
+b1 = _mm_unpackhi_epi64(m4, m6); \
+} while(0)
+
+
+#define LOAD_MSG_1_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_alignr_epi8(m3, m7, 8); \
+} while(0)
+
+
+#define LOAD_MSG_1_3(b0, b1) \
+do \
+{ \
+b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
+b1 = _mm_unpackhi_epi64(m5, m2); \
+} while(0)
+
+
+#define LOAD_MSG_1_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m1); \
+b1 = _mm_unpackhi_epi64(m3, m1); \
+} while(0)
+
+
+#define LOAD_MSG_2_1(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m6, m5, 8); \
+b1 = _mm_unpackhi_epi64(m2, m7); \
+} while(0)
+
+
+#define LOAD_MSG_2_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m0); \
+b1 = _mm_blend_epi16(m1, m6, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_2_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m5, m1, 0xF0); \
+b1 = _mm_unpackhi_epi64(m3, m4); \
+} while(0)
+
+
+#define LOAD_MSG_2_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m3); \
+b1 = _mm_alignr_epi8(m2, m0, 8); \
+} while(0)
+
+
+#define LOAD_MSG_3_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m3, m1); \
+b1 = _mm_unpackhi_epi64(m6, m5); \
+} while(0)
+
+
+#define LOAD_MSG_3_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m0); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_3_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m1, m2, 0xF0); \
+b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_3_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m3, m5); \
+b1 = _mm_unpacklo_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_4_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m2); \
+b1 = _mm_unpacklo_epi64(m1, m5); \
+} while(0)
+
+
+#define LOAD_MSG_4_2(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m0, m3, 0xF0); \
+b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_4_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m7, m5, 0xF0); \
+b1 = _mm_blend_epi16(m3, m1, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_4_4(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m6, m0, 8); \
+b1 = _mm_blend_epi16(m4, m6, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_5_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m1, m3); \
+b1 = _mm_unpacklo_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_5_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m5); \
+b1 = _mm_unpackhi_epi64(m5, m1); \
+} while(0)
+
+
+#define LOAD_MSG_5_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m2, m3, 0xF0); \
+b1 = _mm_unpackhi_epi64(m7, m0); \
+} while(0)
+
+
+#define LOAD_MSG_5_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m6, m2); \
+b1 = _mm_blend_epi16(m7, m4, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_6_1(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m6, m0, 0xF0); \
+b1 = _mm_unpacklo_epi64(m7, m2); \
+} while(0)
+
+
+#define LOAD_MSG_6_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m2, m7); \
+b1 = _mm_alignr_epi8(m5, m6, 8); \
+} while(0)
+
+
+#define LOAD_MSG_6_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m3); \
+b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
+} while(0)
+
+
+#define LOAD_MSG_6_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m3, m1); \
+b1 = _mm_blend_epi16(m1, m5, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_7_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m6, m3); \
+b1 = _mm_blend_epi16(m6, m1, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_7_2(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m7, m5, 8); \
+b1 = _mm_unpackhi_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_7_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m2, m7); \
+b1 = _mm_unpacklo_epi64(m4, m1); \
+} while(0)
+
+
+#define LOAD_MSG_7_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m2); \
+b1 = _mm_unpacklo_epi64(m3, m5); \
+} while(0)
+
+
+#define LOAD_MSG_8_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m3, m7); \
+b1 = _mm_alignr_epi8(m0, m5, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m7, m4); \
+b1 = _mm_alignr_epi8(m4, m1, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_3(b0, b1) \
+do \
+{ \
+b0 = m6; \
+b1 = _mm_alignr_epi8(m5, m0, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_4(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m1, m3, 0xF0); \
+b1 = m2; \
+} while(0)
+
+
+#define LOAD_MSG_9_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_unpackhi_epi64(m3, m0); \
+} while(0)
+
+
+#define LOAD_MSG_9_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m1, m2); \
+b1 = _mm_blend_epi16(m3, m2, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_9_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m7, m4); \
+b1 = _mm_unpackhi_epi64(m1, m6); \
+} while(0)
+
+
+#define LOAD_MSG_9_4(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m7, m5, 8); \
+b1 = _mm_unpacklo_epi64(m6, m0); \
+} while(0)
+
+
+#define LOAD_MSG_10_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m1); \
+b1 = _mm_unpacklo_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_10_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m0, m1); \
+b1 = _mm_unpackhi_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_10_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m5); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_10_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m5); \
+b1 = _mm_unpackhi_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_11_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m2); \
+b1 = _mm_unpackhi_epi64(m4, m6); \
+} while(0)
+
+
+#define LOAD_MSG_11_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_alignr_epi8(m3, m7, 8); \
+} while(0)
+
+
+#define LOAD_MSG_11_3(b0, b1) \
+do \
+{ \
+b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
+b1 = _mm_unpackhi_epi64(m5, m2); \
+} while(0)
+
+
+#define LOAD_MSG_11_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m1); \
+b1 = _mm_unpackhi_epi64(m3, m1); \
+} while(0)
+
+
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2b-round.h b/bundled/cbits/blake2/sse/blake2b-round.h
new file mode 100644
index 0000000..6537fff
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2b-round.h
@@ -0,0 +1,157 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2B_ROUND_H
+#define BLAKE2B_ROUND_H
+
+#define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
+#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
+
+#define TOF(reg) _mm_castsi128_ps((reg))
+#define TOI(reg) _mm_castps_si128((reg))
+
+#define LIKELY(x) __builtin_expect((x),1)
+
+
+/* Microarchitecture-specific macros */
+#ifndef HAVE_XOP
+#ifdef HAVE_SSSE3
+#define _mm_roti_epi64(x, c) \
+    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
+    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
+    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
+    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
+    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
+#else
+#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) ))
+#endif
+#else
+/* ... */
+#endif
+
+
+
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  \
+  row4l = _mm_roti_epi64(row4l, -32); \
+  row4h = _mm_roti_epi64(row4h, -32); \
+  \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  \
+  row2l = _mm_roti_epi64(row2l, -24); \
+  row2h = _mm_roti_epi64(row2h, -24); \
+
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  \
+  row4l = _mm_roti_epi64(row4l, -16); \
+  row4h = _mm_roti_epi64(row4h, -16); \
+  \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  \
+  row2l = _mm_roti_epi64(row2l, -63); \
+  row2h = _mm_roti_epi64(row2h, -63); \
+
+#if defined(HAVE_SSSE3)
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = _mm_alignr_epi8(row2h, row2l, 8); \
+  t1 = _mm_alignr_epi8(row2l, row2h, 8); \
+  row2l = t0; \
+  row2h = t1; \
+  \
+  t0 = row3l; \
+  row3l = row3h; \
+  row3h = t0;    \
+  \
+  t0 = _mm_alignr_epi8(row4h, row4l, 8); \
+  t1 = _mm_alignr_epi8(row4l, row4h, 8); \
+  row4l = t1; \
+  row4h = t0;
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = _mm_alignr_epi8(row2l, row2h, 8); \
+  t1 = _mm_alignr_epi8(row2h, row2l, 8); \
+  row2l = t0; \
+  row2h = t1; \
+  \
+  t0 = row3l; \
+  row3l = row3h; \
+  row3h = t0; \
+  \
+  t0 = _mm_alignr_epi8(row4l, row4h, 8); \
+  t1 = _mm_alignr_epi8(row4h, row4l, 8); \
+  row4l = t1; \
+  row4h = t0;
+#else
+
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = row4l;\
+  t1 = row2l;\
+  row4l = row3l;\
+  row3l = row3h;\
+  row3h = row4l;\
+  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
+  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
+  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
+  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = row3l;\
+  row3l = row3h;\
+  row3h = t0;\
+  t0 = row2l;\
+  t1 = row4l;\
+  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
+  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
+  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
+  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
+
+#endif
+
+#if defined(HAVE_SSE41)
+#include "blake2b-load-sse41.h"
+#else
+#include "blake2b-load-sse2.h"
+#endif
+
+#define ROUND(r) \
+  LOAD_MSG_ ##r ##_1(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_2(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+  LOAD_MSG_ ##r ##_3(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_4(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
+
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2b.c b/bundled/cbits/blake2/sse/blake2b.c
new file mode 100644
index 0000000..23f4b25
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2b.c
@@ -0,0 +1,373 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#include "blake2-config.h"
+
+#ifdef _MSC_VER
+#include <intrin.h> /* for _mm_set_epi64x */
+#endif
+#include <emmintrin.h>
+#if defined(HAVE_SSSE3)
+#include <tmmintrin.h>
+#endif
+#if defined(HAVE_SSE41)
+#include <smmintrin.h>
+#endif
+#if defined(HAVE_AVX)
+#include <immintrin.h>
+#endif
+#if defined(HAVE_XOP)
+#include <x86intrin.h>
+#endif
+
+#include "blake2b-round.h"
+
+static const uint64_t blake2b_IV[8] =
+{
+  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+
+/* Some helper functions */
+static void blake2b_set_lastnode( blake2b_state *S )
+{
+  S->f[1] = (uint64_t)-1;
+}
+
+static int blake2b_is_lastblock( const blake2b_state *S )
+{
+  return S->f[0] != 0;
+}
+
+static void blake2b_set_lastblock( blake2b_state *S )
+{
+  if( S->last_node ) blake2b_set_lastnode( S );
+
+  S->f[0] = (uint64_t)-1;
+}
+
+static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
+{
+  S->t[0] += inc;
+  S->t[1] += ( S->t[0] < inc );
+}
+
+/* init xors IV with input parameter block */
+int _cryptonite_blake2b_init_param( blake2b_state *S, const blake2b_param *P )
+{
+  size_t i;
+  /*blake2b_init0( S ); */
+  const unsigned char * v = ( const unsigned char * )( blake2b_IV );
+  const unsigned char * p = ( const unsigned char * )( P );
+  unsigned char * h = ( unsigned char * )( S->h );
+  /* IV XOR ParamBlock */
+  memset( S, 0, sizeof( blake2b_state ) );
+
+  for( i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
+
+  S->outlen = P->digest_length;
+  return 0;
+}
+
+
+/* Some sort of default parameter block initialization, for sequential blake2b */
+int _cryptonite_blake2b_init( blake2b_state *S, size_t outlen )
+{
+  blake2b_param P[1];
+
+  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
+
+  P->digest_length = (uint8_t)outlen;
+  P->key_length    = 0;
+  P->fanout        = 1;
+  P->depth         = 1;
+  store32( &P->leaf_length, 0 );
+  store32( &P->node_offset, 0 );
+  store32( &P->xof_length, 0 );
+  P->node_depth    = 0;
+  P->inner_length  = 0;
+  memset( P->reserved, 0, sizeof( P->reserved ) );
+  memset( P->salt,     0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+
+  return _cryptonite_blake2b_init_param( S, P );
+}
+
+int _cryptonite_blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen )
+{
+  blake2b_param P[1];
+
+  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
+
+  if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;
+
+  P->digest_length = (uint8_t)outlen;
+  P->key_length    = (uint8_t)keylen;
+  P->fanout        = 1;
+  P->depth         = 1;
+  store32( &P->leaf_length, 0 );
+  store32( &P->node_offset, 0 );
+  store32( &P->xof_length, 0 );
+  P->node_depth    = 0;
+  P->inner_length  = 0;
+  memset( P->reserved, 0, sizeof( P->reserved ) );
+  memset( P->salt,     0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+
+  if( _cryptonite_blake2b_init_param( S, P ) < 0 )
+    return 0;
+
+  {
+    uint8_t block[BLAKE2B_BLOCKBYTES];
+    memset( block, 0, BLAKE2B_BLOCKBYTES );
+    memcpy( block, key, keylen );
+    _cryptonite_blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
+    secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
+  }
+  return 0;
+}
+
+static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
+{
+  __m128i row1l, row1h;
+  __m128i row2l, row2h;
+  __m128i row3l, row3h;
+  __m128i row4l, row4h;
+  __m128i b0, b1;
+  __m128i t0, t1;
+#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
+  const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
+  const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
+#endif
+#if defined(HAVE_SSE41)
+  const __m128i m0 = LOADU( block + 00 );
+  const __m128i m1 = LOADU( block + 16 );
+  const __m128i m2 = LOADU( block + 32 );
+  const __m128i m3 = LOADU( block + 48 );
+  const __m128i m4 = LOADU( block + 64 );
+  const __m128i m5 = LOADU( block + 80 );
+  const __m128i m6 = LOADU( block + 96 );
+  const __m128i m7 = LOADU( block + 112 );
+#else
+  const uint64_t  m0 = load64(block +  0 * sizeof(uint64_t));
+  const uint64_t  m1 = load64(block +  1 * sizeof(uint64_t));
+  const uint64_t  m2 = load64(block +  2 * sizeof(uint64_t));
+  const uint64_t  m3 = load64(block +  3 * sizeof(uint64_t));
+  const uint64_t  m4 = load64(block +  4 * sizeof(uint64_t));
+  const uint64_t  m5 = load64(block +  5 * sizeof(uint64_t));
+  const uint64_t  m6 = load64(block +  6 * sizeof(uint64_t));
+  const uint64_t  m7 = load64(block +  7 * sizeof(uint64_t));
+  const uint64_t  m8 = load64(block +  8 * sizeof(uint64_t));
+  const uint64_t  m9 = load64(block +  9 * sizeof(uint64_t));
+  const uint64_t m10 = load64(block + 10 * sizeof(uint64_t));
+  const uint64_t m11 = load64(block + 11 * sizeof(uint64_t));
+  const uint64_t m12 = load64(block + 12 * sizeof(uint64_t));
+  const uint64_t m13 = load64(block + 13 * sizeof(uint64_t));
+  const uint64_t m14 = load64(block + 14 * sizeof(uint64_t));
+  const uint64_t m15 = load64(block + 15 * sizeof(uint64_t));
+#endif
+  row1l = LOADU( &S->h[0] );
+  row1h = LOADU( &S->h[2] );
+  row2l = LOADU( &S->h[4] );
+  row2h = LOADU( &S->h[6] );
+  row3l = LOADU( &blake2b_IV[0] );
+  row3h = LOADU( &blake2b_IV[2] );
+  row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
+  row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
+  ROUND( 0 );
+  ROUND( 1 );
+  ROUND( 2 );
+  ROUND( 3 );
+  ROUND( 4 );
+  ROUND( 5 );
+  ROUND( 6 );
+  ROUND( 7 );
+  ROUND( 8 );
+  ROUND( 9 );
+  ROUND( 10 );
+  ROUND( 11 );
+  row1l = _mm_xor_si128( row3l, row1l );
+  row1h = _mm_xor_si128( row3h, row1h );
+  STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
+  STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
+  row2l = _mm_xor_si128( row4l, row2l );
+  row2h = _mm_xor_si128( row4h, row2h );
+  STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
+  STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
+}
+
+
+int _cryptonite_blake2b_update( blake2b_state *S, const void *pin, size_t inlen )
+{
+  const unsigned char * in = (const unsigned char *)pin;
+  if( inlen > 0 )
+  {
+    size_t left = S->buflen;
+    size_t fill = BLAKE2B_BLOCKBYTES - left;
+    if( inlen > fill )
+    {
+      S->buflen = 0;
+      memcpy( S->buf + left, in, fill ); /* Fill buffer */
+      blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
+      blake2b_compress( S, S->buf ); /* Compress */
+      in += fill; inlen -= fill;
+      while(inlen > BLAKE2B_BLOCKBYTES) {
+        blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+        blake2b_compress( S, in );
+        in += BLAKE2B_BLOCKBYTES;
+        inlen -= BLAKE2B_BLOCKBYTES;
+      }
+    }
+    memcpy( S->buf + S->buflen, in, inlen );
+    S->buflen += inlen;
+  }
+  return 0;
+}
+
+
+int _cryptonite_blake2b_final( blake2b_state *S, void *out, size_t outlen )
+{
+  if( out == NULL || outlen < S->outlen )
+    return -1;
+
+  if( blake2b_is_lastblock( S ) )
+    return -1;
+
+  blake2b_increment_counter( S, S->buflen );
+  blake2b_set_lastblock( S );
+  memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
+  blake2b_compress( S, S->buf );
+
+  memcpy( out, &S->h[0], S->outlen );
+  return 0;
+}
+
+
+int _cryptonite_blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
+{
+  blake2b_state S[1];
+
+  /* Verify parameters */
+  if ( NULL == in && inlen > 0 ) return -1;
+
+  if ( NULL == out ) return -1;
+
+  if( NULL == key && keylen > 0 ) return -1;
+
+  if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
+
+  if( keylen > BLAKE2B_KEYBYTES ) return -1;
+
+  if( keylen )
+  {
+    if( _cryptonite_blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+  }
+  else
+  {
+    if( _cryptonite_blake2b_init( S, outlen ) < 0 ) return -1;
+  }
+
+  _cryptonite_blake2b_update( S, ( const uint8_t * )in, inlen );
+  _cryptonite_blake2b_final( S, out, outlen );
+  return 0;
+}
+
+int _cryptonite_blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) {
+  return _cryptonite_blake2b(out, outlen, in, inlen, key, keylen);
+}
+
+#if defined(SUPERCOP)
+int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
+{
+  return _cryptonite_blake2b( out, BLAKE2B_OUTBYTES, in, inlen, NULL, 0 );
+}
+#endif
+
+#if defined(BLAKE2B_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+  uint8_t key[BLAKE2B_KEYBYTES];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
+  size_t i, step;
+
+  for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
+    key[i] = ( uint8_t )i;
+
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+    buf[i] = ( uint8_t )i;
+
+  /* Test simple API */
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+  {
+    uint8_t hash[BLAKE2B_OUTBYTES];
+    blake2b( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES );
+
+    if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
+    {
+      goto fail;
+    }
+  }
+
+  /* Test streaming API */
+  for(step = 1; step < BLAKE2B_BLOCKBYTES; ++step) {
+    for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
+      uint8_t hash[BLAKE2B_OUTBYTES];
+      blake2b_state S;
+      uint8_t * p = buf;
+      size_t mlen = i;
+      int err = 0;
+
+      if( (err = _cryptonite_blake2b_init_key(&S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES)) < 0 ) {
+        goto fail;
+      }
+
+      while (mlen >= step) {
+        if ( (err = _cryptonite_blake2b_update(&S, p, step)) < 0 ) {
+          goto fail;
+        }
+        mlen -= step;
+        p += step;
+      }
+      if ( (err = _cryptonite_blake2b_update(&S, p, mlen)) < 0) {
+        goto fail;
+      }
+      if ( (err = _cryptonite_blake2b_final(&S, hash, BLAKE2B_OUTBYTES)) < 0) {
+        goto fail;
+      }
+
+      if (0 != memcmp(hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES)) {
+        goto fail;
+      }
+    }
+  }
+
+  puts( "ok" );
+  return 0;
+fail:
+  puts("error");
+  return -1;
+}
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2bp.c b/bundled/cbits/blake2/sse/blake2bp.c
new file mode 100644
index 0000000..b51edb4
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2bp.c
@@ -0,0 +1,361 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#define PARALLELISM_DEGREE 4
+
+/*
+  blake2b_init_param defaults to setting the expecting output length
+  from the digest_length parameter block field.
+
+  In some cases, however, we do not want this, as the output length
+  of these instances is given by inner_length instead.
+*/
+static int blake2bp_init_leaf_param( blake2b_state *S, const blake2b_param *P )
+{
+  int err = _cryptonite_blake2b_init_param(S, P);
+  S->outlen = P->inner_length;
+  return err;
+}
+
+static int blake2bp_init_leaf( blake2b_state *S, size_t outlen, size_t keylen, uint64_t offset )
+{
+  blake2b_param P[1];
+  P->digest_length = (uint8_t)outlen;
+  P->key_length = (uint8_t)keylen;
+  P->fanout = PARALLELISM_DEGREE;
+  P->depth = 2;
+  P->leaf_length = 0;
+  P->node_offset = offset;
+  P->xof_length = 0;
+  P->node_depth = 0;
+  P->inner_length = BLAKE2B_OUTBYTES;
+  memset( P->reserved, 0, sizeof( P->reserved ) );
+  memset( P->salt, 0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+  return blake2bp_init_leaf_param( S, P );
+}
+
+static int blake2bp_init_root( blake2b_state *S, size_t outlen, size_t keylen )
+{
+  blake2b_param P[1];
+  P->digest_length = (uint8_t)outlen;
+  P->key_length = (uint8_t)keylen;
+  P->fanout = PARALLELISM_DEGREE;
+  P->depth = 2;
+  P->leaf_length = 0;
+  P->node_offset = 0;
+  P->xof_length = 0;
+  P->node_depth = 1;
+  P->inner_length = BLAKE2B_OUTBYTES;
+  memset( P->reserved, 0, sizeof( P->reserved ) );
+  memset( P->salt, 0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+  return _cryptonite_blake2b_init_param( S, P );
+}
+
+
+int _cryptonite_blake2bp_init( blake2bp_state *S, size_t outlen )
+{
+  size_t i;
+  if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
+
+  memset( S->buf, 0, sizeof( S->buf ) );
+  S->buflen = 0;
+  S->outlen = outlen;
+
+  if( blake2bp_init_root( S->R, outlen, 0 ) < 0 )
+    return -1;
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+    if( blake2bp_init_leaf( S->S[i], outlen, 0, i ) < 0 ) return -1;
+
+  S->R->last_node = 1;
+  S->S[PARALLELISM_DEGREE - 1]->last_node = 1;
+  return 0;
+}
+
+int _cryptonite_blake2bp_init_key( blake2bp_state *S, size_t outlen, const void *key, size_t keylen )
+{
+  size_t i;
+
+  if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
+
+  if( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1;
+
+  memset( S->buf, 0, sizeof( S->buf ) );
+  S->buflen = 0;
+  S->outlen = outlen;
+
+  if( blake2bp_init_root( S->R, outlen, keylen ) < 0 )
+    return -1;
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+    if( blake2bp_init_leaf( S->S[i], outlen, keylen, i ) < 0 ) return -1;
+
+  S->R->last_node = 1;
+  S->S[PARALLELISM_DEGREE - 1]->last_node = 1;
+  {
+    uint8_t block[BLAKE2B_BLOCKBYTES];
+    memset( block, 0, BLAKE2B_BLOCKBYTES );
+    memcpy( block, key, keylen );
+
+    for( i = 0; i < PARALLELISM_DEGREE; ++i )
+      _cryptonite_blake2b_update( S->S[i], block, BLAKE2B_BLOCKBYTES );
+
+    secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
+  }
+  return 0;
+}
+
+
+int _cryptonite_blake2bp_update( blake2bp_state *S, const void *pin, size_t inlen )
+{
+  const unsigned char * in = (const unsigned char *)pin;
+  size_t left = S->buflen;
+  size_t fill = sizeof( S->buf ) - left;
+  size_t i;
+
+  if( left && inlen >= fill )
+  {
+    memcpy( S->buf + left, in, fill );
+
+    for( i = 0; i < PARALLELISM_DEGREE; ++i )
+      _cryptonite_blake2b_update( S->S[i], S->buf + i * BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES );
+
+    in += fill;
+    inlen -= fill;
+    left = 0;
+  }
+
+#if defined(_OPENMP)
+  #pragma omp parallel shared(S), num_threads(PARALLELISM_DEGREE)
+#else
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+#endif
+  {
+#if defined(_OPENMP)
+    size_t      i = omp_get_thread_num();
+#endif
+    size_t inlen__ = inlen;
+    const unsigned char *in__ = ( const unsigned char * )in;
+    in__ += i * BLAKE2B_BLOCKBYTES;
+
+    while( inlen__ >= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES )
+    {
+      _cryptonite_blake2b_update( S->S[i], in__, BLAKE2B_BLOCKBYTES );
+      in__ += PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
+      inlen__ -= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
+    }
+  }
+
+  in += inlen - inlen % ( PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES );
+  inlen %= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
+
+  if( inlen > 0 )
+    memcpy( S->buf + left, in, inlen );
+
+  S->buflen = left + inlen;
+  return 0;
+}
+
+
+
+int _cryptonite_blake2bp_final( blake2bp_state *S, void *out, size_t outlen )
+{
+  uint8_t hash[PARALLELISM_DEGREE][BLAKE2B_OUTBYTES];
+  size_t i;
+
+  if(out == NULL || outlen < S->outlen) {
+    return -1;
+  }
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+  {
+    if( S->buflen > i * BLAKE2B_BLOCKBYTES )
+    {
+      size_t left = S->buflen - i * BLAKE2B_BLOCKBYTES;
+
+      if( left > BLAKE2B_BLOCKBYTES ) left = BLAKE2B_BLOCKBYTES;
+
+      _cryptonite_blake2b_update( S->S[i], S->buf + i * BLAKE2B_BLOCKBYTES, left );
+    }
+
+    _cryptonite_blake2b_final( S->S[i], hash[i], BLAKE2B_OUTBYTES );
+  }
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+    _cryptonite_blake2b_update( S->R, hash[i], BLAKE2B_OUTBYTES );
+
+  return _cryptonite_blake2b_final( S->R, out, S->outlen );
+}
+
+int _cryptonite_blake2bp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
+{
+  uint8_t hash[PARALLELISM_DEGREE][BLAKE2B_OUTBYTES];
+  blake2b_state S[PARALLELISM_DEGREE][1];
+  blake2b_state FS[1];
+  size_t i;
+
+  /* Verify parameters */
+  if ( NULL == in && inlen > 0 ) return -1;
+
+  if ( NULL == out ) return -1;
+
+  if( NULL == key && keylen > 0 ) return -1;
+
+  if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
+
+  if( keylen > BLAKE2B_KEYBYTES ) return -1;
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+    if( blake2bp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1;
+
+  S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */
+
+  if( keylen > 0 )
+  {
+    uint8_t block[BLAKE2B_BLOCKBYTES];
+    memset( block, 0, BLAKE2B_BLOCKBYTES );
+    memcpy( block, key, keylen );
+
+    for( i = 0; i < PARALLELISM_DEGREE; ++i )
+      _cryptonite_blake2b_update( S[i], block, BLAKE2B_BLOCKBYTES );
+
+    secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
+  }
+
+#if defined(_OPENMP)
+  #pragma omp parallel shared(S,hash), num_threads(PARALLELISM_DEGREE)
+#else
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+#endif
+  {
+#if defined(_OPENMP)
+    size_t      i = omp_get_thread_num();
+#endif
+    size_t inlen__ = inlen;
+    const unsigned char *in__ = ( const unsigned char * )in;
+    in__ += i * BLAKE2B_BLOCKBYTES;
+
+    while( inlen__ >= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES )
+    {
+      _cryptonite_blake2b_update( S[i], in__, BLAKE2B_BLOCKBYTES );
+      in__ += PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
+      inlen__ -= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
+    }
+
+    if( inlen__ > i * BLAKE2B_BLOCKBYTES )
+    {
+      const size_t left = inlen__ - i * BLAKE2B_BLOCKBYTES;
+      const size_t len = left <= BLAKE2B_BLOCKBYTES ? left : BLAKE2B_BLOCKBYTES;
+      _cryptonite_blake2b_update( S[i], in__, len );
+    }
+
+    _cryptonite_blake2b_final( S[i], hash[i], BLAKE2B_OUTBYTES );
+  }
+
+  if( blake2bp_init_root( FS, outlen, keylen ) < 0 )
+    return -1;
+
+  FS->last_node = 1; /* Mark as last node */
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+    _cryptonite_blake2b_update( FS, hash[i], BLAKE2B_OUTBYTES );
+
+  return _cryptonite_blake2b_final( FS, out, outlen );
+}
+
+
+#if defined(BLAKE2BP_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+  uint8_t key[BLAKE2B_KEYBYTES];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
+  size_t i, step;
+
+  for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
+    key[i] = ( uint8_t )i;
+
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+    buf[i] = ( uint8_t )i;
+
+  /* Test simple API */
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+  {
+    uint8_t hash[BLAKE2B_OUTBYTES];
+    _cryptonite_blake2bp( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES );
+
+    if( 0 != memcmp( hash, blake2bp_keyed_kat[i], BLAKE2B_OUTBYTES ) )
+    {
+      goto fail;
+    }
+  }
+
+  /* Test streaming API */
+  for(step = 1; step < BLAKE2B_BLOCKBYTES; ++step) {
+    for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
+      uint8_t hash[BLAKE2B_OUTBYTES];
+      blake2bp_state S;
+      uint8_t * p = buf;
+      size_t mlen = i;
+      int err = 0;
+
+      if( (err = _cryptonite_blake2bp_init_key(&S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES)) < 0 ) {
+        goto fail;
+      }
+
+      while (mlen >= step) {
+        if ( (err = _cryptonite_blake2bp_update(&S, p, step)) < 0 ) {
+          goto fail;
+        }
+        mlen -= step;
+        p += step;
+      }
+      if ( (err = _cryptonite_blake2bp_update(&S, p, mlen)) < 0) {
+        goto fail;
+      }
+      if ( (err = _cryptonite_blake2bp_final(&S, hash, BLAKE2B_OUTBYTES)) < 0) {
+        goto fail;
+      }
+
+      if (0 != memcmp(hash, blake2bp_keyed_kat[i], BLAKE2B_OUTBYTES)) {
+        goto fail;
+      }
+    }
+  }
+
+  puts( "ok" );
+  return 0;
+fail:
+  puts("error");
+  return -1;
+}
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2s-load-sse2.h b/bundled/cbits/blake2/sse/blake2s-load-sse2.h
new file mode 100644
index 0000000..d2e9a09
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2s-load-sse2.h
@@ -0,0 +1,60 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2S_LOAD_SSE2_H
+#define BLAKE2S_LOAD_SSE2_H
+
+#define LOAD_MSG_0_1(buf) buf = _mm_set_epi32(m6,m4,m2,m0)
+#define LOAD_MSG_0_2(buf) buf = _mm_set_epi32(m7,m5,m3,m1)
+#define LOAD_MSG_0_3(buf) buf = _mm_set_epi32(m14,m12,m10,m8)
+#define LOAD_MSG_0_4(buf) buf = _mm_set_epi32(m15,m13,m11,m9)
+#define LOAD_MSG_1_1(buf) buf = _mm_set_epi32(m13,m9,m4,m14)
+#define LOAD_MSG_1_2(buf) buf = _mm_set_epi32(m6,m15,m8,m10)
+#define LOAD_MSG_1_3(buf) buf = _mm_set_epi32(m5,m11,m0,m1)
+#define LOAD_MSG_1_4(buf) buf = _mm_set_epi32(m3,m7,m2,m12)
+#define LOAD_MSG_2_1(buf) buf = _mm_set_epi32(m15,m5,m12,m11)
+#define LOAD_MSG_2_2(buf) buf = _mm_set_epi32(m13,m2,m0,m8)
+#define LOAD_MSG_2_3(buf) buf = _mm_set_epi32(m9,m7,m3,m10)
+#define LOAD_MSG_2_4(buf) buf = _mm_set_epi32(m4,m1,m6,m14)
+#define LOAD_MSG_3_1(buf) buf = _mm_set_epi32(m11,m13,m3,m7)
+#define LOAD_MSG_3_2(buf) buf = _mm_set_epi32(m14,m12,m1,m9)
+#define LOAD_MSG_3_3(buf) buf = _mm_set_epi32(m15,m4,m5,m2)
+#define LOAD_MSG_3_4(buf) buf = _mm_set_epi32(m8,m0,m10,m6)
+#define LOAD_MSG_4_1(buf) buf = _mm_set_epi32(m10,m2,m5,m9)
+#define LOAD_MSG_4_2(buf) buf = _mm_set_epi32(m15,m4,m7,m0)
+#define LOAD_MSG_4_3(buf) buf = _mm_set_epi32(m3,m6,m11,m14)
+#define LOAD_MSG_4_4(buf) buf = _mm_set_epi32(m13,m8,m12,m1)
+#define LOAD_MSG_5_1(buf) buf = _mm_set_epi32(m8,m0,m6,m2)
+#define LOAD_MSG_5_2(buf) buf = _mm_set_epi32(m3,m11,m10,m12)
+#define LOAD_MSG_5_3(buf) buf = _mm_set_epi32(m1,m15,m7,m4)
+#define LOAD_MSG_5_4(buf) buf = _mm_set_epi32(m9,m14,m5,m13)
+#define LOAD_MSG_6_1(buf) buf = _mm_set_epi32(m4,m14,m1,m12)
+#define LOAD_MSG_6_2(buf) buf = _mm_set_epi32(m10,m13,m15,m5)
+#define LOAD_MSG_6_3(buf) buf = _mm_set_epi32(m8,m9,m6,m0)
+#define LOAD_MSG_6_4(buf) buf = _mm_set_epi32(m11,m2,m3,m7)
+#define LOAD_MSG_7_1(buf) buf = _mm_set_epi32(m3,m12,m7,m13)
+#define LOAD_MSG_7_2(buf) buf = _mm_set_epi32(m9,m1,m14,m11)
+#define LOAD_MSG_7_3(buf) buf = _mm_set_epi32(m2,m8,m15,m5)
+#define LOAD_MSG_7_4(buf) buf = _mm_set_epi32(m10,m6,m4,m0)
+#define LOAD_MSG_8_1(buf) buf = _mm_set_epi32(m0,m11,m14,m6)
+#define LOAD_MSG_8_2(buf) buf = _mm_set_epi32(m8,m3,m9,m15)
+#define LOAD_MSG_8_3(buf) buf = _mm_set_epi32(m10,m1,m13,m12)
+#define LOAD_MSG_8_4(buf) buf = _mm_set_epi32(m5,m4,m7,m2)
+#define LOAD_MSG_9_1(buf) buf = _mm_set_epi32(m1,m7,m8,m10)
+#define LOAD_MSG_9_2(buf) buf = _mm_set_epi32(m5,m6,m4,m2)
+#define LOAD_MSG_9_3(buf) buf = _mm_set_epi32(m13,m3,m9,m15)
+#define LOAD_MSG_9_4(buf) buf = _mm_set_epi32(m0,m12,m14,m11)
+
+
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2s-load-sse41.h b/bundled/cbits/blake2/sse/blake2s-load-sse41.h
new file mode 100644
index 0000000..c316fb5
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2s-load-sse41.h
@@ -0,0 +1,229 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2S_LOAD_SSE41_H
+#define BLAKE2S_LOAD_SSE41_H
+
+#define LOAD_MSG_0_1(buf) \
+buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0)));
+
+#define LOAD_MSG_0_2(buf) \
+buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1)));
+
+#define LOAD_MSG_0_3(buf) \
+buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0)));
+
+#define LOAD_MSG_0_4(buf) \
+buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1)));
+
+#define LOAD_MSG_1_1(buf) \
+t0 = _mm_blend_epi16(m1, m2, 0x0C); \
+t1 = _mm_slli_si128(m3, 4); \
+t2 = _mm_blend_epi16(t0, t1, 0xF0); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
+
+#define LOAD_MSG_1_2(buf) \
+t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \
+t1 = _mm_blend_epi16(m1,m3,0xC0); \
+t2 = _mm_blend_epi16(t0, t1, 0xF0); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
+
+#define LOAD_MSG_1_3(buf) \
+t0 = _mm_slli_si128(m1, 4); \
+t1 = _mm_blend_epi16(m2, t0, 0x30); \
+t2 = _mm_blend_epi16(m0, t1, 0xF0); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
+
+#define LOAD_MSG_1_4(buf) \
+t0 = _mm_unpackhi_epi32(m0,m1); \
+t1 = _mm_slli_si128(m3, 4); \
+t2 = _mm_blend_epi16(t0, t1, 0x0C); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
+
+#define LOAD_MSG_2_1(buf) \
+t0 = _mm_unpackhi_epi32(m2,m3); \
+t1 = _mm_blend_epi16(m3,m1,0x0C); \
+t2 = _mm_blend_epi16(t0, t1, 0x0F); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
+
+#define LOAD_MSG_2_2(buf) \
+t0 = _mm_unpacklo_epi32(m2,m0); \
+t1 = _mm_blend_epi16(t0, m0, 0xF0); \
+t2 = _mm_slli_si128(m3, 8); \
+buf = _mm_blend_epi16(t1, t2, 0xC0);
+
+#define LOAD_MSG_2_3(buf) \
+t0 = _mm_blend_epi16(m0, m2, 0x3C); \
+t1 = _mm_srli_si128(m1, 12); \
+t2 = _mm_blend_epi16(t0,t1,0x03); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
+
+#define LOAD_MSG_2_4(buf) \
+t0 = _mm_slli_si128(m3, 4); \
+t1 = _mm_blend_epi16(m0, m1, 0x33); \
+t2 = _mm_blend_epi16(t1, t0, 0xC0); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
+
+#define LOAD_MSG_3_1(buf) \
+t0 = _mm_unpackhi_epi32(m0,m1); \
+t1 = _mm_unpackhi_epi32(t0, m2); \
+t2 = _mm_blend_epi16(t1, m3, 0x0C); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
+
+#define LOAD_MSG_3_2(buf) \
+t0 = _mm_slli_si128(m2, 8); \
+t1 = _mm_blend_epi16(m3,m0,0x0C); \
+t2 = _mm_blend_epi16(t1, t0, 0xC0); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
+
+#define LOAD_MSG_3_3(buf) \
+t0 = _mm_blend_epi16(m0,m1,0x0F); \
+t1 = _mm_blend_epi16(t0, m3, 0xC0); \
+buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
+
+#define LOAD_MSG_3_4(buf) \
+t0 = _mm_unpacklo_epi32(m0,m2); \
+t1 = _mm_unpackhi_epi32(m1,m2); \
+buf = _mm_unpacklo_epi64(t1,t0);
+
+#define LOAD_MSG_4_1(buf) \
+t0 = _mm_unpacklo_epi64(m1,m2); \
+t1 = _mm_unpackhi_epi64(m0,m2); \
+t2 = _mm_blend_epi16(t0,t1,0x33); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
+
+#define LOAD_MSG_4_2(buf) \
+t0 = _mm_unpackhi_epi64(m1,m3); \
+t1 = _mm_unpacklo_epi64(m0,m1); \
+buf = _mm_blend_epi16(t0,t1,0x33);
+
+#define LOAD_MSG_4_3(buf) \
+t0 = _mm_unpackhi_epi64(m3,m1); \
+t1 = _mm_unpackhi_epi64(m2,m0); \
+buf = _mm_blend_epi16(t1,t0,0x33);
+
+#define LOAD_MSG_4_4(buf) \
+t0 = _mm_blend_epi16(m0,m2,0x03); \
+t1 = _mm_slli_si128(t0, 8); \
+t2 = _mm_blend_epi16(t1,m3,0x0F); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
+
+#define LOAD_MSG_5_1(buf) \
+t0 = _mm_unpackhi_epi32(m0,m1); \
+t1 = _mm_unpacklo_epi32(m0,m2); \
+buf = _mm_unpacklo_epi64(t0,t1);
+
+#define LOAD_MSG_5_2(buf) \
+t0 = _mm_srli_si128(m2, 4); \
+t1 = _mm_blend_epi16(m0,m3,0x03); \
+buf = _mm_blend_epi16(t1,t0,0x3C);
+
+#define LOAD_MSG_5_3(buf) \
+t0 = _mm_blend_epi16(m1,m0,0x0C); \
+t1 = _mm_srli_si128(m3, 4); \
+t2 = _mm_blend_epi16(t0,t1,0x30); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
+
+#define LOAD_MSG_5_4(buf) \
+t0 = _mm_unpacklo_epi64(m1,m2); \
+t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \
+buf = _mm_blend_epi16(t0,t1,0x33);
+
+#define LOAD_MSG_6_1(buf) \
+t0 = _mm_slli_si128(m1, 12); \
+t1 = _mm_blend_epi16(m0,m3,0x33); \
+buf = _mm_blend_epi16(t1,t0,0xC0);
+
+#define LOAD_MSG_6_2(buf) \
+t0 = _mm_blend_epi16(m3,m2,0x30); \
+t1 = _mm_srli_si128(m1, 4); \
+t2 = _mm_blend_epi16(t0,t1,0x03); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
+
+#define LOAD_MSG_6_3(buf) \
+t0 = _mm_unpacklo_epi64(m0,m2); \
+t1 = _mm_srli_si128(m1, 4); \
+buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
+
+#define LOAD_MSG_6_4(buf) \
+t0 = _mm_unpackhi_epi32(m1,m2); \
+t1 = _mm_unpackhi_epi64(m0,t0); \
+buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
+
+#define LOAD_MSG_7_1(buf) \
+t0 = _mm_unpackhi_epi32(m0,m1); \
+t1 = _mm_blend_epi16(t0,m3,0x0F); \
+buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
+
+#define LOAD_MSG_7_2(buf) \
+t0 = _mm_blend_epi16(m2,m3,0x30); \
+t1 = _mm_srli_si128(m0,4); \
+t2 = _mm_blend_epi16(t0,t1,0x03); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
+
+#define LOAD_MSG_7_3(buf) \
+t0 = _mm_unpackhi_epi64(m0,m3); \
+t1 = _mm_unpacklo_epi64(m1,m2); \
+t2 = _mm_blend_epi16(t0,t1,0x3C); \
+buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
+
+#define LOAD_MSG_7_4(buf) \
+t0 = _mm_unpacklo_epi32(m0,m1); \
+t1 = _mm_unpackhi_epi32(m1,m2); \
+buf = _mm_unpacklo_epi64(t0,t1);
+
+#define LOAD_MSG_8_1(buf) \
+t0 = _mm_unpackhi_epi32(m1,m3); \
+t1 = _mm_unpacklo_epi64(t0,m0); \
+t2 = _mm_blend_epi16(t1,m2,0xC0); \
+buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
+
+#define LOAD_MSG_8_2(buf) \
+t0 = _mm_unpackhi_epi32(m0,m3); \
+t1 = _mm_blend_epi16(m2,t0,0xF0); \
+buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
+
+#define LOAD_MSG_8_3(buf) \
+t0 = _mm_blend_epi16(m2,m0,0x0C); \
+t1 = _mm_slli_si128(t0,4); \
+buf = _mm_blend_epi16(t1,m3,0x0F);
+
+#define LOAD_MSG_8_4(buf) \
+t0 = _mm_blend_epi16(m1,m0,0x30); \
+buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
+
+#define LOAD_MSG_9_1(buf) \
+t0 = _mm_blend_epi16(m0,m2,0x03); \
+t1 = _mm_blend_epi16(m1,m2,0x30); \
+t2 = _mm_blend_epi16(t1,t0,0x0F); \
+buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
+
+#define LOAD_MSG_9_2(buf) \
+t0 = _mm_slli_si128(m0,4); \
+t1 = _mm_blend_epi16(m1,t0,0xC0); \
+buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
+
+#define LOAD_MSG_9_3(buf) \
+t0 = _mm_unpackhi_epi32(m0,m3); \
+t1 = _mm_unpacklo_epi32(m2,m3); \
+t2 = _mm_unpackhi_epi64(t0,t1); \
+buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
+
+#define LOAD_MSG_9_4(buf) \
+t0 = _mm_blend_epi16(m3,m2,0xC0); \
+t1 = _mm_unpacklo_epi32(m0,m3); \
+t2 = _mm_blend_epi16(t0,t1,0x0F); \
+buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
+
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2s-load-xop.h b/bundled/cbits/blake2/sse/blake2s-load-xop.h
new file mode 100644
index 0000000..a97ddcc
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2s-load-xop.h
@@ -0,0 +1,191 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2S_LOAD_XOP_H
+#define BLAKE2S_LOAD_XOP_H
+
+#define TOB(x) ((x)*4*0x01010101 + 0x03020100) /* ..or not TOB */
+
+#if 0
+/* Basic VPPERM emulation, for testing purposes */
+static __m128i _mm_perm_epi8(const __m128i src1, const __m128i src2, const __m128i sel)
+{
+   const __m128i sixteen = _mm_set1_epi8(16);
+   const __m128i t0 = _mm_shuffle_epi8(src1, sel);
+   const __m128i s1 = _mm_shuffle_epi8(src2, _mm_sub_epi8(sel, sixteen));
+   const __m128i mask = _mm_or_si128(_mm_cmpeq_epi8(sel, sixteen),
+                                     _mm_cmpgt_epi8(sel, sixteen)); /* (>=16) = 0xff : 00 */
+   return _mm_blendv_epi8(t0, s1, mask);
+}
+#endif
+
+#define LOAD_MSG_0_1(buf) \
+buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) );
+
+#define LOAD_MSG_0_2(buf) \
+buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) );
+
+#define LOAD_MSG_0_3(buf) \
+buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) );
+
+#define LOAD_MSG_0_4(buf) \
+buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) );
+
+#define LOAD_MSG_1_1(buf) \
+t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(5),TOB(0),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) );
+
+#define LOAD_MSG_1_2(buf) \
+t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(2),TOB(0),TOB(4),TOB(6)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_1_3(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(0),TOB(0),TOB(1)) ); \
+buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_1_4(buf) \
+t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(7),TOB(2),TOB(0)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) );
+
+#define LOAD_MSG_2_1(buf) \
+t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(1),TOB(0),TOB(7)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(4),TOB(0)) );
+
+#define LOAD_MSG_2_2(buf) \
+t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(2),TOB(0),TOB(4)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_2_3(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(7),TOB(3),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) );
+
+#define LOAD_MSG_2_4(buf) \
+t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(1),TOB(6),TOB(0)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) );
+
+#define LOAD_MSG_3_1(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(3),TOB(7)) ); \
+t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_3_2(buf) \
+t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(1),TOB(5)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_3_3(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(5),TOB(2)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_3_4(buf) \
+t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \
+buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(6),TOB(0)) );
+
+#define LOAD_MSG_4_1(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(5),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(5)) );
+
+#define LOAD_MSG_4_2(buf) \
+t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(7),TOB(0)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_4_3(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(6),TOB(0),TOB(0)) ); \
+t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) );
+
+#define LOAD_MSG_4_4(buf) \
+t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(4),TOB(0),TOB(1)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(4),TOB(0)) );
+
+#define LOAD_MSG_5_1(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(2)) ); \
+buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_5_2(buf) \
+t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(6),TOB(0)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) );
+
+#define LOAD_MSG_5_3(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(0),TOB(7),TOB(4)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_5_4(buf) \
+t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(5),TOB(0),TOB(1),TOB(0)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(5)) );
+
+#define LOAD_MSG_6_1(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(0),TOB(1),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(4)) );
+
+#define LOAD_MSG_6_2(buf) \
+t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(6),TOB(0),TOB(0),TOB(1)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(7),TOB(0)) );
+
+#define LOAD_MSG_6_3(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(5),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_6_4(buf) \
+t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(3),TOB(7)) ); \
+buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_7_1(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(0),TOB(7),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(5)) );
+
+#define LOAD_MSG_7_2(buf) \
+t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(5),TOB(1),TOB(0),TOB(7)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) );
+
+#define LOAD_MSG_7_3(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(2),TOB(0),TOB(0),TOB(5)) ); \
+t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) );
+
+#define LOAD_MSG_7_4(buf) \
+t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(6),TOB(4),TOB(0)) ); \
+buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(0)) );
+
+#define LOAD_MSG_8_1(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \
+t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) );
+
+#define LOAD_MSG_8_2(buf) \
+t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(4),TOB(3),TOB(5),TOB(0)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(7)) );
+
+#define LOAD_MSG_8_3(buf) \
+t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(6),TOB(1),TOB(0),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(5),TOB(4)) ); \
+
+#define LOAD_MSG_8_4(buf) \
+buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(4),TOB(7),TOB(2)) );
+
+#define LOAD_MSG_9_1(buf) \
+t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(7),TOB(0),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(4),TOB(6)) );
+
+#define LOAD_MSG_9_2(buf) \
+buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(6),TOB(4),TOB(2)) );
+
+#define LOAD_MSG_9_3(buf) \
+t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(3),TOB(5),TOB(0)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(7)) );
+
+#define LOAD_MSG_9_4(buf) \
+t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(7)) ); \
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(6),TOB(0)) );
+
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2s-round.h b/bundled/cbits/blake2/sse/blake2s-round.h
new file mode 100644
index 0000000..44a5574
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2s-round.h
@@ -0,0 +1,88 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2S_ROUND_H
+#define BLAKE2S_ROUND_H
+
+#define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
+#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
+
+#define TOF(reg) _mm_castsi128_ps((reg))
+#define TOI(reg) _mm_castps_si128((reg))
+
+#define LIKELY(x) __builtin_expect((x),1)
+
+
+/* Microarchitecture-specific macros */
+#ifndef HAVE_XOP
+#ifdef HAVE_SSSE3
+#define _mm_roti_epi32(r, c) ( \
+                (8==-(c)) ? _mm_shuffle_epi8(r,r8) \
+              : (16==-(c)) ? _mm_shuffle_epi8(r,r16) \
+              : _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) )) )
+#else
+#define _mm_roti_epi32(r, c) _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) ))
+#endif
+#else
+/* ... */
+#endif
+
+
+#define G1(row1,row2,row3,row4,buf) \
+  row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \
+  row4 = _mm_xor_si128( row4, row1 ); \
+  row4 = _mm_roti_epi32(row4, -16); \
+  row3 = _mm_add_epi32( row3, row4 );   \
+  row2 = _mm_xor_si128( row2, row3 ); \
+  row2 = _mm_roti_epi32(row2, -12);
+
+#define G2(row1,row2,row3,row4,buf) \
+  row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \
+  row4 = _mm_xor_si128( row4, row1 ); \
+  row4 = _mm_roti_epi32(row4, -8); \
+  row3 = _mm_add_epi32( row3, row4 );   \
+  row2 = _mm_xor_si128( row2, row3 ); \
+  row2 = _mm_roti_epi32(row2, -7);
+
+#define DIAGONALIZE(row1,row2,row3,row4) \
+  row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \
+  row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
+  row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) );
+
+#define UNDIAGONALIZE(row1,row2,row3,row4) \
+  row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \
+  row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
+  row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) );
+
+#if defined(HAVE_XOP)
+#include "blake2s-load-xop.h"
+#elif defined(HAVE_SSE41)
+#include "blake2s-load-sse41.h"
+#else
+#include "blake2s-load-sse2.h"
+#endif
+
+#define ROUND(r)  \
+  LOAD_MSG_ ##r ##_1(buf1); \
+  G1(row1,row2,row3,row4,buf1); \
+  LOAD_MSG_ ##r ##_2(buf2); \
+  G2(row1,row2,row3,row4,buf2); \
+  DIAGONALIZE(row1,row2,row3,row4); \
+  LOAD_MSG_ ##r ##_3(buf3); \
+  G1(row1,row2,row3,row4,buf3); \
+  LOAD_MSG_ ##r ##_4(buf4); \
+  G2(row1,row2,row3,row4,buf4); \
+  UNDIAGONALIZE(row1,row2,row3,row4); \
+
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2s.c b/bundled/cbits/blake2/sse/blake2s.c
new file mode 100644
index 0000000..b983465
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2s.c
@@ -0,0 +1,363 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#include "blake2-config.h"
+
+
+#include <emmintrin.h>
+#if defined(HAVE_SSSE3)
+#include <tmmintrin.h>
+#endif
+#if defined(HAVE_SSE41)
+#include <smmintrin.h>
+#endif
+#if defined(HAVE_AVX)
+#include <immintrin.h>
+#endif
+#if defined(HAVE_XOP)
+#include <x86intrin.h>
+#endif
+
+#include "blake2s-round.h"
+
+static const uint32_t blake2s_IV[8] =
+{
+  0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+  0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+/* Some helper functions */
+static void blake2s_set_lastnode( blake2s_state *S )
+{
+  S->f[1] = (uint32_t)-1;
+}
+
+static int blake2s_is_lastblock( const blake2s_state *S )
+{
+  return S->f[0] != 0;
+}
+
+static void blake2s_set_lastblock( blake2s_state *S )
+{
+  if( S->last_node ) blake2s_set_lastnode( S );
+
+  S->f[0] = (uint32_t)-1;
+}
+
+static void blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
+{
+  uint64_t t = ( ( uint64_t )S->t[1] << 32 ) | S->t[0];
+  t += inc;
+  S->t[0] = ( uint32_t )( t >>  0 );
+  S->t[1] = ( uint32_t )( t >> 32 );
+}
+
+/* init2 xors IV with input parameter block */
+int _cryptonite_blake2s_init_param( blake2s_state *S, const blake2s_param *P )
+{
+  size_t i;
+  /*blake2s_init0( S ); */
+  const uint8_t * v = ( const uint8_t * )( blake2s_IV );
+  const uint8_t * p = ( const uint8_t * )( P );
+  uint8_t * h = ( uint8_t * )( S->h );
+  /* IV XOR ParamBlock */
+  memset( S, 0, sizeof( blake2s_state ) );
+
+  for( i = 0; i < BLAKE2S_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
+
+  S->outlen = P->digest_length;
+  return 0;
+}
+
+
+/* Some sort of default parameter block initialization, for sequential blake2s */
+int _cryptonite_blake2s_init( blake2s_state *S, size_t outlen )
+{
+  blake2s_param P[1];
+
+  /* Move interval verification here? */
+  if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+  P->digest_length = (uint8_t)outlen;
+  P->key_length    = 0;
+  P->fanout        = 1;
+  P->depth         = 1;
+  store32( &P->leaf_length, 0 );
+  store32( &P->node_offset, 0 );
+  store16( &P->xof_length, 0 );
+  P->node_depth    = 0;
+  P->inner_length  = 0;
+  /* memset(P->reserved, 0, sizeof(P->reserved) ); */
+  memset( P->salt,     0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+
+  return _cryptonite_blake2s_init_param( S, P );
+}
+
+
+int _cryptonite_blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen )
+{
+  blake2s_param P[1];
+
+  /* Move interval verification here? */
+  if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+  if ( ( !key ) || ( !keylen ) || keylen > BLAKE2S_KEYBYTES ) return -1;
+
+  P->digest_length = (uint8_t)outlen;
+  P->key_length    = (uint8_t)keylen;
+  P->fanout        = 1;
+  P->depth         = 1;
+  store32( &P->leaf_length, 0 );
+  store32( &P->node_offset, 0 );
+  store16( &P->xof_length, 0 );
+  P->node_depth    = 0;
+  P->inner_length  = 0;
+  /* memset(P->reserved, 0, sizeof(P->reserved) ); */
+  memset( P->salt,     0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+
+  if( _cryptonite_blake2s_init_param( S, P ) < 0 )
+    return -1;
+
+  {
+    uint8_t block[BLAKE2S_BLOCKBYTES];
+    memset( block, 0, BLAKE2S_BLOCKBYTES );
+    memcpy( block, key, keylen );
+    _cryptonite_blake2s_update( S, block, BLAKE2S_BLOCKBYTES );
+    secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
+  }
+  return 0;
+}
+
+
+static void blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
+{
+  __m128i row1, row2, row3, row4;
+  __m128i buf1, buf2, buf3, buf4;
+#if defined(HAVE_SSE41)
+  __m128i t0, t1;
+#if !defined(HAVE_XOP)
+  __m128i t2;
+#endif
+#endif
+  __m128i ff0, ff1;
+#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
+  const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
+  const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
+#endif
+#if defined(HAVE_SSE41)
+  const __m128i m0 = LOADU( block +  00 );
+  const __m128i m1 = LOADU( block +  16 );
+  const __m128i m2 = LOADU( block +  32 );
+  const __m128i m3 = LOADU( block +  48 );
+#else
+  const uint32_t  m0 = load32(block +  0 * sizeof(uint32_t));
+  const uint32_t  m1 = load32(block +  1 * sizeof(uint32_t));
+  const uint32_t  m2 = load32(block +  2 * sizeof(uint32_t));
+  const uint32_t  m3 = load32(block +  3 * sizeof(uint32_t));
+  const uint32_t  m4 = load32(block +  4 * sizeof(uint32_t));
+  const uint32_t  m5 = load32(block +  5 * sizeof(uint32_t));
+  const uint32_t  m6 = load32(block +  6 * sizeof(uint32_t));
+  const uint32_t  m7 = load32(block +  7 * sizeof(uint32_t));
+  const uint32_t  m8 = load32(block +  8 * sizeof(uint32_t));
+  const uint32_t  m9 = load32(block +  9 * sizeof(uint32_t));
+  const uint32_t m10 = load32(block + 10 * sizeof(uint32_t));
+  const uint32_t m11 = load32(block + 11 * sizeof(uint32_t));
+  const uint32_t m12 = load32(block + 12 * sizeof(uint32_t));
+  const uint32_t m13 = load32(block + 13 * sizeof(uint32_t));
+  const uint32_t m14 = load32(block + 14 * sizeof(uint32_t));
+  const uint32_t m15 = load32(block + 15 * sizeof(uint32_t));
+#endif
+  row1 = ff0 = LOADU( &S->h[0] );
+  row2 = ff1 = LOADU( &S->h[4] );
+  row3 = _mm_loadu_si128( (__m128i const *)&blake2s_IV[0] );
+  row4 = _mm_xor_si128( _mm_loadu_si128( (__m128i const *)&blake2s_IV[4] ), LOADU( &S->t[0] ) );
+  ROUND( 0 );
+  ROUND( 1 );
+  ROUND( 2 );
+  ROUND( 3 );
+  ROUND( 4 );
+  ROUND( 5 );
+  ROUND( 6 );
+  ROUND( 7 );
+  ROUND( 8 );
+  ROUND( 9 );
+  STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
+  STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
+}
+
+int _cryptonite_blake2s_update( blake2s_state *S, const void *pin, size_t inlen )
+{
+  const unsigned char * in = (const unsigned char *)pin;
+  if( inlen > 0 )
+  {
+    size_t left = S->buflen;
+    size_t fill = BLAKE2S_BLOCKBYTES - left;
+    if( inlen > fill )
+    {
+      S->buflen = 0;
+      memcpy( S->buf + left, in, fill ); /* Fill buffer */
+      blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
+      blake2s_compress( S, S->buf ); /* Compress */
+      in += fill; inlen -= fill;
+      while(inlen > BLAKE2S_BLOCKBYTES) {
+        blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES);
+        blake2s_compress( S, in );
+        in += BLAKE2S_BLOCKBYTES;
+        inlen -= BLAKE2S_BLOCKBYTES;
+      }
+    }
+    memcpy( S->buf + S->buflen, in, inlen );
+    S->buflen += inlen;
+  }
+  return 0;
+}
+
+int _cryptonite_blake2s_final( blake2s_state *S, void *out, size_t outlen )
+{
+  uint8_t buffer[BLAKE2S_OUTBYTES] = {0};
+  size_t i;
+
+  if( out == NULL || outlen < S->outlen )
+    return -1;
+
+  if( blake2s_is_lastblock( S ) )
+    return -1;
+
+  blake2s_increment_counter( S, (uint32_t)S->buflen );
+  blake2s_set_lastblock( S );
+  memset( S->buf + S->buflen, 0, BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
+  blake2s_compress( S, S->buf );
+
+  for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
+    store32( buffer + sizeof( S->h[i] ) * i, S->h[i] );
+
+  memcpy( out, buffer, S->outlen );
+  secure_zero_memory( buffer, sizeof(buffer) );
+  return 0;
+}
+
+/* inlen, at least, should be uint64_t. Others can be size_t. */
+int blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
+{
+  blake2s_state S[1];
+
+  /* Verify parameters */
+  if ( NULL == in && inlen > 0 ) return -1;
+
+  if ( NULL == out ) return -1;
+
+  if ( NULL == key && keylen > 0) return -1;
+
+  if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;
+
+  if( keylen > BLAKE2S_KEYBYTES ) return -1;
+
+  if( keylen > 0 )
+  {
+    if( _cryptonite_blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+  }
+  else
+  {
+    if( _cryptonite_blake2s_init( S, outlen ) < 0 ) return -1;
+  }
+
+  _cryptonite_blake2s_update( S, ( const uint8_t * )in, inlen );
+  _cryptonite_blake2s_final( S, out, outlen );
+  return 0;
+}
+
+#if defined(SUPERCOP)
+int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
+{
+  return blake2s( out, BLAKE2S_OUTBYTES, in, inlen, NULL, 0 );
+}
+#endif
+
+#if defined(BLAKE2S_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+  uint8_t key[BLAKE2S_KEYBYTES];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
+  size_t i, step;
+
+  for( i = 0; i < BLAKE2S_KEYBYTES; ++i )
+    key[i] = ( uint8_t )i;
+
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+    buf[i] = ( uint8_t )i;
+
+  /* Test simple API */
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+  {
+    uint8_t hash[BLAKE2S_OUTBYTES];
+    blake2s( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES );
+
+    if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
+    {
+      goto fail;
+    }
+  }
+
+  /* Test streaming API */
+  for(step = 1; step < BLAKE2S_BLOCKBYTES; ++step) {
+    for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
+      uint8_t hash[BLAKE2S_OUTBYTES];
+      blake2s_state S;
+      uint8_t * p = buf;
+      size_t mlen = i;
+      int err = 0;
+
+      if( (err = _cryptonite_blake2s_init_key(&S, BLAKE2S_OUTBYTES, key, BLAKE2S_KEYBYTES)) < 0 ) {
+        goto fail;
+      }
+
+      while (mlen >= step) {
+        if ( (err = _cryptonite_blake2s_update(&S, p, step)) < 0 ) {
+          goto fail;
+        }
+        mlen -= step;
+        p += step;
+      }
+      if ( (err = _cryptonite_blake2s_update(&S, p, mlen)) < 0) {
+        goto fail;
+      }
+      if ( (err = _cryptonite_blake2s_final(&S, hash, BLAKE2S_OUTBYTES)) < 0) {
+        goto fail;
+      }
+
+      if (0 != memcmp(hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES)) {
+        goto fail;
+      }
+    }
+  }
+
+  puts( "ok" );
+  return 0;
+fail:
+  puts("error");
+  return -1;
+}
+#endif
diff --git a/bundled/cbits/blake2/sse/blake2sp.c b/bundled/cbits/blake2/sse/blake2sp.c
new file mode 100644
index 0000000..550c313
--- /dev/null
+++ b/bundled/cbits/blake2/sse/blake2sp.c
@@ -0,0 +1,358 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#define PARALLELISM_DEGREE 8
+
+/*
+  blake2sp_init_param defaults to setting the expecting output length
+  from the digest_length parameter block field.
+
+  In some cases, however, we do not want this, as the output length
+  of these instances is given by inner_length instead.
+*/
+static int blake2sp_init_leaf_param( blake2s_state *S, const blake2s_param *P )
+{
+  int err = _cryptonite_blake2s_init_param(S, P);
+  S->outlen = P->inner_length;
+  return err;
+}
+
+static int blake2sp_init_leaf( blake2s_state *S, size_t outlen, size_t keylen, uint64_t offset )
+{
+  blake2s_param P[1];
+  P->digest_length = (uint8_t)outlen;
+  P->key_length = (uint8_t)keylen;
+  P->fanout = PARALLELISM_DEGREE;
+  P->depth = 2;
+  P->leaf_length = 0;
+  P->node_offset = offset;
+  P->xof_length = 0;
+  P->node_depth = 0;
+  P->inner_length = BLAKE2S_OUTBYTES;
+  memset( P->salt, 0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+  return blake2sp_init_leaf_param( S, P );
+}
+
+static int blake2sp_init_root( blake2s_state *S, size_t outlen, size_t keylen )
+{
+  blake2s_param P[1];
+  P->digest_length = (uint8_t)outlen;
+  P->key_length = (uint8_t)keylen;
+  P->fanout = PARALLELISM_DEGREE;
+  P->depth = 2;
+  P->leaf_length = 0;
+  P->node_offset = 0;
+  P->xof_length = 0;
+  P->node_depth = 1;
+  P->inner_length = BLAKE2S_OUTBYTES;
+  memset( P->salt, 0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+  return _cryptonite_blake2s_init_param( S, P );
+}
+
+
+int _cryptonite_blake2sp_init( blake2sp_state *S, size_t outlen )
+{
+  size_t i;
+
+  if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;
+
+  memset( S->buf, 0, sizeof( S->buf ) );
+  S->buflen = 0;
+  S->outlen = outlen;
+
+  if( blake2sp_init_root( S->R, outlen, 0 ) < 0 )
+    return -1;
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+    if( blake2sp_init_leaf( S->S[i], outlen, 0, i ) < 0 ) return -1;
+
+  S->R->last_node = 1;
+  S->S[PARALLELISM_DEGREE - 1]->last_node = 1;
+  return 0;
+}
+
+int _cryptonite_blake2sp_init_key( blake2sp_state *S, size_t outlen, const void *key, size_t keylen )
+{
+  size_t i;
+
+  if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;
+
+  if( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;
+
+  memset( S->buf, 0, sizeof( S->buf ) );
+  S->buflen = 0;
+  S->outlen = outlen;
+
+  if( blake2sp_init_root( S->R, outlen, keylen ) < 0 )
+    return -1;
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+    if( blake2sp_init_leaf( S->S[i], outlen, keylen, i ) < 0 ) return -1;
+
+  S->R->last_node = 1;
+  S->S[PARALLELISM_DEGREE - 1]->last_node = 1;
+  {
+    uint8_t block[BLAKE2S_BLOCKBYTES];
+    memset( block, 0, BLAKE2S_BLOCKBYTES );
+    memcpy( block, key, keylen );
+
+    for( i = 0; i < PARALLELISM_DEGREE; ++i )
+      _cryptonite_blake2s_update( S->S[i], block, BLAKE2S_BLOCKBYTES );
+
+    secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
+  }
+  return 0;
+}
+
+
+int _cryptonite_blake2sp_update( blake2sp_state *S, const void *pin, size_t inlen )
+{
+  const unsigned char * in = (const unsigned char *)pin;
+  size_t left = S->buflen;
+  size_t fill = sizeof( S->buf ) - left;
+  size_t i;
+
+  if( left && inlen >= fill )
+  {
+    memcpy( S->buf + left, in, fill );
+
+    for( i = 0; i < PARALLELISM_DEGREE; ++i )
+      _cryptonite_blake2s_update( S->S[i], S->buf + i * BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES );
+
+    in += fill;
+    inlen -= fill;
+    left = 0;
+  }
+
+#if defined(_OPENMP)
+  #pragma omp parallel shared(S), num_threads(PARALLELISM_DEGREE)
+#else
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+#endif
+  {
+#if defined(_OPENMP)
+    size_t      i = omp_get_thread_num();
+#endif
+    size_t inlen__ = inlen;
+    const unsigned char *in__ = ( const unsigned char * )in;
+    in__ += i * BLAKE2S_BLOCKBYTES;
+
+    while( inlen__ >= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES )
+    {
+      _cryptonite_blake2s_update( S->S[i], in__, BLAKE2S_BLOCKBYTES );
+      in__ += PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
+      inlen__ -= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
+    }
+  }
+
+  in += inlen - inlen % ( PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES );
+  inlen %= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
+
+  if( inlen > 0 )
+    memcpy( S->buf + left, in, inlen );
+
+  S->buflen = left + inlen;
+  return 0;
+}
+
+
+int _cryptonite_blake2sp_final( blake2sp_state *S, void *out, size_t outlen )
+{
+  uint8_t hash[PARALLELISM_DEGREE][BLAKE2S_OUTBYTES];
+  size_t i;
+
+  if(out == NULL || outlen < S->outlen) {
+    return -1;
+  }
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+  {
+    if( S->buflen > i * BLAKE2S_BLOCKBYTES )
+    {
+      size_t left = S->buflen - i * BLAKE2S_BLOCKBYTES;
+
+      if( left > BLAKE2S_BLOCKBYTES ) left = BLAKE2S_BLOCKBYTES;
+
+      _cryptonite_blake2s_update( S->S[i], S->buf + i * BLAKE2S_BLOCKBYTES, left );
+    }
+
+    _cryptonite_blake2s_final( S->S[i], hash[i], BLAKE2S_OUTBYTES );
+  }
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+    _cryptonite_blake2s_update( S->R, hash[i], BLAKE2S_OUTBYTES );
+
+  return _cryptonite_blake2s_final( S->R, out, S->outlen );
+}
+
+
+int _cryptonite_blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
+{
+  uint8_t hash[PARALLELISM_DEGREE][BLAKE2S_OUTBYTES];
+  blake2s_state S[PARALLELISM_DEGREE][1];
+  blake2s_state FS[1];
+  size_t i;
+
+  /* Verify parameters */
+  if ( NULL == in && inlen > 0 ) return -1;
+
+  if ( NULL == out ) return -1;
+
+  if ( NULL == key && keylen > 0) return -1;
+
+  if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;
+
+  if( keylen > BLAKE2S_KEYBYTES ) return -1;
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+    if( blake2sp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1;
+
+  S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */
+
+  if( keylen > 0 )
+  {
+    uint8_t block[BLAKE2S_BLOCKBYTES];
+    memset( block, 0, BLAKE2S_BLOCKBYTES );
+    memcpy( block, key, keylen );
+
+    for( i = 0; i < PARALLELISM_DEGREE; ++i )
+      _cryptonite_blake2s_update( S[i], block, BLAKE2S_BLOCKBYTES );
+
+    secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
+  }
+
+#if defined(_OPENMP)
+  #pragma omp parallel shared(S,hash), num_threads(PARALLELISM_DEGREE)
+#else
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+#endif
+  {
+#if defined(_OPENMP)
+    size_t      i = omp_get_thread_num();
+#endif
+    size_t inlen__ = inlen;
+    const unsigned char *in__ = ( const unsigned char * )in;
+    in__ += i * BLAKE2S_BLOCKBYTES;
+
+    while( inlen__ >= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES )
+    {
+      _cryptonite_blake2s_update( S[i], in__, BLAKE2S_BLOCKBYTES );
+      in__ += PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
+      inlen__ -= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
+    }
+
+    if( inlen__ > i * BLAKE2S_BLOCKBYTES )
+    {
+      const size_t left = inlen__ - i * BLAKE2S_BLOCKBYTES;
+      const size_t len = left <= BLAKE2S_BLOCKBYTES ? left : BLAKE2S_BLOCKBYTES;
+      _cryptonite_blake2s_update( S[i], in__, len );
+    }
+
+    _cryptonite_blake2s_final( S[i], hash[i], BLAKE2S_OUTBYTES );
+  }
+
+  if( blake2sp_init_root( FS, outlen, keylen ) < 0 )
+    return -1;
+
+  FS->last_node = 1;
+
+  for( i = 0; i < PARALLELISM_DEGREE; ++i )
+    _cryptonite_blake2s_update( FS, hash[i], BLAKE2S_OUTBYTES );
+
+  return _cryptonite_blake2s_final( FS, out, outlen );
+}
+
+#if defined(BLAKE2SP_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+  uint8_t key[BLAKE2S_KEYBYTES];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
+  size_t i, step;
+
+  for( i = 0; i < BLAKE2S_KEYBYTES; ++i )
+    key[i] = ( uint8_t )i;
+
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+    buf[i] = ( uint8_t )i;
+
+  /* Test simple API */
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+  {
+    uint8_t hash[BLAKE2S_OUTBYTES];
+    _cryptonite_blake2sp( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES );
+
+    if( 0 != memcmp( hash, blake2sp_keyed_kat[i], BLAKE2S_OUTBYTES ) )
+    {
+      goto fail;
+    }
+  }
+
+  /* Test streaming API */
+  for(step = 1; step < BLAKE2S_BLOCKBYTES; ++step) {
+    for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
+      uint8_t hash[BLAKE2S_OUTBYTES];
+      blake2sp_state S;
+      uint8_t * p = buf;
+      size_t mlen = i;
+      int err = 0;
+
+      if( (err = _cryptonite_blake2sp_init_key(&S, BLAKE2S_OUTBYTES, key, BLAKE2S_KEYBYTES)) < 0 ) {
+        goto fail;
+      }
+
+      while (mlen >= step) {
+        if ( (err = _cryptonite_blake2sp_update(&S, p, step)) < 0 ) {
+          goto fail;
+        }
+        mlen -= step;
+        p += step;
+      }
+      if ( (err = _cryptonite_blake2sp_update(&S, p, mlen)) < 0) {
+        goto fail;
+      }
+      if ( (err = _cryptonite_blake2sp_final(&S, hash, BLAKE2S_OUTBYTES)) < 0) {
+        goto fail;
+      }
+
+      if (0 != memcmp(hash, blake2sp_keyed_kat[i], BLAKE2S_OUTBYTES)) {
+        goto fail;
+      }
+    }
+  }
+
+  puts( "ok" );
+  return 0;
+fail:
+  puts("error");
+  return -1;
+}
+#endif
diff --git a/src/Encryption.hs b/src/Encryption.hs
new file mode 100644
index 0000000..106f4e7
--- /dev/null
+++ b/src/Encryption.hs
@@ -0,0 +1,74 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module Encryption where
+
+import Control.Monad
+import Crypto.Error
+import Crypto.KDF.Argon2
+import Crypto.Random.Entropy
+import Data.Binary.Get (getWord32be, runGet)
+import Data.Binary.Put (putWord32be, runPut)
+import Data.Bits (xor)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Char8 as BC8
+import qualified Data.ByteString.Lazy as BL
+import Data.Word (Word32)
+import System.CPUTime
+import System.IO
+
+-- picoseconds
+measureIteration = do
+  start <- getCPUTime
+  throwaway <- throwCryptoErrorIO $ hash defaultOptions ("666" :: ByteString) ("solasgsdgdsgsd" :: ByteString) 32 :: IO ByteString
+  end <- seq throwaway getCPUTime
+  pure $ fromIntegral (end - start) / 1e12
+
+getPassword = do
+  tty <- openFile "/dev/tty" ReadWriteMode
+  BC8.hPutStr tty "Password: "
+  hFlush tty
+  old <- hGetEcho tty
+  hSetEcho tty False
+  response <- BC8.hGetLine tty
+  hSetEcho tty old
+  BC8.hPutStrLn tty ""
+  hClose tty
+  pure response
+
+xorWithKey :: ByteString -> ByteString -> ByteString
+xorWithKey key bs
+  | B.null key = error "key must not be empty"
+  | otherwise  = B.pack $ B.zipWith xor bs key
+
+-- format: E[16-byte salt][4-byte big-endian iterations][ciphertext]
+encryptData :: ByteString -> IO ByteString
+encryptData privkey = do
+  iterationTime <- measureIteration
+  print iterationTime
+  let iterations = ceiling $ 1 / iterationTime
+  print iterations
+  salt <- getEntropy 16 :: IO ByteString
+  --let password = "password" :: ByteString
+  password <- getPassword
+  key <- throwCryptoErrorIO $ hash defaultOptions { iterations = iterations } password salt 56 :: IO ByteString
+  let ciphertext = xorWithKey key privkey
+  -- encode iterations as Word32 big-endian
+  let iterWord :: Word32
+      iterWord = fromIntegral iterations
+      iterBytes = BL.toStrict $ runPut (putWord32be iterWord)
+  pure $ B.concat ["E", salt, iterBytes, ciphertext]
+
+decryptData :: ByteString -> IO ByteString
+decryptData contents = do
+  password <- getPassword
+  -- parse salt (16 bytes), iterations (4 bytes BE), then ciphertext
+  when (B.length contents < 76) $ error "encrypted key is unexpectedly short"
+  let (salt, rest) = B.splitAt 16 contents
+      (iterBytes, ciphertext) = B.splitAt 4 rest
+      iterations = runGet getWord32be (BL.fromStrict iterBytes)
+  key <- throwCryptoErrorIO $ hash defaultOptions { iterations = iterations } password salt 56 :: IO ByteString
+  let plaintext = xorWithKey key ciphertext
+  case B.head plaintext of
+    0x53 -> pure plaintext
+    _ -> error "wrong password"
diff --git a/src/Main.hs b/src/Main.hs
index 7f24263..4848314 100644
--- a/src/Main.hs
+++ b/src/Main.hs
@@ -7,16 +7,30 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.Base64 as B64
 import qualified Data.ByteString.Char8 as BC8
 import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
 import qualified Data.Text.IO as T
 import Network.ONCRPC.XDR.Serial
 import Network.Stellar.TransactionXdr
 import qualified Stellar.Simple as S
 import System.Directory
+import System.Environment
 import System.IO
 
+import Encryption
 import Pretty
 
 main = do
+  args <- getArgs
+  case args of
+    ["encrypt"] -> encrypt
+    _ -> sign
+
+encrypt = do
+  encryptedFile <- T.encodeUtf8 <$> getPrivateKey
+  path <- getPrivateKeyPath
+  B.writeFile path =<< encryptData encryptedFile
+
+sign = do
   transactionB64 <- B.getContents
   let transaction =  case B64.decode $ BC8.strip transactionB64 of
         Left err -> error err
@@ -31,10 +45,17 @@ main = do
     let sd = S.signWithSecret key parsedTransaction
     T.putStrLn $ S.xdrSerializeBase64T sd
 
-getPrivateKey = do
+getPrivateKeyPath :: IO FilePath
+getPrivateKeyPath = do
   home <- getHomeDirectory
-  keyFile <- T.readFile $ home ++ "/.stellar-veritas-key"
-  pure $ T.strip keyFile
+  pure $ home ++ "/.stellar-veritas-key"
+
+getPrivateKey = do
+  file <- B.readFile =<< getPrivateKeyPath
+  case B.head file of
+    0x53 -> pure $ T.strip $ T.decodeUtf8 file	-- bare unencrypted secret key
+    0x45 -> T.decodeUtf8 <$> decryptData (B.tail file)
+    _ -> error "garbled private key file"
 
 confirm :: IO Bool
 confirm = do
diff --git a/stellar-veritas.cabal b/stellar-veritas.cabal
index df90513..6e0af8f 100644
--- a/stellar-veritas.cabal
+++ b/stellar-veritas.cabal
@@ -13,6 +13,7 @@ tested-with:         GHC == 9.12.3
 
 executable stellar-veritas
   main-is:             Main.hs
+  CPP-options:         -DWITH_BYTESTRING_SUPPORT
   ghc-options:         -W -Wcompat -Wno-missing-home-modules -fno-warn-tabs -rtsopts=ignoreAll -optl-Wl,-s
   other-modules:
   build-depends:       base >= 4.9 && < 5,
@@ -46,4 +47,12 @@ executable stellar-veritas
     TypeOperators
     UnliftedFFITypes
   c-sources:          bundled/cbits/ref10/ed25519.c
-  include-dirs:       bundled/cbits/ref10 bundled/cbits/ref10/include
+                      bundled/cbits/argon2/argon2.c
+                      bundled/cbits/blake2/sse/blake2s.c
+                      bundled/cbits/blake2/sse/blake2sp.c
+                      bundled/cbits/blake2/sse/blake2b.c
+                      bundled/cbits/blake2/sse/blake2bp.c
+  include-dirs:       bundled/cbits/ref10
+                      bundled/cbits/ref10/include
+                      bundled/cbits/argon2
+                      bundled/cbits/blake2/sse