Initial commit

2026-01-25 02:27:22 +01:00 · 2026-01-25 02:27:22 +01:00 · c101616e62
commit c101616e62
309 changed files with 53937 additions and 0 deletions
--- a/bundled/Basement/UTF8/Base.hs
+++ b/bundled/Basement/UTF8/Base.hs
@ -0,0 +1,240 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE RebindableSyntax #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+-- |
+-- Module      : Basement.String
+-- License     : BSD-style
+-- Maintainer  : Foundation
+--
+-- A String type backed by a UTF8 encoded byte array and all the necessary
+-- functions to manipulate the string.
+--
+{-# LANGUAGE BangPatterns               #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE MagicHash                  #-}
+{-# LANGUAGE TypeFamilies               #-}
+{-# LANGUAGE FlexibleContexts           #-}
+{-# LANGUAGE CPP                        #-}
+module Basement.UTF8.Base
+    where
+
+import           GHC.ST (ST, runST)
+import           GHC.Types
+import           GHC.Word
+import           GHC.Prim
+import           GHC.Exts (build)
+import           Basement.Compat.Base
+import           Basement.Numerical.Additive
+import           Basement.Compat.Bifunctor
+import           Basement.NormalForm
+import           Basement.Types.OffsetSize
+import           Basement.PrimType
+import           Basement.Monad
+import           Basement.FinalPtr
+import           Basement.UTF8.Helper
+import           Basement.UTF8.Types
+import qualified Basement.Alg.UTF8         as UTF8
+import           Basement.UArray           (UArray)
+import           Basement.Block            (MutableBlock)
+import qualified Basement.Block.Mutable    as BLK
+import qualified Basement.UArray           as Vec
+import qualified Basement.UArray           as C
+import qualified Basement.UArray.Mutable   as MVec
+import           Basement.UArray.Base   as Vec (offset, pureST, onBackend, ValidRange(..), offsetsValidRange)
+import           GHC.CString                        (unpackCString#, unpackCStringUtf8#)
+
+import           Data.Data
+import           Basement.Compat.ExtList as List
+import           Basement.Compat.Semigroup (Semigroup)
+
+-- | Opaque packed array of characters in the UTF8 encoding
+newtype String = String (UArray Word8)
+    deriving (Typeable, Semigroup, Monoid, Eq, Ord)
+
+-- | Mutable String Buffer.
+--
+-- Use as an *append* buffer, as UTF8 variable encoding
+-- doesn't really allow to change previously written
+-- character without potentially shifting bytes.
+newtype MutableString st = MutableString (MVec.MUArray Word8 st)
+    deriving (Typeable)
+
+instance Show String where
+    show = show . sToList
+instance IsString String where
+    fromString = sFromList
+instance IsList String where
+    type Item String = Char
+    fromList = sFromList
+    toList = sToList
+
+instance Data String where
+    toConstr s   = mkConstr stringType (show s) [] Prefix
+    dataTypeOf _ = stringType
+    gunfold _ _  = error "gunfold"
+
+instance NormalForm String where
+    toNormalForm (String ba) = toNormalForm ba
+
+stringType :: DataType
+stringType = mkNoRepType "Foundation.String"
+
+-- | size in bytes.
+--
+-- this size is available in o(1)
+size :: String -> CountOf Word8
+size (String ba) = Vec.length ba
+
+-- | Convert a String to a list of characters
+--
+-- The list is lazily created as evaluation needed
+sToList :: String -> [Char]
+sToList (String arr) = Vec.onBackend onBA onAddr arr
+  where
+    (Vec.ValidRange !start !end) = Vec.offsetsValidRange arr
+    onBA ba@(BLK.Block _) = loop start
+      where
+        loop !idx
+            | idx == end = []
+            | otherwise  = let !(Step c idx') = UTF8.next ba idx in c : loop idx'
+    onAddr fptr ptr@(Ptr _) = pureST (loop start)
+      where
+        loop !idx
+            | idx == end = []
+            | otherwise  = let !(Step c idx') = UTF8.next ptr idx in c : loop idx'
+{-# NOINLINE sToList #-}
+
+sToListStream (String arr) k z = Vec.onBackend onBA onAddr arr
+  where
+    (Vec.ValidRange !start !end) = Vec.offsetsValidRange arr
+    onBA ba@(BLK.Block _) = loop start
+      where
+        loop !idx
+            | idx == end = z
+            | otherwise  = let !(Step c idx') = UTF8.next ba idx in c `k` loop idx'
+    onAddr fptr ptr@(Ptr _) = pureST (loop start)
+      where
+        loop !idx
+            | idx == end = z
+            | otherwise  = let !(Step c idx') = UTF8.next ptr idx in c `k` loop idx'
+
+{-# RULES "String sToList" [~1] forall s . sToList s = build (\ k z -> sToListStream s k z) #-}
+{-# RULES "String toList" [~1] forall s . toList s = build (\ k z -> sToListStream s k z) #-}
+
+{-# RULES "String sFromList" forall s .  sFromList (unpackCString# s) = fromModified s #-}
+{-# RULES "String sFromList" forall s .  sFromList (unpackCStringUtf8# s) = fromModified s #-}
+
+-- | assuming the given Addr# is a valid modified UTF-8 sequence of bytes
+--
+-- We only modify the given Unicode Null-character (0xC080) into a null bytes
+--
+-- FIXME: need to evaluate the kind of modified UTF8 GHC is actually expecting
+-- it is plausible they only handle the Null Bytes, which this function actually
+-- does.
+fromModified :: Addr# -> String
+fromModified addr = countAndCopy 0 0
+  where
+    countAndCopy :: CountOf Word8 -> Offset Word8 -> String
+    countAndCopy count ofs =
+        case primAddrIndex addr ofs of
+            0x00 -> runST $ do
+                        mb <- MVec.newNative_ count (copy count)
+                        String <$> Vec.unsafeFreeze mb
+            0xC0 -> case primAddrIndex addr (ofs+1) of
+                        0x80 -> countAndCopy (count+1) (ofs+2)
+                        _    -> countAndCopy (count+2) (ofs+2)
+            _    -> countAndCopy (count+1) (ofs+1)
+
+    copy :: CountOf Word8 -> MutableBlock Word8 st -> ST st ()
+    copy count mba = loop 0 0
+      where loop o i
+                | o .==# count = pure ()
+                | otherwise    =
+                    case primAddrIndex addr i of
+                        0xC0 -> case primAddrIndex addr (i+1) of
+                                    0x80 -> BLK.unsafeWrite mba o 0x00 >> loop (o+1) (i+2)
+                                    b2   -> BLK.unsafeWrite mba o 0xC0 >> BLK.unsafeWrite mba (o+1) b2 >> loop (o+2) (i+2)
+                        b1   -> BLK.unsafeWrite mba o b1 >> loop (o+1) (i+1)
+
+
+-- | Create a new String from a list of characters
+--
+-- The list is strictly and fully evaluated before
+-- creating the new String, as the size need to be
+-- computed before filling.
+sFromList :: [Char] -> String
+sFromList l = runST (new bytes >>= startCopy)
+  where
+    -- count how many bytes
+    !bytes = List.sum $ fmap (charToBytes . fromEnum) l
+
+    startCopy :: MutableString (PrimState (ST st)) -> ST st String
+    startCopy ms = loop 0 l
+      where
+        loop _   []     = freeze ms
+        loop idx (c:xs) = write ms idx c >>= \idx' -> loop idx' xs
+{-# INLINE [0] sFromList #-}
+
+next :: String -> Offset8 -> Step
+next (String array) !n = Vec.onBackend nextBA nextAddr array
+  where
+    !start = Vec.offset array
+    reoffset (Step a ofs) = Step a (ofs `offsetSub` start)
+    nextBA ba@(BLK.Block _) = reoffset (UTF8.next ba (start + n))
+    nextAddr _ ptr@(Ptr _)  = pureST $ reoffset (UTF8.next ptr (start + n))
+
+prev :: String -> Offset8 -> StepBack
+prev (String array) !n = Vec.onBackend prevBA prevAddr array
+  where
+    !start = Vec.offset array
+    reoffset (StepBack a ofs) = StepBack a (ofs `offsetSub` start)
+    prevBA ba@(BLK.Block _) = reoffset (UTF8.prev ba (start + n))
+    prevAddr _ ptr@(Ptr _)  = pureST $ reoffset (UTF8.prev ptr (start + n))
+
+-- A variant of 'next' when you want the next character
+-- to be ASCII only.
+nextAscii :: String -> Offset8 -> StepASCII
+nextAscii (String ba) n = StepASCII w
+  where
+    !w = Vec.unsafeIndex ba n
+
+expectAscii :: String -> Offset8 -> Word8 -> Bool
+expectAscii (String ba) n v = Vec.unsafeIndex ba n == v
+{-# INLINE expectAscii #-}
+
+write :: PrimMonad prim => MutableString (PrimState prim) -> Offset8 -> Char -> prim Offset8
+write (MutableString marray) ofs c =
+    MVec.onMutableBackend (\mba@(BLK.MutableBlock _) -> UTF8.writeUTF8 mba (start + ofs) c)
+                          (\fptr -> withFinalPtr fptr $ \ptr@(Ptr _) -> UTF8.writeUTF8 ptr (start + ofs) c)
+                          marray
+  where start = MVec.mutableOffset marray
+
+-- | Allocate a MutableString of a specific size in bytes.
+new :: PrimMonad prim
+    => CountOf Word8 -- ^ in number of bytes, not of elements.
+    -> prim (MutableString (PrimState prim))
+new n = MutableString `fmap` MVec.new n
+
+newNative :: PrimMonad prim
+          => CountOf Word8 -- ^ in number of bytes, not of elements.
+          -> (MutableBlock Word8 (PrimState prim) -> prim a)
+          -> prim (a, MutableString (PrimState prim))
+newNative n f = second MutableString `fmap` MVec.newNative n f
+
+newNative_ :: PrimMonad prim
+           => CountOf Word8 -- ^ in number of bytes, not of elements.
+           -> (MutableBlock Word8 (PrimState prim) -> prim ())
+           -> prim (MutableString (PrimState prim))
+newNative_ n f = MutableString `fmap` MVec.newNative_ n f
+
+freeze :: PrimMonad prim => MutableString (PrimState prim) -> prim String
+freeze (MutableString mba) = String `fmap` C.unsafeFreeze mba
+{-# INLINE freeze #-}
+
+freezeShrink :: PrimMonad prim
+             => CountOf Word8
+             -> MutableString (PrimState prim)
+             -> prim String
+freezeShrink n (MutableString mba) = String `fmap` C.unsafeFreezeShrink mba n
--- a/bundled/Basement/UTF8/Helper.hs
+++ b/bundled/Basement/UTF8/Helper.hs
@ -0,0 +1,251 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE RebindableSyntax #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+-- |
+-- Module      : Basement.UTF8.Helper
+-- License     : BSD-style
+-- Maintainer  : Foundation
+--
+-- Some low level helpers to use UTF8
+--
+-- Most helpers are lowlevel and unsafe, don't use
+-- directly.
+{-# LANGUAGE BangPatterns               #-}
+{-# LANGUAGE MagicHash                  #-}
+{-# LANGUAGE NoImplicitPrelude          #-}
+{-# LANGUAGE TypeFamilies               #-}
+{-# LANGUAGE FlexibleContexts           #-}
+{-# LANGUAGE CPP                        #-}
+module Basement.UTF8.Helper
+    where
+
+import           Basement.Compat.Base
+import           Basement.Compat.Primitive
+import           Basement.Types.OffsetSize
+import           Basement.UTF8.Types
+import           Basement.Bits
+import           GHC.Prim
+import           GHC.Types
+import           GHC.Word
+
+-- mask an UTF8 continuation byte (stripping the leading 10 and returning 6 valid bits)
+maskContinuation# :: Word# -> Word#
+maskContinuation# v = and# v 0x3f##
+{-# INLINE maskContinuation# #-}
+
+-- mask a UTF8 header for 2 bytes encoding (110xxxxx and 5 valid bits)
+maskHeader2# :: Word# -> Word#
+maskHeader2# h = and# h 0x1f##
+{-# INLINE maskHeader2# #-}
+
+-- mask a UTF8 header for 3 bytes encoding (1110xxxx and 4 valid bits)
+maskHeader3# :: Word# -> Word#
+maskHeader3# h = and# h 0xf##
+{-# INLINE maskHeader3# #-}
+
+-- mask a UTF8 header for 4 bytes encoding (11110xxx and 3 valid bits)
+maskHeader4# :: Word# -> Word#
+maskHeader4# h = and# h 0x7##
+{-# INLINE maskHeader4# #-}
+
+or3# :: Word# -> Word# -> Word# -> Word#
+or3# a b c = or# a (or# b c)
+{-# INLINE or3# #-}
+
+or4# :: Word# -> Word# -> Word# -> Word# -> Word#
+or4# a b c d = or# (or# a b) (or# c d)
+{-# INLINE or4# #-}
+
+toChar# :: Word# -> Char
+toChar# w = C# (chr# (word2Int# w))
+{-# INLINE toChar# #-}
+
+toChar1 :: StepASCII -> Char
+toChar1 (StepASCII (W8# w)) = C# (word8ToChar# w)
+
+toChar2 :: StepASCII -> Word8 -> Char
+toChar2 (StepASCII (W8# b1)) (W8# b2) =
+    toChar# (or# (uncheckedShiftL# (maskHeader2# w1) 6#) (maskContinuation# w2))
+  where
+    w1 = word8ToWord# b1
+    w2 = word8ToWord# b2
+
+toChar3 :: StepASCII -> Word8 -> Word8 -> Char
+toChar3 (StepASCII (W8# b1)) (W8# b2) (W8# b3) =
+    toChar# (or3# (uncheckedShiftL# (maskHeader3# w1) 12#)
+                  (uncheckedShiftL# (maskContinuation# w2) 6#)
+                  (maskContinuation# w3)
+            )
+  where
+    w1 = word8ToWord# b1
+    w2 = word8ToWord# b2
+    w3 = word8ToWord# b3
+
+toChar4 :: StepASCII -> Word8 -> Word8 -> Word8 -> Char
+toChar4 (StepASCII (W8# b1)) (W8# b2) (W8# b3) (W8# b4) =
+    toChar# (or4# (uncheckedShiftL# (maskHeader4# w1) 18#)
+                  (uncheckedShiftL# (maskContinuation# w2) 12#)
+                  (uncheckedShiftL# (maskContinuation# w3) 6#)
+                  (maskContinuation# w4)
+            )
+  where
+    w1 = word8ToWord# b1
+    w2 = word8ToWord# b2
+    w3 = word8ToWord# b3
+    w4 = word8ToWord# b4
+
+-- | Different way to encode a Character in UTF8 represented as an ADT
+data UTF8Char =
+      UTF8_1 {-# UNPACK #-} !Word8
+    | UTF8_2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
+    | UTF8_3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
+    | UTF8_4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
+
+-- | Transform a Unicode code point 'Char' into
+--
+-- note that we expect here a valid unicode code point in the *allowed* range.
+-- bits will be lost if going above 0x10ffff
+asUTF8Char :: Char -> UTF8Char
+asUTF8Char !(C# c)
+  | bool# (ltWord# x 0x80##   ) = encode1
+  | bool# (ltWord# x 0x800##  ) = encode2
+  | bool# (ltWord# x 0x10000##) = encode3
+  | otherwise                   = encode4
+    where
+      !x = int2Word# (ord# c)
+
+      encode1 = UTF8_1 (W8# (wordToWord8# x))
+      encode2 =
+          let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 6#) 0xc0##))
+              !x2 = toContinuation x
+           in UTF8_2 x1 x2
+      encode3 =
+          let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 12#) 0xe0##))
+              !x2 = toContinuation (uncheckedShiftRL# x 6#)
+              !x3 = toContinuation x
+           in UTF8_3 x1 x2 x3
+      encode4 =
+          let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 18#) 0xf0##))
+              !x2 = toContinuation (uncheckedShiftRL# x 12#)
+              !x3 = toContinuation (uncheckedShiftRL# x 6#)
+              !x4 = toContinuation x
+           in UTF8_4 x1 x2 x3 x4
+
+      toContinuation :: Word# -> Word8
+      toContinuation w = W8# (wordToWord8# (or# (and# w 0x3f##) 0x80##))
+      {-# INLINE toContinuation #-}
+
+-- given the encoding of UTF8 Char, get the number of bytes of this sequence
+numBytes :: UTF8Char -> CountOf Word8
+numBytes UTF8_1{} = CountOf 1
+numBytes UTF8_2{} = CountOf 2
+numBytes UTF8_3{} = CountOf 3
+numBytes UTF8_4{} = CountOf 4
+
+-- given the leading byte of a utf8 sequence, get the number of bytes of this sequence
+skipNextHeaderValue :: Word8 -> CountOf Word8
+skipNextHeaderValue !x
+    | x < 0xC0  = CountOf 1 -- 0b11000000
+    | x < 0xE0  = CountOf 2 -- 0b11100000
+    | x < 0xF0  = CountOf 3 -- 0b11110000
+    | otherwise = CountOf 4
+{-# INLINE skipNextHeaderValue #-}
+
+headerIsAscii :: StepASCII -> Bool
+headerIsAscii (StepASCII x) = x < 0x80
+
+charToBytes :: Int -> CountOf Word8
+charToBytes c
+    | c < 0x80     = CountOf 1
+    | c < 0x800    = CountOf 2
+    | c < 0x10000  = CountOf 3
+    | c < 0x110000 = CountOf 4
+    | otherwise    = error ("invalid code point: " `mappend` show c)
+
+-- | Encode a Char into a CharUTF8
+encodeCharUTF8 :: Char -> CharUTF8
+encodeCharUTF8 !(C# c)
+    | bool# (ltWord# x 0x80##   ) = CharUTF8 (W32# (wordToWord32# x))
+    | bool# (ltWord# x 0x800##  ) = CharUTF8 (W32# (wordToWord32# encode2))
+    | bool# (ltWord# x 0x10000##) = CharUTF8 (W32# (wordToWord32# encode3))
+    | otherwise                   = CharUTF8 (W32# (wordToWord32# encode4))
+  where
+    !x = int2Word# (ord# c)
+
+    -- clearing mask, clearing all the bits that need to be clear as per the UTF8 encoding
+    mask2 = 0x0000bfdf## -- 1 continuation , 5 bits header
+    mask3 = 0x00bfbfef## -- 2 continuations, 4 bits header
+    mask4 = 0xbfbfbff7## -- 3 continuations, 3 bits header
+
+    -- setting mask, settings all the bits that need to be set per the UTF8 encoding
+    set2  = 0x000080c0## -- 10xxxxxx     110xxxxx
+    set3  = 0x008080e0## -- 10xxxxxx * 2 1110xxxx
+    set4  = 0x808080f0## -- 10xxxxxx * 3 11111xxx
+
+    encode2 = and# mask2 (or3# set2
+                               (uncheckedShiftRL# x 6#) -- 5 bits to 1st byte
+                               (uncheckedShiftL# x 8# ) -- move lowest bits to the 2nd byte
+                         )
+    encode3 = and# mask3 (or4# set3
+                               (uncheckedShiftRL# x 12#) -- 4 bits to 1st byte
+                               (and# 0x3f00## (uncheckedShiftL# x 2#)) -- 6 bits to the 2nd byte
+                               (uncheckedShiftL# x 16# ) -- move lowest bits to the 3rd byte
+                         )
+    encode4 = and# mask4 (or4# set4
+                               (uncheckedShiftRL# x 18#) -- 3 bits to 1st byte
+                               (or# (and# 0x3f00## (uncheckedShiftRL# x 4#))   -- 6 bits to the 2nd byte
+                                    (and# 0x3f0000## (uncheckedShiftL# x 10#)) -- 6 bits to the 3nd byte
+                               )
+                               (uncheckedShiftL# x 24# ) -- move lowest bits to the 4rd byte
+                         )
+
+-- | decode a CharUTF8 into a Char
+--
+-- If the value inside a CharUTF8 is not properly encoded, this will result in violation
+-- of the Char invariants
+decodeCharUTF8 :: CharUTF8 -> Char
+decodeCharUTF8 c@(CharUTF8 !(W32# w_))
+    | isCharUTF8Case1 c = toChar# w
+    | isCharUTF8Case2 c = encode2
+    | isCharUTF8Case3 c = encode3
+    | otherwise         = encode4
+  where
+    w = word32ToWord# w_
+    encode2 =
+        toChar# (or# (uncheckedShiftL# (maskHeader2# w) 6#)
+                     (maskContinuation# (uncheckedShiftRL# w 8#))
+                )
+    encode3 =
+        toChar# (or3# (uncheckedShiftL# (maskHeader3# w) 12#)
+                      (uncheckedShiftRL# (and# 0x3f00## w) 8#)
+                      (maskContinuation# (uncheckedShiftRL# w 16#))
+                )
+    encode4 =
+        toChar# (or4# (uncheckedShiftL#  (maskHeader4# w) 18#)
+                      (uncheckedShiftRL# (and# 0x3f00## w) 10#)
+                      (uncheckedShiftL#  (and# 0x3f0000## w) 4#)
+                      (maskContinuation# (uncheckedShiftRL# w 24#))
+                )
+
+    -- clearing mask, removing all UTF8 metadata and keeping only signal (content)
+    --maskContent2 = 0x00003f1f## -- 1 continuation , 5 bits header
+    --maskContent3 = 0x003f3f0f## -- 2 continuations, 4 bits header
+    --maskContent4 = 0x3f3f3f07## -- 3 continuations, 3 bits header
+
+isCharUTF8Case1 :: CharUTF8 -> Bool
+isCharUTF8Case1 (CharUTF8 !w) = (w .&. 0x80) == 0
+{-# INLINE isCharUTF8Case1 #-}
+
+isCharUTF8Case2 :: CharUTF8 -> Bool
+isCharUTF8Case2 (CharUTF8 !w) = (w .&. 0x20) == 0
+{-# INLINE isCharUTF8Case2 #-}
+
+isCharUTF8Case3 :: CharUTF8 -> Bool
+isCharUTF8Case3 (CharUTF8 !w) = (w .&. 0x10) == 0
+{-# INLINE isCharUTF8Case3 #-}
+
+isCharUTF8Case4 :: CharUTF8 -> Bool
+isCharUTF8Case4 (CharUTF8 !w) = (w .&. 0x08) == 0
+{-# INLINE isCharUTF8Case4 #-}
--- a/bundled/Basement/UTF8/Table.hs
+++ b/bundled/Basement/UTF8/Table.hs
@ -0,0 +1,121 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE RebindableSyntax #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+-- |
+-- Module      : Basement.UTF8.Table
+-- License     : BSD-style
+-- Maintainer  : Vincent Hanquez <vincent@snarc.org>
+-- Stability   : experimental
+-- Portability : portable
+--
+-- UTF8 lookup tables for fast continuation & nb bytes per header queries
+{-# LANGUAGE MagicHash #-}
+module Basement.UTF8.Table
+    ( isContinuation
+    , isContinuation2
+    , isContinuation3
+    , getNbBytes
+    , isContinuation#
+    , isContinuationW#
+    , getNbBytes#
+    ) where
+
+import           GHC.Prim (Word#, Int#, Addr#, indexWord8OffAddr#, word2Int#)
+import           GHC.Types
+import           GHC.Word
+import           Basement.Compat.Base
+import           Basement.Compat.Primitive
+import           Basement.Bits
+import           Basement.UTF8.Types (StepASCII(..))
+
+-- | Check if the byte is a continuation byte
+isContinuation :: Word8 -> Bool
+isContinuation (W8# w) = isContinuation# w
+{-# INLINE isContinuation #-}
+
+isContinuation2 :: Word8 -> Word8 -> Bool
+isContinuation2 !w1 !w2 = mask w1 && mask w2
+  where
+    mask v = (v .&. 0xC0) == 0x80
+{-# INLINE isContinuation2 #-}
+
+isContinuation3 :: Word8 -> Word8 -> Word8 -> Bool
+isContinuation3 !w1 !w2 !w3 =
+    mask w1 && mask w2 && mask w3
+  where
+    mask v = (v .&. 0xC0) == 0x80
+{-# INLINE isContinuation3 #-}
+
+-- | Number of bytes associated with a specific header byte
+--
+-- If the header byte is invalid then NbBytesInvalid is returned,
+data NbBytesCont = NbBytesInvalid | NbBytesCont0 | NbBytesCont1 | NbBytesCont2 | NbBytesCont3
+
+-- | Identical to 'NbBytesCont' but doesn't allow to represent any failure.
+--
+-- Only use in validated place
+data NbBytesCont_ = NbBytesCont0_ | NbBytesCont1_ | NbBytesCont2_ | NbBytesCont3_
+
+-- | Get the number of following bytes given the first byte of a UTF8 sequence.
+getNbBytes :: StepASCII -> Int
+getNbBytes (StepASCII (W8# w)) = I# (getNbBytes# w)
+{-# INLINE getNbBytes #-}
+
+-- | Check if the byte is a continuation byte
+isContinuation# :: Word8# -> Bool
+isContinuation# w = W8# (indexWord8OffAddr# (unTable contTable) (word2Int# (word8ToWord# w))) == 0
+{-# INLINE isContinuation# #-}
+
+-- | Check if the byte is a continuation byte
+isContinuationW# :: Word# -> Bool
+isContinuationW# w = W8# (indexWord8OffAddr# (unTable contTable) (word2Int# w)) == 0
+{-# INLINE isContinuationW# #-}
+
+-- | Get the number of following bytes given the first byte of a UTF8 sequence.
+getNbBytes# :: Word8# -> Int#
+getNbBytes# w = word8ToInt# (indexWord8OffAddr# (unTable headTable) (word2Int# (word8ToWord# w)))
+{-# INLINE getNbBytes# #-}
+
+data Table = Table { unTable :: !Addr# }
+
+contTable :: Table
+contTable = Table
+        "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"#
+{-# NOINLINE contTable #-}
+
+headTable :: Table
+headTable = Table
+        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
+        \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+        \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+        \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+        \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
+        \\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\
+        \\x03\x03\x03\x03\x03\x03\x03\x03\xff\xff\xff\xff\xff\xff\xff\xff"#
+{-# NOINLINE headTable #-}
--- a/bundled/Basement/UTF8/Types.hs
+++ b/bundled/Basement/UTF8/Types.hs
@ -0,0 +1,73 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE RebindableSyntax #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+module Basement.UTF8.Types
+    (
+    -- * Stepper
+      Step(..)
+    , StepBack(..)
+    , StepASCII(..)
+    , StepDigit(..)
+    , isValidStepASCII
+    , isValidStepDigit
+    -- * Unicode Errors
+    , ValidationFailure(..)
+    -- * UTF8 Encoded 'Char'
+    , CharUTF8(..)
+    -- * Case Conversion
+    , CM (..)
+    ) where
+
+import           Basement.Compat.Base
+import           Basement.Types.OffsetSize
+
+-- | Step when walking a String
+--
+-- this is a return value composed of :
+-- * the unicode code point read (Char) which need to be
+--   between 0 and 0x10ffff (inclusive)
+-- * The next offset to start reading the next unicode code point (or end)
+data Step = Step {-# UNPACK #-} !Char {-# UNPACK #-} !(Offset Word8)
+
+-- | Similar to Step but used when processing the string from the end.
+--
+-- The stepper is thus the previous character, and the offset of
+-- the beginning of the previous character
+data StepBack = StepBack {-# UNPACK #-} !Char {-# UNPACK #-} !(Offset Word8)
+
+-- | Step when processing digits. the value is between 0 and 9 to be valid
+newtype StepDigit = StepDigit Word8
+
+-- | Step when processing ASCII character
+newtype StepASCII = StepASCII { stepAsciiRawValue :: Word8 }
+
+-- | Specialized tuple used for case mapping.
+data CM = CM {-# UNPACK #-} !Char {-# UNPACK #-} !Char {-# UNPACK #-} !Char deriving (Eq)
+
+-- | Represent an already encoded UTF8 Char where the the lowest 8 bits is the start of the
+-- sequence. If this contains a multi bytes sequence then each higher 8 bits are filled with
+-- the remaining sequence 8 bits per 8 bits.
+--
+-- For example:
+-- 'A' => U+0041  => 41          => 0x00000041
+-- '€  => U+20AC  => E2 82 AC    => 0x00AC82E2
+-- '𐍈' => U+10348 => F0 90 8D 88 => 0x888D90F0
+--
+newtype CharUTF8 = CharUTF8 Word32
+
+isValidStepASCII :: StepASCII -> Bool
+isValidStepASCII (StepASCII w) = w < 0x80
+
+isValidStepDigit :: StepDigit -> Bool
+isValidStepDigit (StepDigit w) = w < 0xa
+
+-- | Possible failure related to validating bytes of UTF8 sequences.
+data ValidationFailure = InvalidHeader
+                       | InvalidContinuation
+                       | MissingByte
+                       | BuildingFailure
+                       deriving (Show,Eq,Typeable)
+
+instance Exception ValidationFailure