Initial commit

This commit is contained in:
La Ancapo 2026-01-25 02:27:22 +01:00
commit c101616e62
309 changed files with 53937 additions and 0 deletions

View file

@ -0,0 +1,240 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RebindableSyntax #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- |
-- Module : Basement.String
-- License : BSD-style
-- Maintainer : Foundation
--
-- A String type backed by a UTF8 encoded byte array and all the necessary
-- functions to manipulate the string.
--
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE CPP #-}
module Basement.UTF8.Base
where
import GHC.ST (ST, runST)
import GHC.Types
import GHC.Word
import GHC.Prim
import GHC.Exts (build)
import Basement.Compat.Base
import Basement.Numerical.Additive
import Basement.Compat.Bifunctor
import Basement.NormalForm
import Basement.Types.OffsetSize
import Basement.PrimType
import Basement.Monad
import Basement.FinalPtr
import Basement.UTF8.Helper
import Basement.UTF8.Types
import qualified Basement.Alg.UTF8 as UTF8
import Basement.UArray (UArray)
import Basement.Block (MutableBlock)
import qualified Basement.Block.Mutable as BLK
import qualified Basement.UArray as Vec
import qualified Basement.UArray as C
import qualified Basement.UArray.Mutable as MVec
import Basement.UArray.Base as Vec (offset, pureST, onBackend, ValidRange(..), offsetsValidRange)
import GHC.CString (unpackCString#, unpackCStringUtf8#)
import Data.Data
import Basement.Compat.ExtList as List
import Basement.Compat.Semigroup (Semigroup)
-- | Opaque packed array of characters in the UTF8 encoding
newtype String = String (UArray Word8)
deriving (Typeable, Semigroup, Monoid, Eq, Ord)
-- | Mutable String Buffer.
--
-- Use as an *append* buffer, as UTF8 variable encoding
-- doesn't really allow to change previously written
-- character without potentially shifting bytes.
newtype MutableString st = MutableString (MVec.MUArray Word8 st)
deriving (Typeable)
instance Show String where
show = show . sToList
instance IsString String where
fromString = sFromList
instance IsList String where
type Item String = Char
fromList = sFromList
toList = sToList
instance Data String where
toConstr s = mkConstr stringType (show s) [] Prefix
dataTypeOf _ = stringType
gunfold _ _ = error "gunfold"
instance NormalForm String where
toNormalForm (String ba) = toNormalForm ba
stringType :: DataType
stringType = mkNoRepType "Foundation.String"
-- | size in bytes.
--
-- this size is available in o(1)
size :: String -> CountOf Word8
size (String ba) = Vec.length ba
-- | Convert a String to a list of characters
--
-- The list is lazily created as evaluation needed
sToList :: String -> [Char]
sToList (String arr) = Vec.onBackend onBA onAddr arr
where
(Vec.ValidRange !start !end) = Vec.offsetsValidRange arr
onBA ba@(BLK.Block _) = loop start
where
loop !idx
| idx == end = []
| otherwise = let !(Step c idx') = UTF8.next ba idx in c : loop idx'
onAddr fptr ptr@(Ptr _) = pureST (loop start)
where
loop !idx
| idx == end = []
| otherwise = let !(Step c idx') = UTF8.next ptr idx in c : loop idx'
{-# NOINLINE sToList #-}
sToListStream (String arr) k z = Vec.onBackend onBA onAddr arr
where
(Vec.ValidRange !start !end) = Vec.offsetsValidRange arr
onBA ba@(BLK.Block _) = loop start
where
loop !idx
| idx == end = z
| otherwise = let !(Step c idx') = UTF8.next ba idx in c `k` loop idx'
onAddr fptr ptr@(Ptr _) = pureST (loop start)
where
loop !idx
| idx == end = z
| otherwise = let !(Step c idx') = UTF8.next ptr idx in c `k` loop idx'
{-# RULES "String sToList" [~1] forall s . sToList s = build (\ k z -> sToListStream s k z) #-}
{-# RULES "String toList" [~1] forall s . toList s = build (\ k z -> sToListStream s k z) #-}
{-# RULES "String sFromList" forall s . sFromList (unpackCString# s) = fromModified s #-}
{-# RULES "String sFromList" forall s . sFromList (unpackCStringUtf8# s) = fromModified s #-}
-- | assuming the given Addr# is a valid modified UTF-8 sequence of bytes
--
-- We only modify the given Unicode Null-character (0xC080) into a null bytes
--
-- FIXME: need to evaluate the kind of modified UTF8 GHC is actually expecting
-- it is plausible they only handle the Null Bytes, which this function actually
-- does.
fromModified :: Addr# -> String
fromModified addr = countAndCopy 0 0
where
countAndCopy :: CountOf Word8 -> Offset Word8 -> String
countAndCopy count ofs =
case primAddrIndex addr ofs of
0x00 -> runST $ do
mb <- MVec.newNative_ count (copy count)
String <$> Vec.unsafeFreeze mb
0xC0 -> case primAddrIndex addr (ofs+1) of
0x80 -> countAndCopy (count+1) (ofs+2)
_ -> countAndCopy (count+2) (ofs+2)
_ -> countAndCopy (count+1) (ofs+1)
copy :: CountOf Word8 -> MutableBlock Word8 st -> ST st ()
copy count mba = loop 0 0
where loop o i
| o .==# count = pure ()
| otherwise =
case primAddrIndex addr i of
0xC0 -> case primAddrIndex addr (i+1) of
0x80 -> BLK.unsafeWrite mba o 0x00 >> loop (o+1) (i+2)
b2 -> BLK.unsafeWrite mba o 0xC0 >> BLK.unsafeWrite mba (o+1) b2 >> loop (o+2) (i+2)
b1 -> BLK.unsafeWrite mba o b1 >> loop (o+1) (i+1)
-- | Create a new String from a list of characters
--
-- The list is strictly and fully evaluated before
-- creating the new String, as the size need to be
-- computed before filling.
sFromList :: [Char] -> String
sFromList l = runST (new bytes >>= startCopy)
where
-- count how many bytes
!bytes = List.sum $ fmap (charToBytes . fromEnum) l
startCopy :: MutableString (PrimState (ST st)) -> ST st String
startCopy ms = loop 0 l
where
loop _ [] = freeze ms
loop idx (c:xs) = write ms idx c >>= \idx' -> loop idx' xs
{-# INLINE [0] sFromList #-}
next :: String -> Offset8 -> Step
next (String array) !n = Vec.onBackend nextBA nextAddr array
where
!start = Vec.offset array
reoffset (Step a ofs) = Step a (ofs `offsetSub` start)
nextBA ba@(BLK.Block _) = reoffset (UTF8.next ba (start + n))
nextAddr _ ptr@(Ptr _) = pureST $ reoffset (UTF8.next ptr (start + n))
prev :: String -> Offset8 -> StepBack
prev (String array) !n = Vec.onBackend prevBA prevAddr array
where
!start = Vec.offset array
reoffset (StepBack a ofs) = StepBack a (ofs `offsetSub` start)
prevBA ba@(BLK.Block _) = reoffset (UTF8.prev ba (start + n))
prevAddr _ ptr@(Ptr _) = pureST $ reoffset (UTF8.prev ptr (start + n))
-- A variant of 'next' when you want the next character
-- to be ASCII only.
nextAscii :: String -> Offset8 -> StepASCII
nextAscii (String ba) n = StepASCII w
where
!w = Vec.unsafeIndex ba n
expectAscii :: String -> Offset8 -> Word8 -> Bool
expectAscii (String ba) n v = Vec.unsafeIndex ba n == v
{-# INLINE expectAscii #-}
write :: PrimMonad prim => MutableString (PrimState prim) -> Offset8 -> Char -> prim Offset8
write (MutableString marray) ofs c =
MVec.onMutableBackend (\mba@(BLK.MutableBlock _) -> UTF8.writeUTF8 mba (start + ofs) c)
(\fptr -> withFinalPtr fptr $ \ptr@(Ptr _) -> UTF8.writeUTF8 ptr (start + ofs) c)
marray
where start = MVec.mutableOffset marray
-- | Allocate a MutableString of a specific size in bytes.
new :: PrimMonad prim
=> CountOf Word8 -- ^ in number of bytes, not of elements.
-> prim (MutableString (PrimState prim))
new n = MutableString `fmap` MVec.new n
newNative :: PrimMonad prim
=> CountOf Word8 -- ^ in number of bytes, not of elements.
-> (MutableBlock Word8 (PrimState prim) -> prim a)
-> prim (a, MutableString (PrimState prim))
newNative n f = second MutableString `fmap` MVec.newNative n f
newNative_ :: PrimMonad prim
=> CountOf Word8 -- ^ in number of bytes, not of elements.
-> (MutableBlock Word8 (PrimState prim) -> prim ())
-> prim (MutableString (PrimState prim))
newNative_ n f = MutableString `fmap` MVec.newNative_ n f
freeze :: PrimMonad prim => MutableString (PrimState prim) -> prim String
freeze (MutableString mba) = String `fmap` C.unsafeFreeze mba
{-# INLINE freeze #-}
freezeShrink :: PrimMonad prim
=> CountOf Word8
-> MutableString (PrimState prim)
-> prim String
freezeShrink n (MutableString mba) = String `fmap` C.unsafeFreezeShrink mba n

View file

@ -0,0 +1,251 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RebindableSyntax #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- |
-- Module : Basement.UTF8.Helper
-- License : BSD-style
-- Maintainer : Foundation
--
-- Some low level helpers to use UTF8
--
-- Most helpers are lowlevel and unsafe, don't use
-- directly.
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE CPP #-}
module Basement.UTF8.Helper
where
import Basement.Compat.Base
import Basement.Compat.Primitive
import Basement.Types.OffsetSize
import Basement.UTF8.Types
import Basement.Bits
import GHC.Prim
import GHC.Types
import GHC.Word
-- mask an UTF8 continuation byte (stripping the leading 10 and returning 6 valid bits)
maskContinuation# :: Word# -> Word#
maskContinuation# v = and# v 0x3f##
{-# INLINE maskContinuation# #-}
-- mask a UTF8 header for 2 bytes encoding (110xxxxx and 5 valid bits)
maskHeader2# :: Word# -> Word#
maskHeader2# h = and# h 0x1f##
{-# INLINE maskHeader2# #-}
-- mask a UTF8 header for 3 bytes encoding (1110xxxx and 4 valid bits)
maskHeader3# :: Word# -> Word#
maskHeader3# h = and# h 0xf##
{-# INLINE maskHeader3# #-}
-- mask a UTF8 header for 4 bytes encoding (11110xxx and 3 valid bits)
maskHeader4# :: Word# -> Word#
maskHeader4# h = and# h 0x7##
{-# INLINE maskHeader4# #-}
or3# :: Word# -> Word# -> Word# -> Word#
or3# a b c = or# a (or# b c)
{-# INLINE or3# #-}
or4# :: Word# -> Word# -> Word# -> Word# -> Word#
or4# a b c d = or# (or# a b) (or# c d)
{-# INLINE or4# #-}
toChar# :: Word# -> Char
toChar# w = C# (chr# (word2Int# w))
{-# INLINE toChar# #-}
toChar1 :: StepASCII -> Char
toChar1 (StepASCII (W8# w)) = C# (word8ToChar# w)
toChar2 :: StepASCII -> Word8 -> Char
toChar2 (StepASCII (W8# b1)) (W8# b2) =
toChar# (or# (uncheckedShiftL# (maskHeader2# w1) 6#) (maskContinuation# w2))
where
w1 = word8ToWord# b1
w2 = word8ToWord# b2
toChar3 :: StepASCII -> Word8 -> Word8 -> Char
toChar3 (StepASCII (W8# b1)) (W8# b2) (W8# b3) =
toChar# (or3# (uncheckedShiftL# (maskHeader3# w1) 12#)
(uncheckedShiftL# (maskContinuation# w2) 6#)
(maskContinuation# w3)
)
where
w1 = word8ToWord# b1
w2 = word8ToWord# b2
w3 = word8ToWord# b3
toChar4 :: StepASCII -> Word8 -> Word8 -> Word8 -> Char
toChar4 (StepASCII (W8# b1)) (W8# b2) (W8# b3) (W8# b4) =
toChar# (or4# (uncheckedShiftL# (maskHeader4# w1) 18#)
(uncheckedShiftL# (maskContinuation# w2) 12#)
(uncheckedShiftL# (maskContinuation# w3) 6#)
(maskContinuation# w4)
)
where
w1 = word8ToWord# b1
w2 = word8ToWord# b2
w3 = word8ToWord# b3
w4 = word8ToWord# b4
-- | Different way to encode a Character in UTF8 represented as an ADT
data UTF8Char =
UTF8_1 {-# UNPACK #-} !Word8
| UTF8_2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
| UTF8_3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
| UTF8_4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
-- | Transform a Unicode code point 'Char' into
--
-- note that we expect here a valid unicode code point in the *allowed* range.
-- bits will be lost if going above 0x10ffff
asUTF8Char :: Char -> UTF8Char
asUTF8Char !(C# c)
| bool# (ltWord# x 0x80## ) = encode1
| bool# (ltWord# x 0x800## ) = encode2
| bool# (ltWord# x 0x10000##) = encode3
| otherwise = encode4
where
!x = int2Word# (ord# c)
encode1 = UTF8_1 (W8# (wordToWord8# x))
encode2 =
let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 6#) 0xc0##))
!x2 = toContinuation x
in UTF8_2 x1 x2
encode3 =
let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 12#) 0xe0##))
!x2 = toContinuation (uncheckedShiftRL# x 6#)
!x3 = toContinuation x
in UTF8_3 x1 x2 x3
encode4 =
let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 18#) 0xf0##))
!x2 = toContinuation (uncheckedShiftRL# x 12#)
!x3 = toContinuation (uncheckedShiftRL# x 6#)
!x4 = toContinuation x
in UTF8_4 x1 x2 x3 x4
toContinuation :: Word# -> Word8
toContinuation w = W8# (wordToWord8# (or# (and# w 0x3f##) 0x80##))
{-# INLINE toContinuation #-}
-- given the encoding of UTF8 Char, get the number of bytes of this sequence
numBytes :: UTF8Char -> CountOf Word8
numBytes UTF8_1{} = CountOf 1
numBytes UTF8_2{} = CountOf 2
numBytes UTF8_3{} = CountOf 3
numBytes UTF8_4{} = CountOf 4
-- given the leading byte of a utf8 sequence, get the number of bytes of this sequence
skipNextHeaderValue :: Word8 -> CountOf Word8
skipNextHeaderValue !x
| x < 0xC0 = CountOf 1 -- 0b11000000
| x < 0xE0 = CountOf 2 -- 0b11100000
| x < 0xF0 = CountOf 3 -- 0b11110000
| otherwise = CountOf 4
{-# INLINE skipNextHeaderValue #-}
headerIsAscii :: StepASCII -> Bool
headerIsAscii (StepASCII x) = x < 0x80
charToBytes :: Int -> CountOf Word8
charToBytes c
| c < 0x80 = CountOf 1
| c < 0x800 = CountOf 2
| c < 0x10000 = CountOf 3
| c < 0x110000 = CountOf 4
| otherwise = error ("invalid code point: " `mappend` show c)
-- | Encode a Char into a CharUTF8
encodeCharUTF8 :: Char -> CharUTF8
encodeCharUTF8 !(C# c)
| bool# (ltWord# x 0x80## ) = CharUTF8 (W32# (wordToWord32# x))
| bool# (ltWord# x 0x800## ) = CharUTF8 (W32# (wordToWord32# encode2))
| bool# (ltWord# x 0x10000##) = CharUTF8 (W32# (wordToWord32# encode3))
| otherwise = CharUTF8 (W32# (wordToWord32# encode4))
where
!x = int2Word# (ord# c)
-- clearing mask, clearing all the bits that need to be clear as per the UTF8 encoding
mask2 = 0x0000bfdf## -- 1 continuation , 5 bits header
mask3 = 0x00bfbfef## -- 2 continuations, 4 bits header
mask4 = 0xbfbfbff7## -- 3 continuations, 3 bits header
-- setting mask, settings all the bits that need to be set per the UTF8 encoding
set2 = 0x000080c0## -- 10xxxxxx 110xxxxx
set3 = 0x008080e0## -- 10xxxxxx * 2 1110xxxx
set4 = 0x808080f0## -- 10xxxxxx * 3 11111xxx
encode2 = and# mask2 (or3# set2
(uncheckedShiftRL# x 6#) -- 5 bits to 1st byte
(uncheckedShiftL# x 8# ) -- move lowest bits to the 2nd byte
)
encode3 = and# mask3 (or4# set3
(uncheckedShiftRL# x 12#) -- 4 bits to 1st byte
(and# 0x3f00## (uncheckedShiftL# x 2#)) -- 6 bits to the 2nd byte
(uncheckedShiftL# x 16# ) -- move lowest bits to the 3rd byte
)
encode4 = and# mask4 (or4# set4
(uncheckedShiftRL# x 18#) -- 3 bits to 1st byte
(or# (and# 0x3f00## (uncheckedShiftRL# x 4#)) -- 6 bits to the 2nd byte
(and# 0x3f0000## (uncheckedShiftL# x 10#)) -- 6 bits to the 3nd byte
)
(uncheckedShiftL# x 24# ) -- move lowest bits to the 4rd byte
)
-- | decode a CharUTF8 into a Char
--
-- If the value inside a CharUTF8 is not properly encoded, this will result in violation
-- of the Char invariants
decodeCharUTF8 :: CharUTF8 -> Char
decodeCharUTF8 c@(CharUTF8 !(W32# w_))
| isCharUTF8Case1 c = toChar# w
| isCharUTF8Case2 c = encode2
| isCharUTF8Case3 c = encode3
| otherwise = encode4
where
w = word32ToWord# w_
encode2 =
toChar# (or# (uncheckedShiftL# (maskHeader2# w) 6#)
(maskContinuation# (uncheckedShiftRL# w 8#))
)
encode3 =
toChar# (or3# (uncheckedShiftL# (maskHeader3# w) 12#)
(uncheckedShiftRL# (and# 0x3f00## w) 8#)
(maskContinuation# (uncheckedShiftRL# w 16#))
)
encode4 =
toChar# (or4# (uncheckedShiftL# (maskHeader4# w) 18#)
(uncheckedShiftRL# (and# 0x3f00## w) 10#)
(uncheckedShiftL# (and# 0x3f0000## w) 4#)
(maskContinuation# (uncheckedShiftRL# w 24#))
)
-- clearing mask, removing all UTF8 metadata and keeping only signal (content)
--maskContent2 = 0x00003f1f## -- 1 continuation , 5 bits header
--maskContent3 = 0x003f3f0f## -- 2 continuations, 4 bits header
--maskContent4 = 0x3f3f3f07## -- 3 continuations, 3 bits header
isCharUTF8Case1 :: CharUTF8 -> Bool
isCharUTF8Case1 (CharUTF8 !w) = (w .&. 0x80) == 0
{-# INLINE isCharUTF8Case1 #-}
isCharUTF8Case2 :: CharUTF8 -> Bool
isCharUTF8Case2 (CharUTF8 !w) = (w .&. 0x20) == 0
{-# INLINE isCharUTF8Case2 #-}
isCharUTF8Case3 :: CharUTF8 -> Bool
isCharUTF8Case3 (CharUTF8 !w) = (w .&. 0x10) == 0
{-# INLINE isCharUTF8Case3 #-}
isCharUTF8Case4 :: CharUTF8 -> Bool
isCharUTF8Case4 (CharUTF8 !w) = (w .&. 0x08) == 0
{-# INLINE isCharUTF8Case4 #-}

View file

@ -0,0 +1,121 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RebindableSyntax #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- |
-- Module : Basement.UTF8.Table
-- License : BSD-style
-- Maintainer : Vincent Hanquez <vincent@snarc.org>
-- Stability : experimental
-- Portability : portable
--
-- UTF8 lookup tables for fast continuation & nb bytes per header queries
{-# LANGUAGE MagicHash #-}
module Basement.UTF8.Table
( isContinuation
, isContinuation2
, isContinuation3
, getNbBytes
, isContinuation#
, isContinuationW#
, getNbBytes#
) where
import GHC.Prim (Word#, Int#, Addr#, indexWord8OffAddr#, word2Int#)
import GHC.Types
import GHC.Word
import Basement.Compat.Base
import Basement.Compat.Primitive
import Basement.Bits
import Basement.UTF8.Types (StepASCII(..))
-- | Check if the byte is a continuation byte
isContinuation :: Word8 -> Bool
isContinuation (W8# w) = isContinuation# w
{-# INLINE isContinuation #-}
isContinuation2 :: Word8 -> Word8 -> Bool
isContinuation2 !w1 !w2 = mask w1 && mask w2
where
mask v = (v .&. 0xC0) == 0x80
{-# INLINE isContinuation2 #-}
isContinuation3 :: Word8 -> Word8 -> Word8 -> Bool
isContinuation3 !w1 !w2 !w3 =
mask w1 && mask w2 && mask w3
where
mask v = (v .&. 0xC0) == 0x80
{-# INLINE isContinuation3 #-}
-- | Number of bytes associated with a specific header byte
--
-- If the header byte is invalid then NbBytesInvalid is returned,
data NbBytesCont = NbBytesInvalid | NbBytesCont0 | NbBytesCont1 | NbBytesCont2 | NbBytesCont3
-- | Identical to 'NbBytesCont' but doesn't allow to represent any failure.
--
-- Only use in validated place
data NbBytesCont_ = NbBytesCont0_ | NbBytesCont1_ | NbBytesCont2_ | NbBytesCont3_
-- | Get the number of following bytes given the first byte of a UTF8 sequence.
getNbBytes :: StepASCII -> Int
getNbBytes (StepASCII (W8# w)) = I# (getNbBytes# w)
{-# INLINE getNbBytes #-}
-- | Check if the byte is a continuation byte
isContinuation# :: Word8# -> Bool
isContinuation# w = W8# (indexWord8OffAddr# (unTable contTable) (word2Int# (word8ToWord# w))) == 0
{-# INLINE isContinuation# #-}
-- | Check if the byte is a continuation byte
isContinuationW# :: Word# -> Bool
isContinuationW# w = W8# (indexWord8OffAddr# (unTable contTable) (word2Int# w)) == 0
{-# INLINE isContinuationW# #-}
-- | Get the number of following bytes given the first byte of a UTF8 sequence.
getNbBytes# :: Word8# -> Int#
getNbBytes# w = word8ToInt# (indexWord8OffAddr# (unTable headTable) (word2Int# (word8ToWord# w)))
{-# INLINE getNbBytes# #-}
data Table = Table { unTable :: !Addr# }
contTable :: Table
contTable = Table
"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"#
{-# NOINLINE contTable #-}
headTable :: Table
headTable = Table
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
\\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\
\\x03\x03\x03\x03\x03\x03\x03\x03\xff\xff\xff\xff\xff\xff\xff\xff"#
{-# NOINLINE headTable #-}

View file

@ -0,0 +1,73 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RebindableSyntax #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Basement.UTF8.Types
(
-- * Stepper
Step(..)
, StepBack(..)
, StepASCII(..)
, StepDigit(..)
, isValidStepASCII
, isValidStepDigit
-- * Unicode Errors
, ValidationFailure(..)
-- * UTF8 Encoded 'Char'
, CharUTF8(..)
-- * Case Conversion
, CM (..)
) where
import Basement.Compat.Base
import Basement.Types.OffsetSize
-- | Step when walking a String
--
-- this is a return value composed of :
-- * the unicode code point read (Char) which need to be
-- between 0 and 0x10ffff (inclusive)
-- * The next offset to start reading the next unicode code point (or end)
data Step = Step {-# UNPACK #-} !Char {-# UNPACK #-} !(Offset Word8)
-- | Similar to Step but used when processing the string from the end.
--
-- The stepper is thus the previous character, and the offset of
-- the beginning of the previous character
data StepBack = StepBack {-# UNPACK #-} !Char {-# UNPACK #-} !(Offset Word8)
-- | Step when processing digits. the value is between 0 and 9 to be valid
newtype StepDigit = StepDigit Word8
-- | Step when processing ASCII character
newtype StepASCII = StepASCII { stepAsciiRawValue :: Word8 }
-- | Specialized tuple used for case mapping.
data CM = CM {-# UNPACK #-} !Char {-# UNPACK #-} !Char {-# UNPACK #-} !Char deriving (Eq)
-- | Represent an already encoded UTF8 Char where the the lowest 8 bits is the start of the
-- sequence. If this contains a multi bytes sequence then each higher 8 bits are filled with
-- the remaining sequence 8 bits per 8 bits.
--
-- For example:
-- 'A' => U+0041 => 41 => 0x00000041
-- '€ => U+20AC => E2 82 AC => 0x00AC82E2
-- '𐍈' => U+10348 => F0 90 8D 88 => 0x888D90F0
--
newtype CharUTF8 = CharUTF8 Word32
isValidStepASCII :: StepASCII -> Bool
isValidStepASCII (StepASCII w) = w < 0x80
isValidStepDigit :: StepDigit -> Bool
isValidStepDigit (StepDigit w) = w < 0xa
-- | Possible failure related to validating bytes of UTF8 sequences.
data ValidationFailure = InvalidHeader
| InvalidContinuation
| MissingByte
| BuildingFailure
deriving (Show,Eq,Typeable)
instance Exception ValidationFailure