Initial commit
This commit is contained in:
commit
c101616e62
309 changed files with 53937 additions and 0 deletions
240
bundled/Basement/UTF8/Base.hs
Normal file
240
bundled/Basement/UTF8/Base.hs
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE RebindableSyntax #-}
|
||||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
-- |
|
||||
-- Module : Basement.String
|
||||
-- License : BSD-style
|
||||
-- Maintainer : Foundation
|
||||
--
|
||||
-- A String type backed by a UTF8 encoded byte array and all the necessary
|
||||
-- functions to manipulate the string.
|
||||
--
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||
{-# LANGUAGE MagicHash #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE CPP #-}
|
||||
module Basement.UTF8.Base
|
||||
where
|
||||
|
||||
import GHC.ST (ST, runST)
|
||||
import GHC.Types
|
||||
import GHC.Word
|
||||
import GHC.Prim
|
||||
import GHC.Exts (build)
|
||||
import Basement.Compat.Base
|
||||
import Basement.Numerical.Additive
|
||||
import Basement.Compat.Bifunctor
|
||||
import Basement.NormalForm
|
||||
import Basement.Types.OffsetSize
|
||||
import Basement.PrimType
|
||||
import Basement.Monad
|
||||
import Basement.FinalPtr
|
||||
import Basement.UTF8.Helper
|
||||
import Basement.UTF8.Types
|
||||
import qualified Basement.Alg.UTF8 as UTF8
|
||||
import Basement.UArray (UArray)
|
||||
import Basement.Block (MutableBlock)
|
||||
import qualified Basement.Block.Mutable as BLK
|
||||
import qualified Basement.UArray as Vec
|
||||
import qualified Basement.UArray as C
|
||||
import qualified Basement.UArray.Mutable as MVec
|
||||
import Basement.UArray.Base as Vec (offset, pureST, onBackend, ValidRange(..), offsetsValidRange)
|
||||
import GHC.CString (unpackCString#, unpackCStringUtf8#)
|
||||
|
||||
import Data.Data
|
||||
import Basement.Compat.ExtList as List
|
||||
import Basement.Compat.Semigroup (Semigroup)
|
||||
|
||||
-- | Opaque packed array of characters in the UTF8 encoding
|
||||
newtype String = String (UArray Word8)
|
||||
deriving (Typeable, Semigroup, Monoid, Eq, Ord)
|
||||
|
||||
-- | Mutable String Buffer.
|
||||
--
|
||||
-- Use as an *append* buffer, as UTF8 variable encoding
|
||||
-- doesn't really allow to change previously written
|
||||
-- character without potentially shifting bytes.
|
||||
newtype MutableString st = MutableString (MVec.MUArray Word8 st)
|
||||
deriving (Typeable)
|
||||
|
||||
instance Show String where
|
||||
show = show . sToList
|
||||
instance IsString String where
|
||||
fromString = sFromList
|
||||
instance IsList String where
|
||||
type Item String = Char
|
||||
fromList = sFromList
|
||||
toList = sToList
|
||||
|
||||
instance Data String where
|
||||
toConstr s = mkConstr stringType (show s) [] Prefix
|
||||
dataTypeOf _ = stringType
|
||||
gunfold _ _ = error "gunfold"
|
||||
|
||||
instance NormalForm String where
|
||||
toNormalForm (String ba) = toNormalForm ba
|
||||
|
||||
stringType :: DataType
|
||||
stringType = mkNoRepType "Foundation.String"
|
||||
|
||||
-- | size in bytes.
|
||||
--
|
||||
-- this size is available in o(1)
|
||||
size :: String -> CountOf Word8
|
||||
size (String ba) = Vec.length ba
|
||||
|
||||
-- | Convert a String to a list of characters
|
||||
--
|
||||
-- The list is lazily created as evaluation needed
|
||||
sToList :: String -> [Char]
|
||||
sToList (String arr) = Vec.onBackend onBA onAddr arr
|
||||
where
|
||||
(Vec.ValidRange !start !end) = Vec.offsetsValidRange arr
|
||||
onBA ba@(BLK.Block _) = loop start
|
||||
where
|
||||
loop !idx
|
||||
| idx == end = []
|
||||
| otherwise = let !(Step c idx') = UTF8.next ba idx in c : loop idx'
|
||||
onAddr fptr ptr@(Ptr _) = pureST (loop start)
|
||||
where
|
||||
loop !idx
|
||||
| idx == end = []
|
||||
| otherwise = let !(Step c idx') = UTF8.next ptr idx in c : loop idx'
|
||||
{-# NOINLINE sToList #-}
|
||||
|
||||
sToListStream (String arr) k z = Vec.onBackend onBA onAddr arr
|
||||
where
|
||||
(Vec.ValidRange !start !end) = Vec.offsetsValidRange arr
|
||||
onBA ba@(BLK.Block _) = loop start
|
||||
where
|
||||
loop !idx
|
||||
| idx == end = z
|
||||
| otherwise = let !(Step c idx') = UTF8.next ba idx in c `k` loop idx'
|
||||
onAddr fptr ptr@(Ptr _) = pureST (loop start)
|
||||
where
|
||||
loop !idx
|
||||
| idx == end = z
|
||||
| otherwise = let !(Step c idx') = UTF8.next ptr idx in c `k` loop idx'
|
||||
|
||||
{-# RULES "String sToList" [~1] forall s . sToList s = build (\ k z -> sToListStream s k z) #-}
|
||||
{-# RULES "String toList" [~1] forall s . toList s = build (\ k z -> sToListStream s k z) #-}
|
||||
|
||||
{-# RULES "String sFromList" forall s . sFromList (unpackCString# s) = fromModified s #-}
|
||||
{-# RULES "String sFromList" forall s . sFromList (unpackCStringUtf8# s) = fromModified s #-}
|
||||
|
||||
-- | assuming the given Addr# is a valid modified UTF-8 sequence of bytes
|
||||
--
|
||||
-- We only modify the given Unicode Null-character (0xC080) into a null bytes
|
||||
--
|
||||
-- FIXME: need to evaluate the kind of modified UTF8 GHC is actually expecting
|
||||
-- it is plausible they only handle the Null Bytes, which this function actually
|
||||
-- does.
|
||||
fromModified :: Addr# -> String
|
||||
fromModified addr = countAndCopy 0 0
|
||||
where
|
||||
countAndCopy :: CountOf Word8 -> Offset Word8 -> String
|
||||
countAndCopy count ofs =
|
||||
case primAddrIndex addr ofs of
|
||||
0x00 -> runST $ do
|
||||
mb <- MVec.newNative_ count (copy count)
|
||||
String <$> Vec.unsafeFreeze mb
|
||||
0xC0 -> case primAddrIndex addr (ofs+1) of
|
||||
0x80 -> countAndCopy (count+1) (ofs+2)
|
||||
_ -> countAndCopy (count+2) (ofs+2)
|
||||
_ -> countAndCopy (count+1) (ofs+1)
|
||||
|
||||
copy :: CountOf Word8 -> MutableBlock Word8 st -> ST st ()
|
||||
copy count mba = loop 0 0
|
||||
where loop o i
|
||||
| o .==# count = pure ()
|
||||
| otherwise =
|
||||
case primAddrIndex addr i of
|
||||
0xC0 -> case primAddrIndex addr (i+1) of
|
||||
0x80 -> BLK.unsafeWrite mba o 0x00 >> loop (o+1) (i+2)
|
||||
b2 -> BLK.unsafeWrite mba o 0xC0 >> BLK.unsafeWrite mba (o+1) b2 >> loop (o+2) (i+2)
|
||||
b1 -> BLK.unsafeWrite mba o b1 >> loop (o+1) (i+1)
|
||||
|
||||
|
||||
-- | Create a new String from a list of characters
|
||||
--
|
||||
-- The list is strictly and fully evaluated before
|
||||
-- creating the new String, as the size need to be
|
||||
-- computed before filling.
|
||||
sFromList :: [Char] -> String
|
||||
sFromList l = runST (new bytes >>= startCopy)
|
||||
where
|
||||
-- count how many bytes
|
||||
!bytes = List.sum $ fmap (charToBytes . fromEnum) l
|
||||
|
||||
startCopy :: MutableString (PrimState (ST st)) -> ST st String
|
||||
startCopy ms = loop 0 l
|
||||
where
|
||||
loop _ [] = freeze ms
|
||||
loop idx (c:xs) = write ms idx c >>= \idx' -> loop idx' xs
|
||||
{-# INLINE [0] sFromList #-}
|
||||
|
||||
next :: String -> Offset8 -> Step
|
||||
next (String array) !n = Vec.onBackend nextBA nextAddr array
|
||||
where
|
||||
!start = Vec.offset array
|
||||
reoffset (Step a ofs) = Step a (ofs `offsetSub` start)
|
||||
nextBA ba@(BLK.Block _) = reoffset (UTF8.next ba (start + n))
|
||||
nextAddr _ ptr@(Ptr _) = pureST $ reoffset (UTF8.next ptr (start + n))
|
||||
|
||||
prev :: String -> Offset8 -> StepBack
|
||||
prev (String array) !n = Vec.onBackend prevBA prevAddr array
|
||||
where
|
||||
!start = Vec.offset array
|
||||
reoffset (StepBack a ofs) = StepBack a (ofs `offsetSub` start)
|
||||
prevBA ba@(BLK.Block _) = reoffset (UTF8.prev ba (start + n))
|
||||
prevAddr _ ptr@(Ptr _) = pureST $ reoffset (UTF8.prev ptr (start + n))
|
||||
|
||||
-- A variant of 'next' when you want the next character
|
||||
-- to be ASCII only.
|
||||
nextAscii :: String -> Offset8 -> StepASCII
|
||||
nextAscii (String ba) n = StepASCII w
|
||||
where
|
||||
!w = Vec.unsafeIndex ba n
|
||||
|
||||
expectAscii :: String -> Offset8 -> Word8 -> Bool
|
||||
expectAscii (String ba) n v = Vec.unsafeIndex ba n == v
|
||||
{-# INLINE expectAscii #-}
|
||||
|
||||
write :: PrimMonad prim => MutableString (PrimState prim) -> Offset8 -> Char -> prim Offset8
|
||||
write (MutableString marray) ofs c =
|
||||
MVec.onMutableBackend (\mba@(BLK.MutableBlock _) -> UTF8.writeUTF8 mba (start + ofs) c)
|
||||
(\fptr -> withFinalPtr fptr $ \ptr@(Ptr _) -> UTF8.writeUTF8 ptr (start + ofs) c)
|
||||
marray
|
||||
where start = MVec.mutableOffset marray
|
||||
|
||||
-- | Allocate a MutableString of a specific size in bytes.
|
||||
new :: PrimMonad prim
|
||||
=> CountOf Word8 -- ^ in number of bytes, not of elements.
|
||||
-> prim (MutableString (PrimState prim))
|
||||
new n = MutableString `fmap` MVec.new n
|
||||
|
||||
newNative :: PrimMonad prim
|
||||
=> CountOf Word8 -- ^ in number of bytes, not of elements.
|
||||
-> (MutableBlock Word8 (PrimState prim) -> prim a)
|
||||
-> prim (a, MutableString (PrimState prim))
|
||||
newNative n f = second MutableString `fmap` MVec.newNative n f
|
||||
|
||||
newNative_ :: PrimMonad prim
|
||||
=> CountOf Word8 -- ^ in number of bytes, not of elements.
|
||||
-> (MutableBlock Word8 (PrimState prim) -> prim ())
|
||||
-> prim (MutableString (PrimState prim))
|
||||
newNative_ n f = MutableString `fmap` MVec.newNative_ n f
|
||||
|
||||
freeze :: PrimMonad prim => MutableString (PrimState prim) -> prim String
|
||||
freeze (MutableString mba) = String `fmap` C.unsafeFreeze mba
|
||||
{-# INLINE freeze #-}
|
||||
|
||||
freezeShrink :: PrimMonad prim
|
||||
=> CountOf Word8
|
||||
-> MutableString (PrimState prim)
|
||||
-> prim String
|
||||
freezeShrink n (MutableString mba) = String `fmap` C.unsafeFreezeShrink mba n
|
||||
251
bundled/Basement/UTF8/Helper.hs
Normal file
251
bundled/Basement/UTF8/Helper.hs
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE RebindableSyntax #-}
|
||||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
-- |
|
||||
-- Module : Basement.UTF8.Helper
|
||||
-- License : BSD-style
|
||||
-- Maintainer : Foundation
|
||||
--
|
||||
-- Some low level helpers to use UTF8
|
||||
--
|
||||
-- Most helpers are lowlevel and unsafe, don't use
|
||||
-- directly.
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE MagicHash #-}
|
||||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE CPP #-}
|
||||
module Basement.UTF8.Helper
|
||||
where
|
||||
|
||||
import Basement.Compat.Base
|
||||
import Basement.Compat.Primitive
|
||||
import Basement.Types.OffsetSize
|
||||
import Basement.UTF8.Types
|
||||
import Basement.Bits
|
||||
import GHC.Prim
|
||||
import GHC.Types
|
||||
import GHC.Word
|
||||
|
||||
-- mask an UTF8 continuation byte (stripping the leading 10 and returning 6 valid bits)
|
||||
maskContinuation# :: Word# -> Word#
|
||||
maskContinuation# v = and# v 0x3f##
|
||||
{-# INLINE maskContinuation# #-}
|
||||
|
||||
-- mask a UTF8 header for 2 bytes encoding (110xxxxx and 5 valid bits)
|
||||
maskHeader2# :: Word# -> Word#
|
||||
maskHeader2# h = and# h 0x1f##
|
||||
{-# INLINE maskHeader2# #-}
|
||||
|
||||
-- mask a UTF8 header for 3 bytes encoding (1110xxxx and 4 valid bits)
|
||||
maskHeader3# :: Word# -> Word#
|
||||
maskHeader3# h = and# h 0xf##
|
||||
{-# INLINE maskHeader3# #-}
|
||||
|
||||
-- mask a UTF8 header for 4 bytes encoding (11110xxx and 3 valid bits)
|
||||
maskHeader4# :: Word# -> Word#
|
||||
maskHeader4# h = and# h 0x7##
|
||||
{-# INLINE maskHeader4# #-}
|
||||
|
||||
or3# :: Word# -> Word# -> Word# -> Word#
|
||||
or3# a b c = or# a (or# b c)
|
||||
{-# INLINE or3# #-}
|
||||
|
||||
or4# :: Word# -> Word# -> Word# -> Word# -> Word#
|
||||
or4# a b c d = or# (or# a b) (or# c d)
|
||||
{-# INLINE or4# #-}
|
||||
|
||||
toChar# :: Word# -> Char
|
||||
toChar# w = C# (chr# (word2Int# w))
|
||||
{-# INLINE toChar# #-}
|
||||
|
||||
toChar1 :: StepASCII -> Char
|
||||
toChar1 (StepASCII (W8# w)) = C# (word8ToChar# w)
|
||||
|
||||
toChar2 :: StepASCII -> Word8 -> Char
|
||||
toChar2 (StepASCII (W8# b1)) (W8# b2) =
|
||||
toChar# (or# (uncheckedShiftL# (maskHeader2# w1) 6#) (maskContinuation# w2))
|
||||
where
|
||||
w1 = word8ToWord# b1
|
||||
w2 = word8ToWord# b2
|
||||
|
||||
toChar3 :: StepASCII -> Word8 -> Word8 -> Char
|
||||
toChar3 (StepASCII (W8# b1)) (W8# b2) (W8# b3) =
|
||||
toChar# (or3# (uncheckedShiftL# (maskHeader3# w1) 12#)
|
||||
(uncheckedShiftL# (maskContinuation# w2) 6#)
|
||||
(maskContinuation# w3)
|
||||
)
|
||||
where
|
||||
w1 = word8ToWord# b1
|
||||
w2 = word8ToWord# b2
|
||||
w3 = word8ToWord# b3
|
||||
|
||||
toChar4 :: StepASCII -> Word8 -> Word8 -> Word8 -> Char
|
||||
toChar4 (StepASCII (W8# b1)) (W8# b2) (W8# b3) (W8# b4) =
|
||||
toChar# (or4# (uncheckedShiftL# (maskHeader4# w1) 18#)
|
||||
(uncheckedShiftL# (maskContinuation# w2) 12#)
|
||||
(uncheckedShiftL# (maskContinuation# w3) 6#)
|
||||
(maskContinuation# w4)
|
||||
)
|
||||
where
|
||||
w1 = word8ToWord# b1
|
||||
w2 = word8ToWord# b2
|
||||
w3 = word8ToWord# b3
|
||||
w4 = word8ToWord# b4
|
||||
|
||||
-- | Different way to encode a Character in UTF8 represented as an ADT
|
||||
data UTF8Char =
|
||||
UTF8_1 {-# UNPACK #-} !Word8
|
||||
| UTF8_2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
|
||||
| UTF8_3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
|
||||
| UTF8_4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
|
||||
|
||||
-- | Transform a Unicode code point 'Char' into
|
||||
--
|
||||
-- note that we expect here a valid unicode code point in the *allowed* range.
|
||||
-- bits will be lost if going above 0x10ffff
|
||||
asUTF8Char :: Char -> UTF8Char
|
||||
asUTF8Char !(C# c)
|
||||
| bool# (ltWord# x 0x80## ) = encode1
|
||||
| bool# (ltWord# x 0x800## ) = encode2
|
||||
| bool# (ltWord# x 0x10000##) = encode3
|
||||
| otherwise = encode4
|
||||
where
|
||||
!x = int2Word# (ord# c)
|
||||
|
||||
encode1 = UTF8_1 (W8# (wordToWord8# x))
|
||||
encode2 =
|
||||
let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 6#) 0xc0##))
|
||||
!x2 = toContinuation x
|
||||
in UTF8_2 x1 x2
|
||||
encode3 =
|
||||
let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 12#) 0xe0##))
|
||||
!x2 = toContinuation (uncheckedShiftRL# x 6#)
|
||||
!x3 = toContinuation x
|
||||
in UTF8_3 x1 x2 x3
|
||||
encode4 =
|
||||
let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 18#) 0xf0##))
|
||||
!x2 = toContinuation (uncheckedShiftRL# x 12#)
|
||||
!x3 = toContinuation (uncheckedShiftRL# x 6#)
|
||||
!x4 = toContinuation x
|
||||
in UTF8_4 x1 x2 x3 x4
|
||||
|
||||
toContinuation :: Word# -> Word8
|
||||
toContinuation w = W8# (wordToWord8# (or# (and# w 0x3f##) 0x80##))
|
||||
{-# INLINE toContinuation #-}
|
||||
|
||||
-- given the encoding of UTF8 Char, get the number of bytes of this sequence
|
||||
numBytes :: UTF8Char -> CountOf Word8
|
||||
numBytes UTF8_1{} = CountOf 1
|
||||
numBytes UTF8_2{} = CountOf 2
|
||||
numBytes UTF8_3{} = CountOf 3
|
||||
numBytes UTF8_4{} = CountOf 4
|
||||
|
||||
-- given the leading byte of a utf8 sequence, get the number of bytes of this sequence
|
||||
skipNextHeaderValue :: Word8 -> CountOf Word8
|
||||
skipNextHeaderValue !x
|
||||
| x < 0xC0 = CountOf 1 -- 0b11000000
|
||||
| x < 0xE0 = CountOf 2 -- 0b11100000
|
||||
| x < 0xF0 = CountOf 3 -- 0b11110000
|
||||
| otherwise = CountOf 4
|
||||
{-# INLINE skipNextHeaderValue #-}
|
||||
|
||||
headerIsAscii :: StepASCII -> Bool
|
||||
headerIsAscii (StepASCII x) = x < 0x80
|
||||
|
||||
charToBytes :: Int -> CountOf Word8
|
||||
charToBytes c
|
||||
| c < 0x80 = CountOf 1
|
||||
| c < 0x800 = CountOf 2
|
||||
| c < 0x10000 = CountOf 3
|
||||
| c < 0x110000 = CountOf 4
|
||||
| otherwise = error ("invalid code point: " `mappend` show c)
|
||||
|
||||
-- | Encode a Char into a CharUTF8
|
||||
encodeCharUTF8 :: Char -> CharUTF8
|
||||
encodeCharUTF8 !(C# c)
|
||||
| bool# (ltWord# x 0x80## ) = CharUTF8 (W32# (wordToWord32# x))
|
||||
| bool# (ltWord# x 0x800## ) = CharUTF8 (W32# (wordToWord32# encode2))
|
||||
| bool# (ltWord# x 0x10000##) = CharUTF8 (W32# (wordToWord32# encode3))
|
||||
| otherwise = CharUTF8 (W32# (wordToWord32# encode4))
|
||||
where
|
||||
!x = int2Word# (ord# c)
|
||||
|
||||
-- clearing mask, clearing all the bits that need to be clear as per the UTF8 encoding
|
||||
mask2 = 0x0000bfdf## -- 1 continuation , 5 bits header
|
||||
mask3 = 0x00bfbfef## -- 2 continuations, 4 bits header
|
||||
mask4 = 0xbfbfbff7## -- 3 continuations, 3 bits header
|
||||
|
||||
-- setting mask, settings all the bits that need to be set per the UTF8 encoding
|
||||
set2 = 0x000080c0## -- 10xxxxxx 110xxxxx
|
||||
set3 = 0x008080e0## -- 10xxxxxx * 2 1110xxxx
|
||||
set4 = 0x808080f0## -- 10xxxxxx * 3 11111xxx
|
||||
|
||||
encode2 = and# mask2 (or3# set2
|
||||
(uncheckedShiftRL# x 6#) -- 5 bits to 1st byte
|
||||
(uncheckedShiftL# x 8# ) -- move lowest bits to the 2nd byte
|
||||
)
|
||||
encode3 = and# mask3 (or4# set3
|
||||
(uncheckedShiftRL# x 12#) -- 4 bits to 1st byte
|
||||
(and# 0x3f00## (uncheckedShiftL# x 2#)) -- 6 bits to the 2nd byte
|
||||
(uncheckedShiftL# x 16# ) -- move lowest bits to the 3rd byte
|
||||
)
|
||||
encode4 = and# mask4 (or4# set4
|
||||
(uncheckedShiftRL# x 18#) -- 3 bits to 1st byte
|
||||
(or# (and# 0x3f00## (uncheckedShiftRL# x 4#)) -- 6 bits to the 2nd byte
|
||||
(and# 0x3f0000## (uncheckedShiftL# x 10#)) -- 6 bits to the 3nd byte
|
||||
)
|
||||
(uncheckedShiftL# x 24# ) -- move lowest bits to the 4rd byte
|
||||
)
|
||||
|
||||
-- | decode a CharUTF8 into a Char
|
||||
--
|
||||
-- If the value inside a CharUTF8 is not properly encoded, this will result in violation
|
||||
-- of the Char invariants
|
||||
decodeCharUTF8 :: CharUTF8 -> Char
|
||||
decodeCharUTF8 c@(CharUTF8 !(W32# w_))
|
||||
| isCharUTF8Case1 c = toChar# w
|
||||
| isCharUTF8Case2 c = encode2
|
||||
| isCharUTF8Case3 c = encode3
|
||||
| otherwise = encode4
|
||||
where
|
||||
w = word32ToWord# w_
|
||||
encode2 =
|
||||
toChar# (or# (uncheckedShiftL# (maskHeader2# w) 6#)
|
||||
(maskContinuation# (uncheckedShiftRL# w 8#))
|
||||
)
|
||||
encode3 =
|
||||
toChar# (or3# (uncheckedShiftL# (maskHeader3# w) 12#)
|
||||
(uncheckedShiftRL# (and# 0x3f00## w) 8#)
|
||||
(maskContinuation# (uncheckedShiftRL# w 16#))
|
||||
)
|
||||
encode4 =
|
||||
toChar# (or4# (uncheckedShiftL# (maskHeader4# w) 18#)
|
||||
(uncheckedShiftRL# (and# 0x3f00## w) 10#)
|
||||
(uncheckedShiftL# (and# 0x3f0000## w) 4#)
|
||||
(maskContinuation# (uncheckedShiftRL# w 24#))
|
||||
)
|
||||
|
||||
-- clearing mask, removing all UTF8 metadata and keeping only signal (content)
|
||||
--maskContent2 = 0x00003f1f## -- 1 continuation , 5 bits header
|
||||
--maskContent3 = 0x003f3f0f## -- 2 continuations, 4 bits header
|
||||
--maskContent4 = 0x3f3f3f07## -- 3 continuations, 3 bits header
|
||||
|
||||
isCharUTF8Case1 :: CharUTF8 -> Bool
|
||||
isCharUTF8Case1 (CharUTF8 !w) = (w .&. 0x80) == 0
|
||||
{-# INLINE isCharUTF8Case1 #-}
|
||||
|
||||
isCharUTF8Case2 :: CharUTF8 -> Bool
|
||||
isCharUTF8Case2 (CharUTF8 !w) = (w .&. 0x20) == 0
|
||||
{-# INLINE isCharUTF8Case2 #-}
|
||||
|
||||
isCharUTF8Case3 :: CharUTF8 -> Bool
|
||||
isCharUTF8Case3 (CharUTF8 !w) = (w .&. 0x10) == 0
|
||||
{-# INLINE isCharUTF8Case3 #-}
|
||||
|
||||
isCharUTF8Case4 :: CharUTF8 -> Bool
|
||||
isCharUTF8Case4 (CharUTF8 !w) = (w .&. 0x08) == 0
|
||||
{-# INLINE isCharUTF8Case4 #-}
|
||||
121
bundled/Basement/UTF8/Table.hs
Normal file
121
bundled/Basement/UTF8/Table.hs
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE RebindableSyntax #-}
|
||||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
-- |
|
||||
-- Module : Basement.UTF8.Table
|
||||
-- License : BSD-style
|
||||
-- Maintainer : Vincent Hanquez <vincent@snarc.org>
|
||||
-- Stability : experimental
|
||||
-- Portability : portable
|
||||
--
|
||||
-- UTF8 lookup tables for fast continuation & nb bytes per header queries
|
||||
{-# LANGUAGE MagicHash #-}
|
||||
module Basement.UTF8.Table
|
||||
( isContinuation
|
||||
, isContinuation2
|
||||
, isContinuation3
|
||||
, getNbBytes
|
||||
, isContinuation#
|
||||
, isContinuationW#
|
||||
, getNbBytes#
|
||||
) where
|
||||
|
||||
import GHC.Prim (Word#, Int#, Addr#, indexWord8OffAddr#, word2Int#)
|
||||
import GHC.Types
|
||||
import GHC.Word
|
||||
import Basement.Compat.Base
|
||||
import Basement.Compat.Primitive
|
||||
import Basement.Bits
|
||||
import Basement.UTF8.Types (StepASCII(..))
|
||||
|
||||
-- | Check if the byte is a continuation byte
|
||||
isContinuation :: Word8 -> Bool
|
||||
isContinuation (W8# w) = isContinuation# w
|
||||
{-# INLINE isContinuation #-}
|
||||
|
||||
isContinuation2 :: Word8 -> Word8 -> Bool
|
||||
isContinuation2 !w1 !w2 = mask w1 && mask w2
|
||||
where
|
||||
mask v = (v .&. 0xC0) == 0x80
|
||||
{-# INLINE isContinuation2 #-}
|
||||
|
||||
isContinuation3 :: Word8 -> Word8 -> Word8 -> Bool
|
||||
isContinuation3 !w1 !w2 !w3 =
|
||||
mask w1 && mask w2 && mask w3
|
||||
where
|
||||
mask v = (v .&. 0xC0) == 0x80
|
||||
{-# INLINE isContinuation3 #-}
|
||||
|
||||
-- | Number of bytes associated with a specific header byte
|
||||
--
|
||||
-- If the header byte is invalid then NbBytesInvalid is returned,
|
||||
data NbBytesCont = NbBytesInvalid | NbBytesCont0 | NbBytesCont1 | NbBytesCont2 | NbBytesCont3
|
||||
|
||||
-- | Identical to 'NbBytesCont' but doesn't allow to represent any failure.
|
||||
--
|
||||
-- Only use in validated place
|
||||
data NbBytesCont_ = NbBytesCont0_ | NbBytesCont1_ | NbBytesCont2_ | NbBytesCont3_
|
||||
|
||||
-- | Get the number of following bytes given the first byte of a UTF8 sequence.
|
||||
getNbBytes :: StepASCII -> Int
|
||||
getNbBytes (StepASCII (W8# w)) = I# (getNbBytes# w)
|
||||
{-# INLINE getNbBytes #-}
|
||||
|
||||
-- | Check if the byte is a continuation byte
|
||||
isContinuation# :: Word8# -> Bool
|
||||
isContinuation# w = W8# (indexWord8OffAddr# (unTable contTable) (word2Int# (word8ToWord# w))) == 0
|
||||
{-# INLINE isContinuation# #-}
|
||||
|
||||
-- | Check if the byte is a continuation byte
|
||||
isContinuationW# :: Word# -> Bool
|
||||
isContinuationW# w = W8# (indexWord8OffAddr# (unTable contTable) (word2Int# w)) == 0
|
||||
{-# INLINE isContinuationW# #-}
|
||||
|
||||
-- | Get the number of following bytes given the first byte of a UTF8 sequence.
|
||||
getNbBytes# :: Word8# -> Int#
|
||||
getNbBytes# w = word8ToInt# (indexWord8OffAddr# (unTable headTable) (word2Int# (word8ToWord# w)))
|
||||
{-# INLINE getNbBytes# #-}
|
||||
|
||||
data Table = Table { unTable :: !Addr# }
|
||||
|
||||
contTable :: Table
|
||||
contTable = Table
|
||||
"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"#
|
||||
{-# NOINLINE contTable #-}
|
||||
|
||||
headTable :: Table
|
||||
headTable = Table
|
||||
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
|
||||
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
|
||||
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
|
||||
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
|
||||
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\
|
||||
\\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\
|
||||
\\x03\x03\x03\x03\x03\x03\x03\x03\xff\xff\xff\xff\xff\xff\xff\xff"#
|
||||
{-# NOINLINE headTable #-}
|
||||
73
bundled/Basement/UTF8/Types.hs
Normal file
73
bundled/Basement/UTF8/Types.hs
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE RebindableSyntax #-}
|
||||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
module Basement.UTF8.Types
|
||||
(
|
||||
-- * Stepper
|
||||
Step(..)
|
||||
, StepBack(..)
|
||||
, StepASCII(..)
|
||||
, StepDigit(..)
|
||||
, isValidStepASCII
|
||||
, isValidStepDigit
|
||||
-- * Unicode Errors
|
||||
, ValidationFailure(..)
|
||||
-- * UTF8 Encoded 'Char'
|
||||
, CharUTF8(..)
|
||||
-- * Case Conversion
|
||||
, CM (..)
|
||||
) where
|
||||
|
||||
import Basement.Compat.Base
|
||||
import Basement.Types.OffsetSize
|
||||
|
||||
-- | Step when walking a String
|
||||
--
|
||||
-- this is a return value composed of :
|
||||
-- * the unicode code point read (Char) which need to be
|
||||
-- between 0 and 0x10ffff (inclusive)
|
||||
-- * The next offset to start reading the next unicode code point (or end)
|
||||
data Step = Step {-# UNPACK #-} !Char {-# UNPACK #-} !(Offset Word8)
|
||||
|
||||
-- | Similar to Step but used when processing the string from the end.
|
||||
--
|
||||
-- The stepper is thus the previous character, and the offset of
|
||||
-- the beginning of the previous character
|
||||
data StepBack = StepBack {-# UNPACK #-} !Char {-# UNPACK #-} !(Offset Word8)
|
||||
|
||||
-- | Step when processing digits. the value is between 0 and 9 to be valid
|
||||
newtype StepDigit = StepDigit Word8
|
||||
|
||||
-- | Step when processing ASCII character
|
||||
newtype StepASCII = StepASCII { stepAsciiRawValue :: Word8 }
|
||||
|
||||
-- | Specialized tuple used for case mapping.
|
||||
data CM = CM {-# UNPACK #-} !Char {-# UNPACK #-} !Char {-# UNPACK #-} !Char deriving (Eq)
|
||||
|
||||
-- | Represent an already encoded UTF8 Char where the the lowest 8 bits is the start of the
|
||||
-- sequence. If this contains a multi bytes sequence then each higher 8 bits are filled with
|
||||
-- the remaining sequence 8 bits per 8 bits.
|
||||
--
|
||||
-- For example:
|
||||
-- 'A' => U+0041 => 41 => 0x00000041
|
||||
-- '€ => U+20AC => E2 82 AC => 0x00AC82E2
|
||||
-- '𐍈' => U+10348 => F0 90 8D 88 => 0x888D90F0
|
||||
--
|
||||
newtype CharUTF8 = CharUTF8 Word32
|
||||
|
||||
isValidStepASCII :: StepASCII -> Bool
|
||||
isValidStepASCII (StepASCII w) = w < 0x80
|
||||
|
||||
isValidStepDigit :: StepDigit -> Bool
|
||||
isValidStepDigit (StepDigit w) = w < 0xa
|
||||
|
||||
-- | Possible failure related to validating bytes of UTF8 sequences.
|
||||
data ValidationFailure = InvalidHeader
|
||||
| InvalidContinuation
|
||||
| MissingByte
|
||||
| BuildingFailure
|
||||
deriving (Show,Eq,Typeable)
|
||||
|
||||
instance Exception ValidationFailure
|
||||
Loading…
Add table
Add a link
Reference in a new issue