Initial commit
This commit is contained in:
commit
c101616e62
309 changed files with 53937 additions and 0 deletions
91
bundled/Basement/String/Encoding/ASCII7.hs
Normal file
91
bundled/Basement/String/Encoding/ASCII7.hs
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE RebindableSyntax #-}
|
||||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
-- |
|
||||
-- Module : Basement.String.Encoding.ASCII7
|
||||
-- License : BSD-style
|
||||
-- Maintainer : Foundation
|
||||
-- Stability : experimental
|
||||
-- Portability : portable
|
||||
--
|
||||
|
||||
{-# LANGUAGE MagicHash #-}
|
||||
|
||||
module Basement.String.Encoding.ASCII7
|
||||
( ASCII7(..)
|
||||
, ASCII7_Invalid(..)
|
||||
) where
|
||||
|
||||
import Basement.Compat.Base
|
||||
import Basement.Compat.Primitive
|
||||
import Basement.Types.OffsetSize
|
||||
import Basement.Numerical.Additive
|
||||
import Basement.Monad
|
||||
import Basement.Bits
|
||||
|
||||
import GHC.Prim (int2Word#, ord#)
|
||||
import GHC.Word
|
||||
import GHC.Types
|
||||
import Basement.UArray
|
||||
import Basement.UArray.Mutable (MUArray)
|
||||
import Basement.MutableBuilder
|
||||
|
||||
import Basement.String.Encoding.Encoding
|
||||
|
||||
-- | validate a given byte is within ASCII characters encoring size
|
||||
--
|
||||
-- This function check the 8th bit is set to 0
|
||||
--
|
||||
isAscii :: Word8 -> Bool
|
||||
isAscii w = (w .&. 0x80) == 0
|
||||
{-# INLINE isAscii #-}
|
||||
|
||||
data ASCII7_Invalid
|
||||
= ByteOutOfBound Word8
|
||||
| CharNotAscii Char
|
||||
deriving (Typeable, Show, Eq)
|
||||
instance Exception ASCII7_Invalid
|
||||
|
||||
data ASCII7 = ASCII7
|
||||
|
||||
instance Encoding ASCII7 where
|
||||
type Unit ASCII7 = Word8
|
||||
type Error ASCII7 = ASCII7_Invalid
|
||||
encodingNext _ = next
|
||||
encodingWrite _ = write
|
||||
|
||||
-- | consume an Ascii7 char and return the Unicode point and the position
|
||||
-- of the next possible Ascii7 char
|
||||
--
|
||||
next :: (Offset Word8 -> Word8)
|
||||
-- ^ method to access a given byte
|
||||
-> Offset Word8
|
||||
-- ^ index of the byte
|
||||
-> Either ASCII7_Invalid (Char, Offset Word8)
|
||||
-- ^ either successfully validated the ASCII char and returned the
|
||||
-- next index or fail with an error
|
||||
next getter off
|
||||
| isAscii w8 = Right (toChar w, off + 1)
|
||||
| otherwise = Left $ ByteOutOfBound w8
|
||||
where
|
||||
!w8@(W8# w) = getter off
|
||||
toChar :: Word8# -> Char
|
||||
toChar a = C# (word8ToChar# w)
|
||||
|
||||
-- Write ascii char
|
||||
--
|
||||
-- > build 64 $ sequence_ write "this is a simple list of char..."
|
||||
--
|
||||
write :: (PrimMonad st, Monad st)
|
||||
=> Char
|
||||
-- ^ expecting it to be a valid Ascii character.
|
||||
-- otherwise this function will throw an exception
|
||||
-> Builder (UArray Word8) (MUArray Word8) Word8 st err ()
|
||||
write c
|
||||
| c < toEnum 0x80 = builderAppend $ w8 c
|
||||
| otherwise = throw $ CharNotAscii c
|
||||
where
|
||||
w8 :: Char -> Word8
|
||||
w8 (C# ch) = W8# (wordToWord8# (int2Word# (ord# ch)))
|
||||
107
bundled/Basement/String/Encoding/Encoding.hs
Normal file
107
bundled/Basement/String/Encoding/Encoding.hs
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE RebindableSyntax #-}
|
||||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
-- |
|
||||
-- Module : Basement.String.Encoding.Encoding
|
||||
-- License : BSD-style
|
||||
-- Maintainer : Foundation
|
||||
-- Stability : experimental
|
||||
-- Portability : portable
|
||||
--
|
||||
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
|
||||
module Basement.String.Encoding.Encoding
|
||||
( Encoding(..)
|
||||
, convertFromTo
|
||||
) where
|
||||
|
||||
import Basement.Compat.Base
|
||||
import Basement.Types.OffsetSize
|
||||
import Basement.Monad
|
||||
import Basement.PrimType
|
||||
import Basement.MutableBuilder
|
||||
import Basement.Numerical.Additive
|
||||
import Basement.UArray (UArray)
|
||||
import Basement.UArray.Mutable (MUArray)
|
||||
import qualified Basement.UArray as Vec
|
||||
|
||||
class Encoding encoding where
|
||||
-- | the unit element use for the encoding.
|
||||
-- i.e. Word8 for ASCII7 or UTF8, Word16 for UTF16...
|
||||
--
|
||||
type Unit encoding
|
||||
|
||||
-- | define the type of error handling you want to use for the
|
||||
-- next function.
|
||||
--
|
||||
-- > type Error UTF8 = Either UTF8_Invalid
|
||||
--
|
||||
type Error encoding
|
||||
|
||||
-- | consume an `Unit encoding` and return the Unicode point and the position
|
||||
-- of the next possible `Unit encoding`
|
||||
--
|
||||
encodingNext :: encoding
|
||||
-- ^ only used for type deduction
|
||||
-> (Offset (Unit encoding) -> Unit encoding)
|
||||
-- ^ method to access a given `Unit encoding`
|
||||
-- (see `unsafeIndexer`)
|
||||
-> Offset (Unit encoding)
|
||||
-- ^ offset of the `Unit encoding` where starts the
|
||||
-- encoding of a given unicode
|
||||
-> Either (Error encoding) (Char, Offset (Unit encoding)) -- ^ either successfully validated the `Unit encoding`
|
||||
-- and returned the next offset or fail with an
|
||||
-- `Error encoding`
|
||||
|
||||
-- Write a unicode point encoded into one or multiple `Unit encoding`
|
||||
--
|
||||
-- > build 64 $ sequence_ (write UTF8) "this is a simple list of char..."
|
||||
--
|
||||
encodingWrite :: (PrimMonad st, Monad st)
|
||||
=> encoding
|
||||
-- ^ only used for type deduction
|
||||
-> Char
|
||||
-- ^ the unicode character to encode
|
||||
-> Builder (UArray (Unit encoding))
|
||||
(MUArray (Unit encoding))
|
||||
(Unit encoding) st err ()
|
||||
|
||||
-- | helper to convert a given Array in a given encoding into an array
|
||||
-- with another encoding.
|
||||
--
|
||||
-- This is a helper to convert from one String encoding to another.
|
||||
-- This function is (quite) slow and needs some work.
|
||||
--
|
||||
-- ```
|
||||
-- let s16 = ... -- string in UTF16
|
||||
-- -- create s8, a UTF8 String
|
||||
-- let s8 = runST $ convertWith UTF16 UTF8 (toBytes s16)
|
||||
--
|
||||
-- print s8
|
||||
-- ```
|
||||
--
|
||||
convertFromTo :: ( PrimMonad st, Monad st
|
||||
, Encoding input, PrimType (Unit input)
|
||||
, Encoding output, PrimType (Unit output)
|
||||
)
|
||||
=> input
|
||||
-- ^ Input's encoding type
|
||||
-> output
|
||||
-- ^ Output's encoding type
|
||||
-> UArray (Unit input)
|
||||
-- ^ the input raw array
|
||||
-> st (Either (Offset (Unit input), Error input) (UArray (Unit output)))
|
||||
convertFromTo inputEncodingTy outputEncodingTy bytes
|
||||
| Vec.null bytes = return . return $ mempty
|
||||
| otherwise = Vec.unsafeIndexer bytes $ \t -> Vec.builderBuild 64 (loop azero t)
|
||||
where
|
||||
lastUnit = Vec.length bytes
|
||||
|
||||
loop off getter
|
||||
| off .==# lastUnit = return ()
|
||||
| otherwise = case encodingNext inputEncodingTy getter off of
|
||||
Left err -> mFail (off, err)
|
||||
Right (c, noff) -> encodingWrite outputEncodingTy c >> loop noff getter
|
||||
70
bundled/Basement/String/Encoding/ISO_8859_1.hs
Normal file
70
bundled/Basement/String/Encoding/ISO_8859_1.hs
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE RebindableSyntax #-}
|
||||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
-- |
|
||||
-- Module : Basement.String.Encoding.ISO_8859_1
|
||||
-- License : BSD-style
|
||||
-- Maintainer : Foundation
|
||||
-- Stability : experimental
|
||||
-- Portability : portable
|
||||
--
|
||||
|
||||
{-# LANGUAGE MagicHash #-}
|
||||
|
||||
module Basement.String.Encoding.ISO_8859_1
|
||||
( ISO_8859_1(..)
|
||||
, ISO_8859_1_Invalid(..)
|
||||
) where
|
||||
|
||||
import Basement.Compat.Base
|
||||
import Basement.Compat.Primitive
|
||||
import Basement.Types.OffsetSize
|
||||
import Basement.Numerical.Additive
|
||||
import Basement.Monad
|
||||
|
||||
import GHC.Prim (int2Word#, ord#)
|
||||
import GHC.Word
|
||||
import GHC.Types
|
||||
import Basement.UArray
|
||||
import Basement.UArray.Mutable (MUArray)
|
||||
import Basement.MutableBuilder
|
||||
|
||||
import Basement.String.Encoding.Encoding
|
||||
|
||||
-- offset of size one
|
||||
aone :: Offset Word8
|
||||
aone = Offset 1
|
||||
|
||||
data ISO_8859_1_Invalid
|
||||
= NotISO_8859_1 Char
|
||||
deriving (Typeable, Show, Eq)
|
||||
instance Exception ISO_8859_1_Invalid
|
||||
|
||||
data ISO_8859_1 = ISO_8859_1
|
||||
|
||||
instance Encoding ISO_8859_1 where
|
||||
type Unit ISO_8859_1 = Word8
|
||||
type Error ISO_8859_1 = ISO_8859_1_Invalid
|
||||
encodingNext _ = next
|
||||
encodingWrite _ = write
|
||||
|
||||
next :: (Offset Word8 -> Word8)
|
||||
-> Offset Word8
|
||||
-> Either ISO_8859_1_Invalid (Char, Offset Word8)
|
||||
next getter off = Right (toChar w, off + aone)
|
||||
where
|
||||
!(W8# w) = getter off
|
||||
toChar :: Word8# -> Char
|
||||
toChar a = C# (word8ToChar# w)
|
||||
|
||||
write :: (PrimMonad st, Monad st)
|
||||
=> Char
|
||||
-> Builder (UArray Word8) (MUArray Word8) Word8 st err ()
|
||||
write c@(C# ch)
|
||||
| c <= toEnum 0xFF = builderAppend (W8# x)
|
||||
| otherwise = throw $ NotISO_8859_1 c
|
||||
where
|
||||
x :: Word8#
|
||||
!x = wordToWord8# (int2Word# (ord# ch))
|
||||
106
bundled/Basement/String/Encoding/UTF16.hs
Normal file
106
bundled/Basement/String/Encoding/UTF16.hs
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE RebindableSyntax #-}
|
||||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
-- |
|
||||
-- Module : Basement.String.Encoding.UTF16
|
||||
-- License : BSD-style
|
||||
-- Maintainer : Foundation
|
||||
-- Stability : experimental
|
||||
-- Portability : portable
|
||||
--
|
||||
{-# LANGUAGE MagicHash #-}
|
||||
module Basement.String.Encoding.UTF16
|
||||
( UTF16(..)
|
||||
, UTF16_Invalid(..)
|
||||
) where
|
||||
|
||||
import GHC.Prim
|
||||
import GHC.Word
|
||||
import GHC.Types
|
||||
import qualified Prelude
|
||||
import Basement.Compat.Base
|
||||
import Basement.Compat.Primitive
|
||||
import Basement.IntegralConv
|
||||
import Basement.Bits
|
||||
import Basement.Types.OffsetSize
|
||||
import Basement.Monad
|
||||
import Basement.Numerical.Additive
|
||||
import Basement.Numerical.Subtractive
|
||||
import Basement.UArray
|
||||
import Basement.UArray.Mutable (MUArray)
|
||||
import Basement.MutableBuilder
|
||||
|
||||
import Basement.String.Encoding.Encoding
|
||||
|
||||
data UTF16_Invalid
|
||||
= InvalidContinuation
|
||||
| InvalidUnicode Char
|
||||
deriving (Show, Eq, Typeable)
|
||||
instance Exception UTF16_Invalid
|
||||
|
||||
data UTF16 = UTF16
|
||||
|
||||
instance Encoding UTF16 where
|
||||
type Unit UTF16 = Word16
|
||||
type Error UTF16 = UTF16_Invalid
|
||||
encodingNext _ = next
|
||||
encodingWrite _ = write
|
||||
|
||||
|
||||
--
|
||||
-- U+0000 to U+D7FF and U+E000 to U+FFFF : 1 bytes
|
||||
-- U+10000 to U+10FFFF :
|
||||
-- * 0x010000 is subtracted from the code point, leaving a 20-bit number in the range 0..0x0FFFFF.
|
||||
-- * The top ten bits (a number in the range 0..0x03FF) are added to 0xD800 to give the first 16-bit code unit
|
||||
-- or high surrogate, which will be in the range 0xD800..0xDBFF.
|
||||
-- * The low ten bits (also in the range 0..0x03FF) are added to 0xDC00 to give the second 16-bit code unit
|
||||
-- or low surrogate, which will be in the range 0xDC00..0xDFFF.
|
||||
|
||||
next :: (Offset Word16 -> Word16)
|
||||
-> Offset Word16
|
||||
-> Either UTF16_Invalid (Char, Offset Word16)
|
||||
next getter off
|
||||
| h < 0xd800 = Right (toChar16 h, off + Offset 1)
|
||||
| h >= 0xe000 = Right (toChar16 h, off + Offset 1)
|
||||
| otherwise = nextContinuation
|
||||
where
|
||||
h :: Word16
|
||||
!h = getter off
|
||||
|
||||
to32 :: Word16 -> Word32
|
||||
to32 (W16# w) = W32# (word16ToWord32# w)
|
||||
|
||||
toChar16 :: Word16 -> Char
|
||||
toChar16 (W16# w) = C# (word32ToChar# (word16ToWord32# w))
|
||||
|
||||
nextContinuation
|
||||
| cont >= 0xdc00 && cont < 0xe00 =
|
||||
let !(W32# w) = ((to32 h .&. 0x3ff) .<<. 10) .|. (to32 cont .&. 0x3ff)
|
||||
in Right (C# (word32ToChar# w), off + Offset 2)
|
||||
| otherwise = Left InvalidContinuation
|
||||
where
|
||||
cont :: Word16
|
||||
!cont = getter $ off + Offset 1
|
||||
|
||||
write :: (PrimMonad st, Monad st)
|
||||
=> Char
|
||||
-> Builder (UArray Word16) (MUArray Word16) Word16 st err ()
|
||||
write c
|
||||
| c < toEnum 0xd800 = builderAppend $ w16 c
|
||||
| c > toEnum 0x10000 = let (w1, w2) = wHigh c in builderAppend w1 >> builderAppend w2
|
||||
| c > toEnum 0x10ffff = throw $ InvalidUnicode c
|
||||
| c >= toEnum 0xe000 = builderAppend $ w16 c
|
||||
| otherwise = throw $ InvalidUnicode c
|
||||
where
|
||||
w16 :: Char -> Word16
|
||||
w16 (C# ch) = W16# (wordToWord16# (int2Word# (ord# ch)))
|
||||
|
||||
to16 :: Word32 -> Word16
|
||||
to16 = Prelude.fromIntegral
|
||||
|
||||
wHigh :: Char -> (Word16, Word16)
|
||||
wHigh (C# ch) =
|
||||
let v = W32# (charToWord32# ch) - 0x10000
|
||||
in (0xdc00 .|. to16 (v .>>. 10), 0xd800 .|. to16 (v .&. 0x3ff))
|
||||
61
bundled/Basement/String/Encoding/UTF32.hs
Normal file
61
bundled/Basement/String/Encoding/UTF32.hs
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
{-# LANGUAGE TypeFamilies #-}
|
||||
{-# LANGUAGE RebindableSyntax #-}
|
||||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
-- |
|
||||
-- Module : Basement.String.Encoding.UTF32
|
||||
-- License : BSD-style
|
||||
-- Maintainer : Foundation
|
||||
-- Stability : experimental
|
||||
-- Portability : portable
|
||||
--
|
||||
{-# LANGUAGE MagicHash #-}
|
||||
module Basement.String.Encoding.UTF32
|
||||
( UTF32(..)
|
||||
, UTF32_Invalid
|
||||
) where
|
||||
|
||||
import GHC.Prim
|
||||
import GHC.Word
|
||||
import GHC.Types
|
||||
import Basement.Compat.Base
|
||||
import Basement.Compat.Primitive
|
||||
import Basement.Types.OffsetSize
|
||||
import Basement.Monad
|
||||
import Basement.Numerical.Additive
|
||||
import Basement.UArray
|
||||
import Basement.UArray.Mutable (MUArray)
|
||||
import Basement.MutableBuilder
|
||||
|
||||
import Basement.String.Encoding.Encoding
|
||||
|
||||
data UTF32 = UTF32
|
||||
|
||||
data UTF32_Invalid = UTF32_Invalid
|
||||
deriving (Typeable, Show, Eq, Ord, Enum, Bounded)
|
||||
instance Exception UTF32_Invalid
|
||||
|
||||
instance Encoding UTF32 where
|
||||
type Unit UTF32 = Word32
|
||||
type Error UTF32 = UTF32_Invalid
|
||||
encodingNext _ = next
|
||||
encodingWrite _ = write
|
||||
|
||||
next :: (Offset Word32 -> Word32)
|
||||
-> Offset Word32
|
||||
-> Either UTF32_Invalid (Char, Offset Word32)
|
||||
next getter off = Right (char, off + Offset 1)
|
||||
where
|
||||
!(W32# hh) = getter off
|
||||
char :: Char
|
||||
char = C# (word32ToChar# hh)
|
||||
|
||||
write :: (PrimMonad st, Monad st)
|
||||
=> Char
|
||||
-> Builder (UArray Word32) (MUArray Word32) Word32 st err ()
|
||||
write c = builderAppend w32
|
||||
where
|
||||
!(C# ch) = c
|
||||
w32 :: Word32
|
||||
w32 = W32# (charToWord32# ch)
|
||||
Loading…
Add table
Add a link
Reference in a new issue