Initial commit

This commit is contained in:
La Ancapo 2026-01-25 02:27:22 +01:00
commit c101616e62
309 changed files with 53937 additions and 0 deletions

View file

@ -0,0 +1,91 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RebindableSyntax #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- |
-- Module : Basement.String.Encoding.ASCII7
-- License : BSD-style
-- Maintainer : Foundation
-- Stability : experimental
-- Portability : portable
--
{-# LANGUAGE MagicHash #-}
module Basement.String.Encoding.ASCII7
( ASCII7(..)
, ASCII7_Invalid(..)
) where
import Basement.Compat.Base
import Basement.Compat.Primitive
import Basement.Types.OffsetSize
import Basement.Numerical.Additive
import Basement.Monad
import Basement.Bits
import GHC.Prim (int2Word#, ord#)
import GHC.Word
import GHC.Types
import Basement.UArray
import Basement.UArray.Mutable (MUArray)
import Basement.MutableBuilder
import Basement.String.Encoding.Encoding
-- | validate a given byte is within ASCII characters encoring size
--
-- This function check the 8th bit is set to 0
--
isAscii :: Word8 -> Bool
isAscii w = (w .&. 0x80) == 0
{-# INLINE isAscii #-}
data ASCII7_Invalid
= ByteOutOfBound Word8
| CharNotAscii Char
deriving (Typeable, Show, Eq)
instance Exception ASCII7_Invalid
data ASCII7 = ASCII7
instance Encoding ASCII7 where
type Unit ASCII7 = Word8
type Error ASCII7 = ASCII7_Invalid
encodingNext _ = next
encodingWrite _ = write
-- | consume an Ascii7 char and return the Unicode point and the position
-- of the next possible Ascii7 char
--
next :: (Offset Word8 -> Word8)
-- ^ method to access a given byte
-> Offset Word8
-- ^ index of the byte
-> Either ASCII7_Invalid (Char, Offset Word8)
-- ^ either successfully validated the ASCII char and returned the
-- next index or fail with an error
next getter off
| isAscii w8 = Right (toChar w, off + 1)
| otherwise = Left $ ByteOutOfBound w8
where
!w8@(W8# w) = getter off
toChar :: Word8# -> Char
toChar a = C# (word8ToChar# w)
-- Write ascii char
--
-- > build 64 $ sequence_ write "this is a simple list of char..."
--
write :: (PrimMonad st, Monad st)
=> Char
-- ^ expecting it to be a valid Ascii character.
-- otherwise this function will throw an exception
-> Builder (UArray Word8) (MUArray Word8) Word8 st err ()
write c
| c < toEnum 0x80 = builderAppend $ w8 c
| otherwise = throw $ CharNotAscii c
where
w8 :: Char -> Word8
w8 (C# ch) = W8# (wordToWord8# (int2Word# (ord# ch)))

View file

@ -0,0 +1,107 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RebindableSyntax #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- |
-- Module : Basement.String.Encoding.Encoding
-- License : BSD-style
-- Maintainer : Foundation
-- Stability : experimental
-- Portability : portable
--
{-# LANGUAGE FlexibleContexts #-}
module Basement.String.Encoding.Encoding
( Encoding(..)
, convertFromTo
) where
import Basement.Compat.Base
import Basement.Types.OffsetSize
import Basement.Monad
import Basement.PrimType
import Basement.MutableBuilder
import Basement.Numerical.Additive
import Basement.UArray (UArray)
import Basement.UArray.Mutable (MUArray)
import qualified Basement.UArray as Vec
class Encoding encoding where
-- | the unit element use for the encoding.
-- i.e. Word8 for ASCII7 or UTF8, Word16 for UTF16...
--
type Unit encoding
-- | define the type of error handling you want to use for the
-- next function.
--
-- > type Error UTF8 = Either UTF8_Invalid
--
type Error encoding
-- | consume an `Unit encoding` and return the Unicode point and the position
-- of the next possible `Unit encoding`
--
encodingNext :: encoding
-- ^ only used for type deduction
-> (Offset (Unit encoding) -> Unit encoding)
-- ^ method to access a given `Unit encoding`
-- (see `unsafeIndexer`)
-> Offset (Unit encoding)
-- ^ offset of the `Unit encoding` where starts the
-- encoding of a given unicode
-> Either (Error encoding) (Char, Offset (Unit encoding)) -- ^ either successfully validated the `Unit encoding`
-- and returned the next offset or fail with an
-- `Error encoding`
-- Write a unicode point encoded into one or multiple `Unit encoding`
--
-- > build 64 $ sequence_ (write UTF8) "this is a simple list of char..."
--
encodingWrite :: (PrimMonad st, Monad st)
=> encoding
-- ^ only used for type deduction
-> Char
-- ^ the unicode character to encode
-> Builder (UArray (Unit encoding))
(MUArray (Unit encoding))
(Unit encoding) st err ()
-- | helper to convert a given Array in a given encoding into an array
-- with another encoding.
--
-- This is a helper to convert from one String encoding to another.
-- This function is (quite) slow and needs some work.
--
-- ```
-- let s16 = ... -- string in UTF16
-- -- create s8, a UTF8 String
-- let s8 = runST $ convertWith UTF16 UTF8 (toBytes s16)
--
-- print s8
-- ```
--
convertFromTo :: ( PrimMonad st, Monad st
, Encoding input, PrimType (Unit input)
, Encoding output, PrimType (Unit output)
)
=> input
-- ^ Input's encoding type
-> output
-- ^ Output's encoding type
-> UArray (Unit input)
-- ^ the input raw array
-> st (Either (Offset (Unit input), Error input) (UArray (Unit output)))
convertFromTo inputEncodingTy outputEncodingTy bytes
| Vec.null bytes = return . return $ mempty
| otherwise = Vec.unsafeIndexer bytes $ \t -> Vec.builderBuild 64 (loop azero t)
where
lastUnit = Vec.length bytes
loop off getter
| off .==# lastUnit = return ()
| otherwise = case encodingNext inputEncodingTy getter off of
Left err -> mFail (off, err)
Right (c, noff) -> encodingWrite outputEncodingTy c >> loop noff getter

View file

@ -0,0 +1,70 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RebindableSyntax #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- |
-- Module : Basement.String.Encoding.ISO_8859_1
-- License : BSD-style
-- Maintainer : Foundation
-- Stability : experimental
-- Portability : portable
--
{-# LANGUAGE MagicHash #-}
module Basement.String.Encoding.ISO_8859_1
( ISO_8859_1(..)
, ISO_8859_1_Invalid(..)
) where
import Basement.Compat.Base
import Basement.Compat.Primitive
import Basement.Types.OffsetSize
import Basement.Numerical.Additive
import Basement.Monad
import GHC.Prim (int2Word#, ord#)
import GHC.Word
import GHC.Types
import Basement.UArray
import Basement.UArray.Mutable (MUArray)
import Basement.MutableBuilder
import Basement.String.Encoding.Encoding
-- offset of size one
aone :: Offset Word8
aone = Offset 1
data ISO_8859_1_Invalid
= NotISO_8859_1 Char
deriving (Typeable, Show, Eq)
instance Exception ISO_8859_1_Invalid
data ISO_8859_1 = ISO_8859_1
instance Encoding ISO_8859_1 where
type Unit ISO_8859_1 = Word8
type Error ISO_8859_1 = ISO_8859_1_Invalid
encodingNext _ = next
encodingWrite _ = write
next :: (Offset Word8 -> Word8)
-> Offset Word8
-> Either ISO_8859_1_Invalid (Char, Offset Word8)
next getter off = Right (toChar w, off + aone)
where
!(W8# w) = getter off
toChar :: Word8# -> Char
toChar a = C# (word8ToChar# w)
write :: (PrimMonad st, Monad st)
=> Char
-> Builder (UArray Word8) (MUArray Word8) Word8 st err ()
write c@(C# ch)
| c <= toEnum 0xFF = builderAppend (W8# x)
| otherwise = throw $ NotISO_8859_1 c
where
x :: Word8#
!x = wordToWord8# (int2Word# (ord# ch))

View file

@ -0,0 +1,106 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RebindableSyntax #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- |
-- Module : Basement.String.Encoding.UTF16
-- License : BSD-style
-- Maintainer : Foundation
-- Stability : experimental
-- Portability : portable
--
{-# LANGUAGE MagicHash #-}
module Basement.String.Encoding.UTF16
( UTF16(..)
, UTF16_Invalid(..)
) where
import GHC.Prim
import GHC.Word
import GHC.Types
import qualified Prelude
import Basement.Compat.Base
import Basement.Compat.Primitive
import Basement.IntegralConv
import Basement.Bits
import Basement.Types.OffsetSize
import Basement.Monad
import Basement.Numerical.Additive
import Basement.Numerical.Subtractive
import Basement.UArray
import Basement.UArray.Mutable (MUArray)
import Basement.MutableBuilder
import Basement.String.Encoding.Encoding
data UTF16_Invalid
= InvalidContinuation
| InvalidUnicode Char
deriving (Show, Eq, Typeable)
instance Exception UTF16_Invalid
data UTF16 = UTF16
instance Encoding UTF16 where
type Unit UTF16 = Word16
type Error UTF16 = UTF16_Invalid
encodingNext _ = next
encodingWrite _ = write
--
-- U+0000 to U+D7FF and U+E000 to U+FFFF : 1 bytes
-- U+10000 to U+10FFFF :
-- * 0x010000 is subtracted from the code point, leaving a 20-bit number in the range 0..0x0FFFFF.
-- * The top ten bits (a number in the range 0..0x03FF) are added to 0xD800 to give the first 16-bit code unit
-- or high surrogate, which will be in the range 0xD800..0xDBFF.
-- * The low ten bits (also in the range 0..0x03FF) are added to 0xDC00 to give the second 16-bit code unit
-- or low surrogate, which will be in the range 0xDC00..0xDFFF.
next :: (Offset Word16 -> Word16)
-> Offset Word16
-> Either UTF16_Invalid (Char, Offset Word16)
next getter off
| h < 0xd800 = Right (toChar16 h, off + Offset 1)
| h >= 0xe000 = Right (toChar16 h, off + Offset 1)
| otherwise = nextContinuation
where
h :: Word16
!h = getter off
to32 :: Word16 -> Word32
to32 (W16# w) = W32# (word16ToWord32# w)
toChar16 :: Word16 -> Char
toChar16 (W16# w) = C# (word32ToChar# (word16ToWord32# w))
nextContinuation
| cont >= 0xdc00 && cont < 0xe00 =
let !(W32# w) = ((to32 h .&. 0x3ff) .<<. 10) .|. (to32 cont .&. 0x3ff)
in Right (C# (word32ToChar# w), off + Offset 2)
| otherwise = Left InvalidContinuation
where
cont :: Word16
!cont = getter $ off + Offset 1
write :: (PrimMonad st, Monad st)
=> Char
-> Builder (UArray Word16) (MUArray Word16) Word16 st err ()
write c
| c < toEnum 0xd800 = builderAppend $ w16 c
| c > toEnum 0x10000 = let (w1, w2) = wHigh c in builderAppend w1 >> builderAppend w2
| c > toEnum 0x10ffff = throw $ InvalidUnicode c
| c >= toEnum 0xe000 = builderAppend $ w16 c
| otherwise = throw $ InvalidUnicode c
where
w16 :: Char -> Word16
w16 (C# ch) = W16# (wordToWord16# (int2Word# (ord# ch)))
to16 :: Word32 -> Word16
to16 = Prelude.fromIntegral
wHigh :: Char -> (Word16, Word16)
wHigh (C# ch) =
let v = W32# (charToWord32# ch) - 0x10000
in (0xdc00 .|. to16 (v .>>. 10), 0xd800 .|. to16 (v .&. 0x3ff))

View file

@ -0,0 +1,61 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RebindableSyntax #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- |
-- Module : Basement.String.Encoding.UTF32
-- License : BSD-style
-- Maintainer : Foundation
-- Stability : experimental
-- Portability : portable
--
{-# LANGUAGE MagicHash #-}
module Basement.String.Encoding.UTF32
( UTF32(..)
, UTF32_Invalid
) where
import GHC.Prim
import GHC.Word
import GHC.Types
import Basement.Compat.Base
import Basement.Compat.Primitive
import Basement.Types.OffsetSize
import Basement.Monad
import Basement.Numerical.Additive
import Basement.UArray
import Basement.UArray.Mutable (MUArray)
import Basement.MutableBuilder
import Basement.String.Encoding.Encoding
data UTF32 = UTF32
data UTF32_Invalid = UTF32_Invalid
deriving (Typeable, Show, Eq, Ord, Enum, Bounded)
instance Exception UTF32_Invalid
instance Encoding UTF32 where
type Unit UTF32 = Word32
type Error UTF32 = UTF32_Invalid
encodingNext _ = next
encodingWrite _ = write
next :: (Offset Word32 -> Word32)
-> Offset Word32
-> Either UTF32_Invalid (Char, Offset Word32)
next getter off = Right (char, off + Offset 1)
where
!(W32# hh) = getter off
char :: Char
char = C# (word32ToChar# hh)
write :: (PrimMonad st, Monad st)
=> Char
-> Builder (UArray Word32) (MUArray Word32) Word32 st err ()
write c = builderAppend w32
where
!(C# ch) = c
w32 :: Word32
w32 = W32# (charToWord32# ch)