107 lines
3.2 KiB
Haskell
107 lines
3.2 KiB
Haskell
|
|
{-# LANGUAGE BangPatterns #-}
|
||
|
|
{-# LANGUAGE DeriveDataTypeable #-}
|
||
|
|
{-# LANGUAGE TypeFamilies #-}
|
||
|
|
{-# LANGUAGE RebindableSyntax #-}
|
||
|
|
{-# LANGUAGE NoImplicitPrelude #-}
|
||
|
|
-- |
|
||
|
|
-- Module : Basement.String.Encoding.UTF16
|
||
|
|
-- License : BSD-style
|
||
|
|
-- Maintainer : Foundation
|
||
|
|
-- Stability : experimental
|
||
|
|
-- Portability : portable
|
||
|
|
--
|
||
|
|
{-# LANGUAGE MagicHash #-}
|
||
|
|
module Basement.String.Encoding.UTF16
|
||
|
|
( UTF16(..)
|
||
|
|
, UTF16_Invalid(..)
|
||
|
|
) where
|
||
|
|
|
||
|
|
import GHC.Prim
|
||
|
|
import GHC.Word
|
||
|
|
import GHC.Types
|
||
|
|
import qualified Prelude
|
||
|
|
import Basement.Compat.Base
|
||
|
|
import Basement.Compat.Primitive
|
||
|
|
import Basement.IntegralConv
|
||
|
|
import Basement.Bits
|
||
|
|
import Basement.Types.OffsetSize
|
||
|
|
import Basement.Monad
|
||
|
|
import Basement.Numerical.Additive
|
||
|
|
import Basement.Numerical.Subtractive
|
||
|
|
import Basement.UArray
|
||
|
|
import Basement.UArray.Mutable (MUArray)
|
||
|
|
import Basement.MutableBuilder
|
||
|
|
|
||
|
|
import Basement.String.Encoding.Encoding
|
||
|
|
|
||
|
|
data UTF16_Invalid
|
||
|
|
= InvalidContinuation
|
||
|
|
| InvalidUnicode Char
|
||
|
|
deriving (Show, Eq, Typeable)
|
||
|
|
instance Exception UTF16_Invalid
|
||
|
|
|
||
|
|
data UTF16 = UTF16
|
||
|
|
|
||
|
|
instance Encoding UTF16 where
|
||
|
|
type Unit UTF16 = Word16
|
||
|
|
type Error UTF16 = UTF16_Invalid
|
||
|
|
encodingNext _ = next
|
||
|
|
encodingWrite _ = write
|
||
|
|
|
||
|
|
|
||
|
|
--
|
||
|
|
-- U+0000 to U+D7FF and U+E000 to U+FFFF : 1 bytes
|
||
|
|
-- U+10000 to U+10FFFF :
|
||
|
|
-- * 0x010000 is subtracted from the code point, leaving a 20-bit number in the range 0..0x0FFFFF.
|
||
|
|
-- * The top ten bits (a number in the range 0..0x03FF) are added to 0xD800 to give the first 16-bit code unit
|
||
|
|
-- or high surrogate, which will be in the range 0xD800..0xDBFF.
|
||
|
|
-- * The low ten bits (also in the range 0..0x03FF) are added to 0xDC00 to give the second 16-bit code unit
|
||
|
|
-- or low surrogate, which will be in the range 0xDC00..0xDFFF.
|
||
|
|
|
||
|
|
next :: (Offset Word16 -> Word16)
|
||
|
|
-> Offset Word16
|
||
|
|
-> Either UTF16_Invalid (Char, Offset Word16)
|
||
|
|
next getter off
|
||
|
|
| h < 0xd800 = Right (toChar16 h, off + Offset 1)
|
||
|
|
| h >= 0xe000 = Right (toChar16 h, off + Offset 1)
|
||
|
|
| otherwise = nextContinuation
|
||
|
|
where
|
||
|
|
h :: Word16
|
||
|
|
!h = getter off
|
||
|
|
|
||
|
|
to32 :: Word16 -> Word32
|
||
|
|
to32 (W16# w) = W32# (word16ToWord32# w)
|
||
|
|
|
||
|
|
toChar16 :: Word16 -> Char
|
||
|
|
toChar16 (W16# w) = C# (word32ToChar# (word16ToWord32# w))
|
||
|
|
|
||
|
|
nextContinuation
|
||
|
|
| cont >= 0xdc00 && cont < 0xe00 =
|
||
|
|
let !(W32# w) = ((to32 h .&. 0x3ff) .<<. 10) .|. (to32 cont .&. 0x3ff)
|
||
|
|
in Right (C# (word32ToChar# w), off + Offset 2)
|
||
|
|
| otherwise = Left InvalidContinuation
|
||
|
|
where
|
||
|
|
cont :: Word16
|
||
|
|
!cont = getter $ off + Offset 1
|
||
|
|
|
||
|
|
write :: (PrimMonad st, Monad st)
|
||
|
|
=> Char
|
||
|
|
-> Builder (UArray Word16) (MUArray Word16) Word16 st err ()
|
||
|
|
write c
|
||
|
|
| c < toEnum 0xd800 = builderAppend $ w16 c
|
||
|
|
| c > toEnum 0x10000 = let (w1, w2) = wHigh c in builderAppend w1 >> builderAppend w2
|
||
|
|
| c > toEnum 0x10ffff = throw $ InvalidUnicode c
|
||
|
|
| c >= toEnum 0xe000 = builderAppend $ w16 c
|
||
|
|
| otherwise = throw $ InvalidUnicode c
|
||
|
|
where
|
||
|
|
w16 :: Char -> Word16
|
||
|
|
w16 (C# ch) = W16# (wordToWord16# (int2Word# (ord# ch)))
|
||
|
|
|
||
|
|
to16 :: Word32 -> Word16
|
||
|
|
to16 = Prelude.fromIntegral
|
||
|
|
|
||
|
|
wHigh :: Char -> (Word16, Word16)
|
||
|
|
wHigh (C# ch) =
|
||
|
|
let v = W32# (charToWord32# ch) - 0x10000
|
||
|
|
in (0xdc00 .|. to16 (v .>>. 10), 0xd800 .|. to16 (v .&. 0x3ff))
|