251 lines
9.2 KiB
Haskell
251 lines
9.2 KiB
Haskell
{-# LANGUAGE BangPatterns #-}
|
|
{-# LANGUAGE DeriveDataTypeable #-}
|
|
{-# LANGUAGE TypeFamilies #-}
|
|
{-# LANGUAGE RebindableSyntax #-}
|
|
{-# LANGUAGE NoImplicitPrelude #-}
|
|
-- |
|
|
-- Module : Basement.UTF8.Helper
|
|
-- License : BSD-style
|
|
-- Maintainer : Foundation
|
|
--
|
|
-- Some low level helpers to use UTF8
|
|
--
|
|
-- Most helpers are lowlevel and unsafe, don't use
|
|
-- directly.
|
|
{-# LANGUAGE BangPatterns #-}
|
|
{-# LANGUAGE MagicHash #-}
|
|
{-# LANGUAGE NoImplicitPrelude #-}
|
|
{-# LANGUAGE TypeFamilies #-}
|
|
{-# LANGUAGE FlexibleContexts #-}
|
|
{-# LANGUAGE CPP #-}
|
|
module Basement.UTF8.Helper
|
|
where
|
|
|
|
import Basement.Compat.Base
|
|
import Basement.Compat.Primitive
|
|
import Basement.Types.OffsetSize
|
|
import Basement.UTF8.Types
|
|
import Basement.Bits
|
|
import GHC.Prim
|
|
import GHC.Types
|
|
import GHC.Word
|
|
|
|
-- mask an UTF8 continuation byte (stripping the leading 10 and returning 6 valid bits)
|
|
maskContinuation# :: Word# -> Word#
|
|
maskContinuation# v = and# v 0x3f##
|
|
{-# INLINE maskContinuation# #-}
|
|
|
|
-- mask a UTF8 header for 2 bytes encoding (110xxxxx and 5 valid bits)
|
|
maskHeader2# :: Word# -> Word#
|
|
maskHeader2# h = and# h 0x1f##
|
|
{-# INLINE maskHeader2# #-}
|
|
|
|
-- mask a UTF8 header for 3 bytes encoding (1110xxxx and 4 valid bits)
|
|
maskHeader3# :: Word# -> Word#
|
|
maskHeader3# h = and# h 0xf##
|
|
{-# INLINE maskHeader3# #-}
|
|
|
|
-- mask a UTF8 header for 4 bytes encoding (11110xxx and 3 valid bits)
|
|
maskHeader4# :: Word# -> Word#
|
|
maskHeader4# h = and# h 0x7##
|
|
{-# INLINE maskHeader4# #-}
|
|
|
|
or3# :: Word# -> Word# -> Word# -> Word#
|
|
or3# a b c = or# a (or# b c)
|
|
{-# INLINE or3# #-}
|
|
|
|
or4# :: Word# -> Word# -> Word# -> Word# -> Word#
|
|
or4# a b c d = or# (or# a b) (or# c d)
|
|
{-# INLINE or4# #-}
|
|
|
|
toChar# :: Word# -> Char
|
|
toChar# w = C# (chr# (word2Int# w))
|
|
{-# INLINE toChar# #-}
|
|
|
|
toChar1 :: StepASCII -> Char
|
|
toChar1 (StepASCII (W8# w)) = C# (word8ToChar# w)
|
|
|
|
toChar2 :: StepASCII -> Word8 -> Char
|
|
toChar2 (StepASCII (W8# b1)) (W8# b2) =
|
|
toChar# (or# (uncheckedShiftL# (maskHeader2# w1) 6#) (maskContinuation# w2))
|
|
where
|
|
w1 = word8ToWord# b1
|
|
w2 = word8ToWord# b2
|
|
|
|
toChar3 :: StepASCII -> Word8 -> Word8 -> Char
|
|
toChar3 (StepASCII (W8# b1)) (W8# b2) (W8# b3) =
|
|
toChar# (or3# (uncheckedShiftL# (maskHeader3# w1) 12#)
|
|
(uncheckedShiftL# (maskContinuation# w2) 6#)
|
|
(maskContinuation# w3)
|
|
)
|
|
where
|
|
w1 = word8ToWord# b1
|
|
w2 = word8ToWord# b2
|
|
w3 = word8ToWord# b3
|
|
|
|
toChar4 :: StepASCII -> Word8 -> Word8 -> Word8 -> Char
|
|
toChar4 (StepASCII (W8# b1)) (W8# b2) (W8# b3) (W8# b4) =
|
|
toChar# (or4# (uncheckedShiftL# (maskHeader4# w1) 18#)
|
|
(uncheckedShiftL# (maskContinuation# w2) 12#)
|
|
(uncheckedShiftL# (maskContinuation# w3) 6#)
|
|
(maskContinuation# w4)
|
|
)
|
|
where
|
|
w1 = word8ToWord# b1
|
|
w2 = word8ToWord# b2
|
|
w3 = word8ToWord# b3
|
|
w4 = word8ToWord# b4
|
|
|
|
-- | Different way to encode a Character in UTF8 represented as an ADT
|
|
data UTF8Char =
|
|
UTF8_1 {-# UNPACK #-} !Word8
|
|
| UTF8_2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
|
|
| UTF8_3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
|
|
| UTF8_4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
|
|
|
|
-- | Transform a Unicode code point 'Char' into
|
|
--
|
|
-- note that we expect here a valid unicode code point in the *allowed* range.
|
|
-- bits will be lost if going above 0x10ffff
|
|
asUTF8Char :: Char -> UTF8Char
|
|
asUTF8Char !(C# c)
|
|
| bool# (ltWord# x 0x80## ) = encode1
|
|
| bool# (ltWord# x 0x800## ) = encode2
|
|
| bool# (ltWord# x 0x10000##) = encode3
|
|
| otherwise = encode4
|
|
where
|
|
!x = int2Word# (ord# c)
|
|
|
|
encode1 = UTF8_1 (W8# (wordToWord8# x))
|
|
encode2 =
|
|
let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 6#) 0xc0##))
|
|
!x2 = toContinuation x
|
|
in UTF8_2 x1 x2
|
|
encode3 =
|
|
let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 12#) 0xe0##))
|
|
!x2 = toContinuation (uncheckedShiftRL# x 6#)
|
|
!x3 = toContinuation x
|
|
in UTF8_3 x1 x2 x3
|
|
encode4 =
|
|
let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 18#) 0xf0##))
|
|
!x2 = toContinuation (uncheckedShiftRL# x 12#)
|
|
!x3 = toContinuation (uncheckedShiftRL# x 6#)
|
|
!x4 = toContinuation x
|
|
in UTF8_4 x1 x2 x3 x4
|
|
|
|
toContinuation :: Word# -> Word8
|
|
toContinuation w = W8# (wordToWord8# (or# (and# w 0x3f##) 0x80##))
|
|
{-# INLINE toContinuation #-}
|
|
|
|
-- given the encoding of UTF8 Char, get the number of bytes of this sequence
|
|
numBytes :: UTF8Char -> CountOf Word8
|
|
numBytes UTF8_1{} = CountOf 1
|
|
numBytes UTF8_2{} = CountOf 2
|
|
numBytes UTF8_3{} = CountOf 3
|
|
numBytes UTF8_4{} = CountOf 4
|
|
|
|
-- given the leading byte of a utf8 sequence, get the number of bytes of this sequence
|
|
skipNextHeaderValue :: Word8 -> CountOf Word8
|
|
skipNextHeaderValue !x
|
|
| x < 0xC0 = CountOf 1 -- 0b11000000
|
|
| x < 0xE0 = CountOf 2 -- 0b11100000
|
|
| x < 0xF0 = CountOf 3 -- 0b11110000
|
|
| otherwise = CountOf 4
|
|
{-# INLINE skipNextHeaderValue #-}
|
|
|
|
headerIsAscii :: StepASCII -> Bool
|
|
headerIsAscii (StepASCII x) = x < 0x80
|
|
|
|
charToBytes :: Int -> CountOf Word8
|
|
charToBytes c
|
|
| c < 0x80 = CountOf 1
|
|
| c < 0x800 = CountOf 2
|
|
| c < 0x10000 = CountOf 3
|
|
| c < 0x110000 = CountOf 4
|
|
| otherwise = error ("invalid code point: " `mappend` show c)
|
|
|
|
-- | Encode a Char into a CharUTF8
|
|
encodeCharUTF8 :: Char -> CharUTF8
|
|
encodeCharUTF8 !(C# c)
|
|
| bool# (ltWord# x 0x80## ) = CharUTF8 (W32# (wordToWord32# x))
|
|
| bool# (ltWord# x 0x800## ) = CharUTF8 (W32# (wordToWord32# encode2))
|
|
| bool# (ltWord# x 0x10000##) = CharUTF8 (W32# (wordToWord32# encode3))
|
|
| otherwise = CharUTF8 (W32# (wordToWord32# encode4))
|
|
where
|
|
!x = int2Word# (ord# c)
|
|
|
|
-- clearing mask, clearing all the bits that need to be clear as per the UTF8 encoding
|
|
mask2 = 0x0000bfdf## -- 1 continuation , 5 bits header
|
|
mask3 = 0x00bfbfef## -- 2 continuations, 4 bits header
|
|
mask4 = 0xbfbfbff7## -- 3 continuations, 3 bits header
|
|
|
|
-- setting mask, settings all the bits that need to be set per the UTF8 encoding
|
|
set2 = 0x000080c0## -- 10xxxxxx 110xxxxx
|
|
set3 = 0x008080e0## -- 10xxxxxx * 2 1110xxxx
|
|
set4 = 0x808080f0## -- 10xxxxxx * 3 11111xxx
|
|
|
|
encode2 = and# mask2 (or3# set2
|
|
(uncheckedShiftRL# x 6#) -- 5 bits to 1st byte
|
|
(uncheckedShiftL# x 8# ) -- move lowest bits to the 2nd byte
|
|
)
|
|
encode3 = and# mask3 (or4# set3
|
|
(uncheckedShiftRL# x 12#) -- 4 bits to 1st byte
|
|
(and# 0x3f00## (uncheckedShiftL# x 2#)) -- 6 bits to the 2nd byte
|
|
(uncheckedShiftL# x 16# ) -- move lowest bits to the 3rd byte
|
|
)
|
|
encode4 = and# mask4 (or4# set4
|
|
(uncheckedShiftRL# x 18#) -- 3 bits to 1st byte
|
|
(or# (and# 0x3f00## (uncheckedShiftRL# x 4#)) -- 6 bits to the 2nd byte
|
|
(and# 0x3f0000## (uncheckedShiftL# x 10#)) -- 6 bits to the 3nd byte
|
|
)
|
|
(uncheckedShiftL# x 24# ) -- move lowest bits to the 4rd byte
|
|
)
|
|
|
|
-- | decode a CharUTF8 into a Char
|
|
--
|
|
-- If the value inside a CharUTF8 is not properly encoded, this will result in violation
|
|
-- of the Char invariants
|
|
decodeCharUTF8 :: CharUTF8 -> Char
|
|
decodeCharUTF8 c@(CharUTF8 !(W32# w_))
|
|
| isCharUTF8Case1 c = toChar# w
|
|
| isCharUTF8Case2 c = encode2
|
|
| isCharUTF8Case3 c = encode3
|
|
| otherwise = encode4
|
|
where
|
|
w = word32ToWord# w_
|
|
encode2 =
|
|
toChar# (or# (uncheckedShiftL# (maskHeader2# w) 6#)
|
|
(maskContinuation# (uncheckedShiftRL# w 8#))
|
|
)
|
|
encode3 =
|
|
toChar# (or3# (uncheckedShiftL# (maskHeader3# w) 12#)
|
|
(uncheckedShiftRL# (and# 0x3f00## w) 8#)
|
|
(maskContinuation# (uncheckedShiftRL# w 16#))
|
|
)
|
|
encode4 =
|
|
toChar# (or4# (uncheckedShiftL# (maskHeader4# w) 18#)
|
|
(uncheckedShiftRL# (and# 0x3f00## w) 10#)
|
|
(uncheckedShiftL# (and# 0x3f0000## w) 4#)
|
|
(maskContinuation# (uncheckedShiftRL# w 24#))
|
|
)
|
|
|
|
-- clearing mask, removing all UTF8 metadata and keeping only signal (content)
|
|
--maskContent2 = 0x00003f1f## -- 1 continuation , 5 bits header
|
|
--maskContent3 = 0x003f3f0f## -- 2 continuations, 4 bits header
|
|
--maskContent4 = 0x3f3f3f07## -- 3 continuations, 3 bits header
|
|
|
|
isCharUTF8Case1 :: CharUTF8 -> Bool
|
|
isCharUTF8Case1 (CharUTF8 !w) = (w .&. 0x80) == 0
|
|
{-# INLINE isCharUTF8Case1 #-}
|
|
|
|
isCharUTF8Case2 :: CharUTF8 -> Bool
|
|
isCharUTF8Case2 (CharUTF8 !w) = (w .&. 0x20) == 0
|
|
{-# INLINE isCharUTF8Case2 #-}
|
|
|
|
isCharUTF8Case3 :: CharUTF8 -> Bool
|
|
isCharUTF8Case3 (CharUTF8 !w) = (w .&. 0x10) == 0
|
|
{-# INLINE isCharUTF8Case3 #-}
|
|
|
|
isCharUTF8Case4 :: CharUTF8 -> Bool
|
|
isCharUTF8Case4 (CharUTF8 !w) = (w .&. 0x08) == 0
|
|
{-# INLINE isCharUTF8Case4 #-}
|