gameServer/Codec/Binary/UTF8/String.hs
changeset 1804 4e78ad846fb6
equal deleted inserted replaced
1803:95efe37482e3 1804:4e78ad846fb6
       
     1 --
       
     2 -- |
       
     3 -- Module      :  Codec.Binary.UTF8.String
       
     4 -- Copyright   :  (c) Eric Mertens 2007
       
     5 -- License     :  BSD3-style (see LICENSE)
       
     6 -- 
       
     7 -- Maintainer:    emertens@galois.com
       
     8 -- Stability   :  experimental
       
     9 -- Portability :  portable
       
    10 --
       
    11 -- Support for encoding UTF8 Strings to and from @[Word8]@
       
    12 --
       
    13 
       
    14 module Codec.Binary.UTF8.String (
       
    15       encode
       
    16     , decode
       
    17     , encodeString
       
    18     , decodeString
       
    19   ) where
       
    20 
       
    21 import Data.Word        (Word8)
       
    22 import Data.Bits        ((.|.),(.&.),shiftL,shiftR)
       
    23 import Data.Char        (chr,ord)
       
    24 
       
    25 default(Int)
       
    26 
       
    27 -- | Encode a string using 'encode' and store the result in a 'String'.
       
    28 encodeString :: String -> String
       
    29 encodeString xs = map (toEnum . fromEnum) (encode xs)
       
    30 
       
    31 -- | Decode a string using 'decode' using a 'String' as input.
       
    32 -- | This is not safe but it is necessary if UTF-8 encoded text
       
    33 -- | has been loaded into a 'String' prior to being decoded.
       
    34 decodeString :: String -> String
       
    35 decodeString xs = decode (map (toEnum . fromEnum) xs)
       
    36 
       
    37 replacement_character :: Char
       
    38 replacement_character = '\xfffd'
       
    39 
       
    40 -- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
       
    41 encode :: String -> [Word8]
       
    42 encode = concatMap (map fromIntegral . go . ord)
       
    43  where
       
    44   go oc
       
    45    | oc <= 0x7f       = [oc]
       
    46 
       
    47    | oc <= 0x7ff      = [ 0xc0 + (oc `shiftR` 6)
       
    48                         , 0x80 + oc .&. 0x3f
       
    49                         ]
       
    50 
       
    51    | oc <= 0xffff     = [ 0xe0 + (oc `shiftR` 12)
       
    52                         , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
       
    53                         , 0x80 + oc .&. 0x3f
       
    54                         ]
       
    55    | otherwise        = [ 0xf0 + (oc `shiftR` 18)
       
    56                         , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
       
    57                         , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
       
    58                         , 0x80 + oc .&. 0x3f
       
    59                         ]
       
    60 
       
    61 --
       
    62 -- | Decode a UTF8 string packed into a list of Word8 values, directly to String
       
    63 --
       
    64 decode :: [Word8] -> String
       
    65 decode [    ] = ""
       
    66 decode (c:cs)
       
    67   | c < 0x80  = chr (fromEnum c) : decode cs
       
    68   | c < 0xc0  = replacement_character : decode cs
       
    69   | c < 0xe0  = multi1
       
    70   | c < 0xf0  = multi_byte 2 0xf  0x800
       
    71   | c < 0xf8  = multi_byte 3 0x7  0x10000
       
    72   | c < 0xfc  = multi_byte 4 0x3  0x200000
       
    73   | c < 0xfe  = multi_byte 5 0x1  0x4000000
       
    74   | otherwise = replacement_character : decode cs
       
    75   where
       
    76     multi1 = case cs of
       
    77       c1 : ds | c1 .&. 0xc0 == 0x80 ->
       
    78         let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|.  fromEnum (c1 .&. 0x3f)
       
    79         in if d >= 0x000080 then toEnum d : decode ds
       
    80                             else replacement_character : decode ds
       
    81       _ -> replacement_character : decode cs
       
    82 
       
    83     multi_byte :: Int -> Word8 -> Int -> [Char]
       
    84     multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
       
    85       where
       
    86         aux 0 rs acc
       
    87           | overlong <= acc && acc <= 0x10ffff &&
       
    88             (acc < 0xd800 || 0xdfff < acc)     &&
       
    89             (acc < 0xfffe || 0xffff < acc)      = chr acc : decode rs
       
    90           | otherwise = replacement_character : decode rs
       
    91 
       
    92         aux n (r:rs) acc
       
    93           | r .&. 0xc0 == 0x80 = aux (n-1) rs
       
    94                                $ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
       
    95 
       
    96         aux _ rs     _ = replacement_character : decode rs
       
    97