netserver/Codec/Binary/UTF8/String.hs
author unc0rr
Sun, 25 Jan 2009 13:48:41 +0000
changeset 1747 44a6a9924c6d
permissions -rw-r--r--
Add needed libraries
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
1747
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
     1
--
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
     2
-- |
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
     3
-- Module      :  Codec.Binary.UTF8.String
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
     4
-- Copyright   :  (c) Eric Mertens 2007
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
     5
-- License     :  BSD3-style (see LICENSE)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
     6
-- 
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
     7
-- Maintainer:    emertens@galois.com
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
     8
-- Stability   :  experimental
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
     9
-- Portability :  portable
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    10
--
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    11
-- Support for encoding UTF8 Strings to and from @[Word8]@
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    12
--
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    13
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    14
module Codec.Binary.UTF8.String (
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    15
      encode
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    16
    , decode
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    17
    , encodeString
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    18
    , decodeString
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    19
  ) where
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    20
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    21
import Data.Word        (Word8)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    22
import Data.Bits        ((.|.),(.&.),shiftL,shiftR)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    23
import Data.Char        (chr,ord)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    24
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    25
default(Int)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    26
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    27
-- | Encode a string using 'encode' and store the result in a 'String'.
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    28
encodeString :: String -> String
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    29
encodeString xs = map (toEnum . fromEnum) (encode xs)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    30
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    31
-- | Decode a string using 'decode' using a 'String' as input.
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    32
-- | This is not safe but it is necessary if UTF-8 encoded text
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    33
-- | has been loaded into a 'String' prior to being decoded.
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    34
decodeString :: String -> String
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    35
decodeString xs = decode (map (toEnum . fromEnum) xs)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    36
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    37
replacement_character :: Char
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    38
replacement_character = '\xfffd'
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    39
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    40
-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    41
encode :: String -> [Word8]
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    42
encode = concatMap (map fromIntegral . go . ord)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    43
 where
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    44
  go oc
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    45
   | oc <= 0x7f       = [oc]
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    46
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    47
   | oc <= 0x7ff      = [ 0xc0 + (oc `shiftR` 6)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    48
                        , 0x80 + oc .&. 0x3f
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    49
                        ]
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    50
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    51
   | oc <= 0xffff     = [ 0xe0 + (oc `shiftR` 12)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    52
                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    53
                        , 0x80 + oc .&. 0x3f
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    54
                        ]
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    55
   | otherwise        = [ 0xf0 + (oc `shiftR` 18)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    56
                        , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    57
                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    58
                        , 0x80 + oc .&. 0x3f
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    59
                        ]
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    60
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    61
--
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    62
-- | Decode a UTF8 string packed into a list of Word8 values, directly to String
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    63
--
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    64
decode :: [Word8] -> String
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    65
decode [    ] = ""
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    66
decode (c:cs)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    67
  | c < 0x80  = chr (fromEnum c) : decode cs
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    68
  | c < 0xc0  = replacement_character : decode cs
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    69
  | c < 0xe0  = multi1
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    70
  | c < 0xf0  = multi_byte 2 0xf  0x800
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    71
  | c < 0xf8  = multi_byte 3 0x7  0x10000
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    72
  | c < 0xfc  = multi_byte 4 0x3  0x200000
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    73
  | c < 0xfe  = multi_byte 5 0x1  0x4000000
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    74
  | otherwise = replacement_character : decode cs
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    75
  where
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    76
    multi1 = case cs of
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    77
      c1 : ds | c1 .&. 0xc0 == 0x80 ->
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    78
        let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|.  fromEnum (c1 .&. 0x3f)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    79
        in if d >= 0x000080 then toEnum d : decode ds
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    80
                            else replacement_character : decode ds
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    81
      _ -> replacement_character : decode cs
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    82
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    83
    multi_byte :: Int -> Word8 -> Int -> [Char]
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    84
    multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    85
      where
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    86
        aux 0 rs acc
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    87
          | overlong <= acc && acc <= 0x10ffff &&
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    88
            (acc < 0xd800 || 0xdfff < acc)     &&
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    89
            (acc < 0xfffe || 0xffff < acc)      = chr acc : decode rs
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    90
          | otherwise = replacement_character : decode rs
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    91
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    92
        aux n (r:rs) acc
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    93
          | r .&. 0xc0 == 0x80 = aux (n-1) rs
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    94
                               $ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    95
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    96
        aux _ rs     _ = replacement_character : decode rs
44a6a9924c6d Add needed libraries
unc0rr
parents:
diff changeset
    97