tools/confuse.hs
author nemo
Tue, 30 Apr 2019 09:36:13 -0400
changeset 14859 8d65728c4ed0
parent 10075 dbaf90a0fbe0
permissions -rw-r--r--
Backed out changeset 13589d529899 So, we only disabled this on the release branch in r29d614a5c9eb due to having discovered it JUST before release. We should fix it properly in default...
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
10064
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     1
{-# LANGUAGE OverloadedStrings #-}
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     2
module Confuse where
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     3
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     4
import Numeric
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     5
import Data.Char
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     6
import Control.Monad
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     7
import qualified Data.ByteString as B
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     8
import qualified Data.ByteString.UTF8 as UTF8
10073
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
     9
import qualified Data.Map as Map
10064
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    10
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    11
hx :: [Char] -> String
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    12
hx cs = let ch = (chr . fst . last . readHex $ cs) in
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    13
            case ch of
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    14
                 '\'' -> "''"
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    15
                 '\\' -> "\\\\"
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    16
                 c -> c : []
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    17
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    18
conv :: String -> B.ByteString
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    19
conv s = B.concat ["('", UTF8.fromString i, "', '", UTF8.fromString r, "')"]
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    20
    where
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    21
        i :: String
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    22
        i = hx s
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    23
        r :: String
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    24
        r = concatMap hx . words . takeWhile ((/=) ';') . tail $ dropWhile ((/=) '\t') s
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    25
10073
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    26
convRules :: (B.ByteString, [B.ByteString]) -> B.ByteString
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    27
convRules (a, b) = B.concat ["<reset>", u a, "</reset>\n<s>", B.concat $ map u b, "</s>"]
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    28
    where
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    29
        u a = B.concat ["\\","u",a]
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    30
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    31
toPair :: String -> (B.ByteString, [B.ByteString])
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    32
toPair s = (UTF8.fromString $ takeWhile isHexDigit s, map UTF8.fromString . words . takeWhile ((/=) ';') . tail $ dropWhile ((/=) '\t') s)
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    33
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    34
10064
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    35
main = do
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    36
    ll <- liftM (filter (isHexDigit . head) . filter (not . null) . lines) $ readFile "confusables.txt"
10075
dbaf90a0fbe0 Filter confusables to some limits
unc0rr
parents: 10073
diff changeset
    37
    B.writeFile "rules.txt" . B.intercalate "\n" . map convRules . Map.toList . Map.fromList . filter notTooLong . filter fits16bit . map toPair $ ll
dbaf90a0fbe0 Filter confusables to some limits
unc0rr
parents: 10073
diff changeset
    38
    where
dbaf90a0fbe0 Filter confusables to some limits
unc0rr
parents: 10073
diff changeset
    39
        notTooLong = (>) 6 . length . snd
dbaf90a0fbe0 Filter confusables to some limits
unc0rr
parents: 10073
diff changeset
    40
        fits16bit (a, b) = let f = (>) 5 . B.length in all f $ a:b
dbaf90a0fbe0 Filter confusables to some limits
unc0rr
parents: 10073
diff changeset
    41