From 67e82e87582e586c1eaf573ac72a58302522f396 Mon Sep 17 00:00:00 2001 From: sternenseemann <0rpkxez4ksa01gb3typccl0i@systemli.org> Date: Mon, 31 Aug 2020 14:04:40 +0200 Subject: refactor(EmojiTest): move emoji-test.txt parser into own module --- emoji-generic.cabal | 2 +- src/Text/Emoji/DataFiles.hs | 125 ------------------------------ src/Text/Emoji/DataFiles/EmojiTest.hs | 142 ++++++++++++++++++++++++++++++++++ test/Main.hs | 2 +- 4 files changed, 144 insertions(+), 127 deletions(-) delete mode 100644 src/Text/Emoji/DataFiles.hs create mode 100644 src/Text/Emoji/DataFiles/EmojiTest.hs diff --git a/emoji-generic.cabal b/emoji-generic.cabal index 23362c0..4f2c976 100644 --- a/emoji-generic.cabal +++ b/emoji-generic.cabal @@ -17,7 +17,7 @@ extra-source-files: README.md library exposed-modules: Text.Emoji - , Text.Emoji.DataFiles + , Text.Emoji.DataFiles.EmojiTest , Text.Emoji.Types , Text.Emoji.String , Text.Emoji.Text diff --git a/src/Text/Emoji/DataFiles.hs b/src/Text/Emoji/DataFiles.hs deleted file mode 100644 index d99646a..0000000 --- a/src/Text/Emoji/DataFiles.hs +++ /dev/null @@ -1,125 +0,0 @@ -{-# LANGUAGE OverloadedStrings #-} -{-| - Module: Text.Emoji.Data - Description: --} - - -module Text.Emoji.DataFiles where - -import Prelude hiding (takeWhile) - -import Text.Emoji.Types - -import Control.Applicative ((<|>)) -import Data.Attoparsec.Text (Parser (), takeWhile1, takeWhile, string, choice - , notInClass, skipWhile, skipMany, isHorizontalSpace - , decimal, hexadecimal, char, many1, endOfLine) -import Data.Text (Text) -import qualified Data.Text as T -import Data.Word (Word32) - -type EmojiTest = [EmojiTestEntry] - -data EmojiTestEntry - = Group EmojiTestGroupLevel Text [EmojiTestEntry] - | Entry [Word32] EmojiStatus EmojiVersion Text - | Comment Text - deriving (Show, Eq, Ord) - -data EmojiTestGroupLevel - = EmojiTestGroup - | EmojiTestSubgroup - deriving (Show, Eq, Ord, Enum) - -groupLevelText :: EmojiTestGroupLevel -> Text -groupLevelText EmojiTestGroup = "group" -groupLevelText EmojiTestSubgroup = "subgroup" - -notSpace :: Char -> Bool -notSpace = notInClass " \t" - -notEol :: Char -> Bool -notEol = notInClass "\n" - -skipSpace :: Parser () -skipSpace = skipWhile isHorizontalSpace - -codePointsColumn :: Parser [Word32] -codePointsColumn = many1 (hexadecimal <* char ' ') - -statusColumn :: Parser EmojiStatus -statusColumn = - (string "fully-qualified" >> pure (EmojiStatusCharacter FullyQualified)) <|> - (string "minimally-qualified" >> pure (EmojiStatusCharacter MinimallyQualified)) <|> - (string "unqualified" >> pure (EmojiStatusCharacter Unqualified)) <|> - (string "component" >> pure (EmojiStatusComponent)) - -emojiTestGroup :: EmojiTestGroupLevel -> Parser EmojiTestEntry -emojiTestGroup maxLevel = do - _ <- char '#' - skipSpace - - _ <- string $ groupLevelText maxLevel - _ <- char ':' - skipSpace - - name <- takeWhile1 notEol - skipMany endOfLine - - let groupParser = - if maxLevel == EmojiTestGroup - then [ emojiTestGroup EmojiTestSubgroup ] - else [] - - groupEntries <- many1 . choice $ - groupParser ++ [ emojiTestEntryLine, emojiTestCommentLine ] - - pure $ Group EmojiTestGroup name groupEntries - -emojiVersionColumn :: Parser EmojiVersion -emojiVersionColumn = do - _ <- char 'E' - major <- decimal - _ <- char '.' - minor <- decimal - pure $ case major of - 0 -> case minor of - -- E0.0: pre emoji without specific Unicode Version - 0 -> NoEmojiVersion Nothing - -- E0.x: Pre emoji with Unicode Version - _ -> NoEmojiVersion (Just minor) - -- Ex.y: Regular Emoji Version - _ -> EmojiVersion major minor - -emojiTestEntryLine :: Parser EmojiTestEntry -emojiTestEntryLine = do - codePoints <- codePointsColumn - skipSpace - - _ <- string "; " - status <- statusColumn - skipSpace - - _ <- string "# " - skipWhile (notInClass "E") - version <- emojiVersionColumn - skipSpace - - shortName <- takeWhile1 notEol - skipMany endOfLine - - pure $ Entry codePoints status version shortName - -emojiTestCommentLine :: Parser EmojiTestEntry -emojiTestCommentLine = do - _ <- char '#' - skipSpace - text <- takeWhile notEol <* skipMany endOfLine - if "group:" `T.isPrefixOf` text || "subgroup:" `T.isPrefixOf` text - then fail "group, not comment" - else pure $ Comment text - -emojiTestFile :: Parser EmojiTest -emojiTestFile = many1 $ - emojiTestGroup EmojiTestGroup <|> emojiTestEntryLine <|> emojiTestCommentLine diff --git a/src/Text/Emoji/DataFiles/EmojiTest.hs b/src/Text/Emoji/DataFiles/EmojiTest.hs new file mode 100644 index 0000000..ca54d6c --- /dev/null +++ b/src/Text/Emoji/DataFiles/EmojiTest.hs @@ -0,0 +1,142 @@ +{-# LANGUAGE OverloadedStrings #-} +{-| + Module: Text.Emoji.DataFiles.EmojiTest + Description: Parsers and Utilities related to the @emoji-test.txt@ file. + + This module can be utilized to parse the @emoji-test.txt@ data file + provided by the Unicode Consortium. This file contains a representation + of a more or less typical emoji keyboard which means you get: + + * a list of valid emojis (qualified) plus unqualified ones, + emoji components, you can expect to find in the wild or + may want to input + * an organization of those into groups and subgroups (like in a keyboard) +-} + + +module Text.Emoji.DataFiles.EmojiTest + ( -- * Representation of the parsed file + EmojiTest () + , EmojiTestEntry (..) + , EmojiTestGroupLevel (..) + -- * Attoparsec parsers + , emojiVersionColumn + , emojiTestFile + ) where + +import Prelude hiding (takeWhile) + +import Text.Emoji.Types + +import Control.Applicative ((<|>)) +import Data.Attoparsec.Text (Parser (), takeWhile1, takeWhile, string, choice + , notInClass, skipWhile, skipMany, isHorizontalSpace + , decimal, hexadecimal, char, many1, endOfLine) +import Data.Text (Text) +import qualified Data.Text as T +import Data.Word (Word32) + +type EmojiTest = [EmojiTestEntry] + +data EmojiTestEntry + = Group EmojiTestGroupLevel Text [EmojiTestEntry] + | Entry [Word32] EmojiStatus EmojiVersion Text + | Comment Text + deriving (Show, Eq, Ord) + +data EmojiTestGroupLevel + = EmojiTestGroup + | EmojiTestSubgroup + deriving (Show, Eq, Ord, Enum) + +groupLevelText :: EmojiTestGroupLevel -> Text +groupLevelText EmojiTestGroup = "group" +groupLevelText EmojiTestSubgroup = "subgroup" + +notEol :: Char -> Bool +notEol = notInClass "\n" + +skipSpace :: Parser () +skipSpace = skipWhile isHorizontalSpace + +codePointsColumn :: Parser [Word32] +codePointsColumn = many1 (hexadecimal <* char ' ') + +statusColumn :: Parser EmojiStatus +statusColumn = + (string "fully-qualified" >> pure (EmojiStatusCharacter FullyQualified)) <|> + (string "minimally-qualified" >> pure (EmojiStatusCharacter MinimallyQualified)) <|> + (string "unqualified" >> pure (EmojiStatusCharacter Unqualified)) <|> + (string "component" >> pure (EmojiStatusComponent)) + +emojiTestGroup :: EmojiTestGroupLevel -> Parser EmojiTestEntry +emojiTestGroup maxLevel = do + _ <- char '#' + skipSpace + + _ <- string $ groupLevelText maxLevel + _ <- char ':' + skipSpace + + name <- takeWhile1 notEol + skipMany endOfLine + + let groupParser = + if maxLevel == EmojiTestGroup + then [ emojiTestGroup EmojiTestSubgroup ] + else [] + + groupEntries <- many1 . choice $ + groupParser ++ [ emojiTestEntryLine, emojiTestCommentLine ] + + pure $ Group EmojiTestGroup name groupEntries + +-- | Parses the textual representation of an 'EmojiVersion' as found +-- in @emoji-test.txt@. +emojiVersionColumn :: Parser EmojiVersion +emojiVersionColumn = do + _ <- char 'E' + major <- decimal + _ <- char '.' + minor <- decimal + pure $ case major of + 0 -> case minor of + -- E0.0: pre emoji without specific Unicode Version + 0 -> NoEmojiVersion Nothing + -- E0.x: Pre emoji with Unicode Version + _ -> NoEmojiVersion (Just minor) + -- Ex.y: Regular Emoji Version + _ -> EmojiVersion major minor + +emojiTestEntryLine :: Parser EmojiTestEntry +emojiTestEntryLine = do + codePoints <- codePointsColumn + skipSpace + + _ <- string "; " + status <- statusColumn + skipSpace + + _ <- string "# " + skipWhile (notInClass "E") + version <- emojiVersionColumn + skipSpace + + shortName <- takeWhile1 notEol + skipMany endOfLine + + pure $ Entry codePoints status version shortName + +emojiTestCommentLine :: Parser EmojiTestEntry +emojiTestCommentLine = do + _ <- char '#' + skipSpace + text <- takeWhile notEol <* skipMany endOfLine + if "group:" `T.isPrefixOf` text || "subgroup:" `T.isPrefixOf` text + then fail "group, not comment" + else pure $ Comment text + +-- | Parses an entire @emoji-test.txt@ file or a subset of it. +emojiTestFile :: Parser EmojiTest +emojiTestFile = many1 $ + emojiTestGroup EmojiTestGroup <|> emojiTestEntryLine <|> emojiTestCommentLine diff --git a/test/Main.hs b/test/Main.hs index 76942e2..a375968 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -9,7 +9,7 @@ import Test.Tasty.HUnit ((@?=), (@=?)) import Test.SmallCheck.Series import Text.Emoji.Types -import Text.Emoji.DataFiles +import Text.Emoji.DataFiles.EmojiTest import Data.Attoparsec.Text (parseOnly) import Data.Either (fromRight) -- cgit 1.4.1