about summary refs log tree commit diff
diff options
context:
space:
mode:
authorsternenseemann <git@lukasepple.de>2020-04-12 21:03:41 +0200
committersternenseemann <git@lukasepple.de>2020-04-12 21:03:41 +0200
commite81d77b968bbe0b514352606fea163c0fae4668e (patch)
tree48f5bf913fb5f1e8384ef855c718b680eacd0db9
parentbeeb3420354b56dff691a8f4b4f54bb1b683adad (diff)
add parser for emoji-test.txt
-rw-r--r--emoji-generic.cabal6
-rw-r--r--emoji-generic.nix8
-rw-r--r--src/Text/Emoji/DataFiles.hs126
-rw-r--r--test/Main.hs32
4 files changed, 167 insertions, 5 deletions
diff --git a/emoji-generic.cabal b/emoji-generic.cabal
index 96d084d..23362c0 100644
--- a/emoji-generic.cabal
+++ b/emoji-generic.cabal
@@ -17,13 +17,14 @@ extra-source-files:  README.md
 
 library
   exposed-modules:     Text.Emoji
+                     , Text.Emoji.DataFiles
                      , Text.Emoji.Types
                      , Text.Emoji.String
                      , Text.Emoji.Text
   -- other-modules:
   -- other-extensions:
   build-depends:       base >=4.13 && <4.14
-                     , text
+                     , text ^>=1.2.4.0
                      , attoparsec ^>=0.13.2.0
                      , file-embed
                      , utf8-light ^>=0.4.2
@@ -40,4 +41,7 @@ test-suite test
                      , tasty ^>=1.2.3
                      , tasty-smallcheck ^>=0.8.1
                      , smallcheck ^>=1.1.5
+                     , tasty-hunit ^>=0.10.0.2
+                     , attoparsec ^>=0.13.2.0
+                     , text ^>=1.2.4.0
   default-language:    Haskell2010
diff --git a/emoji-generic.nix b/emoji-generic.nix
index 4da50d5..c187749 100644
--- a/emoji-generic.nix
+++ b/emoji-generic.nix
@@ -1,5 +1,5 @@
-{ mkDerivation, attoparsec, base, file-embed, stdenv, tasty
-, tasty-smallcheck, text, utf8-light
+{ mkDerivation, attoparsec, base, file-embed, smallcheck, stdenv
+, tasty, tasty-hunit, tasty-smallcheck, text, utf8-light
 }:
 mkDerivation {
   pname = "emoji-generic";
@@ -8,7 +8,9 @@ mkDerivation {
   libraryHaskellDepends = [
     attoparsec base file-embed text utf8-light
   ];
-  testHaskellDepends = [ tasty tasty-smallcheck ];
+  testHaskellDepends = [
+    base smallcheck tasty tasty-hunit tasty-smallcheck
+  ];
   description = "A generic Emoji library";
   license = stdenv.lib.licenses.lgpl3;
 }
diff --git a/src/Text/Emoji/DataFiles.hs b/src/Text/Emoji/DataFiles.hs
new file mode 100644
index 0000000..0e2755c
--- /dev/null
+++ b/src/Text/Emoji/DataFiles.hs
@@ -0,0 +1,126 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-|
+  Module:      Text.Emoji.Data
+  Description:
+-}
+
+
+module Text.Emoji.DataFiles where
+
+import Prelude hiding (takeWhile)
+
+import Text.Emoji.Types
+
+import Control.Applicative ((<|>))
+import Data.Attoparsec.Text (Parser (..), takeWhile1, takeWhile, string
+                            , notInClass, skipWhile, skipMany, isHorizontalSpace
+                            , decimal, hexadecimal, char, many1, endOfLine)
+import Data.Text (Text)
+import Data.Word (Word32)
+
+type EmojiTest = [EmojiTestEntry]
+
+data EmojiTestEntry
+  = Group EmojiTestGroupLevel Text [EmojiTestEntry]
+  | Entry [Word32] EmojiStatus EmojiVersion Text
+  | Comment Text
+  deriving (Show, Eq, Ord)
+
+data EmojiTestGroupLevel
+  = EmojiTestGroup
+  | EmojiTestSubgroup
+  deriving (Show, Eq, Ord, Enum)
+
+groupLevelText :: EmojiTestGroupLevel -> Text
+groupLevelText EmojiTestGroup = "group"
+groupLevelText EmojiTestSubgroup = "subgroup"
+
+notSpace :: Char -> Bool
+notSpace = notInClass " \t"
+
+notEol :: Char -> Bool
+notEol = notInClass "\n"
+
+skipSpace :: Parser ()
+skipSpace = skipWhile isHorizontalSpace
+
+codePointsColumn :: Parser [Word32]
+codePointsColumn = many1 (hexadecimal <* char ' ')
+
+statusColumn :: Parser EmojiStatus
+statusColumn =
+  (string "fully-qualified" >> pure (EmojiStatusCharacter FullyQualified)) <|>
+  (string "minimally-qualified" >> pure (EmojiStatusCharacter MinimallyQualified)) <|>
+  (string "unqualified" >> pure (EmojiStatusCharacter Unqualified)) <|>
+  (string "component" >> pure (EmojiStatusComponent))
+
+emojiTestGroup :: EmojiTestGroupLevel -> Parser EmojiTestEntry
+emojiTestGroup maxLevel = do
+  char '#'
+  skipSpace
+
+  string $ groupLevelText maxLevel
+  char ':'
+  skipSpace
+
+  name <- takeWhile1 notEol
+  skipMany endOfLine
+
+  let addMiddleParser = if maxLevel == EmojiTestGroup
+                          then (<|> emojiTestGroup EmojiTestSubgroup)
+                          else id
+  groupEntries <- many1
+    (addMiddleParser emojiTestEntryLine <|> emojiTestCommentLine)
+
+  pure $ Group EmojiTestGroup name groupEntries
+
+emojiVersionColumn :: Parser EmojiVersion
+emojiVersionColumn = do
+  char 'E'
+  major <- decimal
+  char '.'
+  minor <- decimal
+  pure $ case major of
+           0 -> case minor of
+                  -- E0.0: pre emoji without specific Unicode Version
+                  0 -> NoEmojiVersion Nothing
+                  -- E0.x: Pre emoji with Unicode Version
+                  _ -> NoEmojiVersion (Just minor)
+           -- Ex.y: Regular Emoji Version
+           _ -> EmojiVersion major minor
+
+emojiTestEntryLine :: Parser EmojiTestEntry
+emojiTestEntryLine = do
+  codePoints <- codePointsColumn
+  skipSpace
+
+  string "; "
+  status <- statusColumn
+  skipSpace
+
+  string "# "
+  skipWhile (notInClass "E")
+  version <- emojiVersionColumn
+  skipSpace
+
+  shortName <- takeWhile1 notEol
+  skipMany endOfLine
+
+  pure $ Entry codePoints status version shortName
+
+emojiTestCommentLine :: Parser EmojiTestEntry
+emojiTestCommentLine = char '#' >> skipSpace >>
+  (((string "group:" <|> string "subgroup:") >> fail "group, not comment") <|>
+    (takeWhile notEol <* skipMany endOfLine) >>= pure . Comment)
+
+emojiTestFile :: Parser EmojiTest
+emojiTestFile = many1 $
+  emojiTestGroup EmojiTestGroup <|> emojiTestEntryLine <|> emojiTestCommentLine
+
+-- | Helper Function that counts number of lines used to parse 'EmojiTest'.
+--   Useful to check against LoC of @emoji-test.txt@ for parser sanity check.
+countLines :: EmojiTest -> Integer
+countLines ((Group _ _ x):xs) = 1 + countLines x + countLines xs
+countLines ((Comment _):xs) = 1 + countLines xs
+countLines ((Entry _ _ _ _):xs) = 1 + countLines xs
+countLines [] = 0
diff --git a/test/Main.hs b/test/Main.hs
index 9493099..3706de8 100644
--- a/test/Main.hs
+++ b/test/Main.hs
@@ -1,17 +1,25 @@
+{-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE FlexibleInstances, MultiParamTypeClasses #-}
 module Main where
 
 import Test.Tasty
 import qualified Test.Tasty.SmallCheck as SC
+import qualified Test.Tasty.HUnit as HU
 import Test.SmallCheck.Series
 
 import Text.Emoji.Types
+import Text.Emoji.DataFiles
+
+import Data.Attoparsec.Text (parseOnly)
+import Data.Either (fromRight)
+import Data.Text (Text)
+import qualified Data.Text as T (unpack)
 
 main :: IO ()
 main = defaultMain tests
 
 tests :: TestTree
-tests = testGroup "Tests" [typesTests]
+tests = testGroup "Tests" [typesTests, parserTests]
 
 typesTests :: TestTree
 typesTests = testGroup "Emoji Types Tests"
@@ -26,3 +34,25 @@ emojiVersionLowestValue :: EmojiVersion -> Bool
 emojiVersionLowestValue v = NoEmojiVersion Nothing <= v
 
 instance Monad m => Serial m EmojiVersion
+
+parserTests :: TestTree
+parserTests = testGroup "Emoji Data File Parser Tests"
+  [ HU.testCase "EmojiVersion Parser parses all versions up till now correctly" emojiVersionParserTest ]
+
+emojiVersionParserTest :: HU.Assertion
+emojiVersionParserTest = do
+  vt "E0.0" (NoEmojiVersion Nothing)
+  vt "E0.6" (NoEmojiVersion (Just 6))
+  vt "E0.7" (NoEmojiVersion (Just 7))
+  vt "E1.0" (EmojiVersion 1 0)
+  vt "E2.0" (EmojiVersion 2 0)
+  vt "E3.0" (EmojiVersion 3 0)
+  vt "E4.0" (EmojiVersion 4 0)
+  vt "E5.0" (EmojiVersion 5 0)
+  vt "E11.0" (EmojiVersion 11 0)
+  vt "E12.0" (EmojiVersion 12 0)
+  vt "E12.1" (EmojiVersion 12 1)
+  vt "E13.0" (EmojiVersion 13 0)
+  where vt :: Text -> EmojiVersion -> HU.Assertion
+        vt str exp = HU.assertBool (T.unpack str) . fromRight False $
+          (== exp) <$> parseOnly emojiVersionColumn str