src/Text/Emoji/DataFiles.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

{-# LANGUAGE OverloadedStrings #-}
{-|
  Module:      Text.Emoji.DataFiles
  Description: Parsers for the Unicode @emoji-test.txt@ data file.
-}


module Text.Emoji.DataFiles where

import Prelude hiding (takeWhile)

import Text.Emoji.Types

import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser (..), takeWhile1, takeWhile, string, choice
                            , notInClass, skipWhile, skipMany, isHorizontalSpace
                            , decimal, hexadecimal, char, many1, endOfLine)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Word (Word32)

-- | Result of parsing a whole test file with 'emojiTestFile': the list of
--   its top-level entries.
type EmojiTest = [EmojiTestEntry]

-- | A single item produced by the parsers in this module.
data EmojiTestEntry
  = Group EmojiTestGroupLevel Text [EmojiTestEntry]
  -- ^ A group header: header level, the name following the colon, and the
  --   entries parsed beneath the header.
  | Entry [Word32] EmojiStatus EmojiVersion Text
  -- ^ A data line: code points, status, emoji version, and the remaining
  --   text of the line (bound as @shortName@ in 'emojiTestEntryLine').
  | Comment Text
  -- ^ A @#@ line that is not a group header; holds the text after the @#@
  --   and any horizontal whitespace following it.
  deriving (Show, Eq, Ord)

-- | Nesting level of a group header. 'groupLevelText' gives the keyword
--   that each level uses in the file.
data EmojiTestGroupLevel
  = EmojiTestGroup
  -- ^ Header introduced by the keyword @group@.
  | EmojiTestSubgroup
  -- ^ Header introduced by the keyword @subgroup@.
  deriving (Show, Eq, Ord, Enum)

-- | Keyword that precedes the colon in a header line of the given level.
groupLevelText :: EmojiTestGroupLevel -> Text
groupLevelText level =
  case level of
    EmojiTestGroup    -> "group"
    EmojiTestSubgroup -> "subgroup"

-- | 'True' for every character except the space and the horizontal tab.
notSpace :: Char -> Bool
notSpace ch = ch `notElem` [' ', '\t']

-- | 'True' for every character except the line feed @\\n@.
notEol :: Char -> Bool
notEol = (/= '\n')

-- | Consume a (possibly empty) run of characters accepted by
--   'isHorizontalSpace' and discard it. Never fails.
skipSpace :: Parser ()
skipSpace = () <$ takeWhile isHorizontalSpace

-- | Parse one or more hexadecimal numbers, each of which must be followed
--   by exactly one space character that is consumed as well.
codePointsColumn :: Parser [Word32]
codePointsColumn = many1 codePoint
  where
    codePoint :: Parser Word32
    codePoint = do
      value <- hexadecimal
      _ <- char ' '
      pure value

-- | Parse one of the four status keywords and map it to its 'EmojiStatus'.
--   The alternatives are tried in the order written.
statusColumn :: Parser EmojiStatus
statusColumn =
      EmojiStatusCharacter FullyQualified     <$ string "fully-qualified"
  <|> EmojiStatusCharacter MinimallyQualified <$ string "minimally-qualified"
  <|> EmojiStatusCharacter Unqualified        <$ string "unqualified"
  <|> EmojiStatusComponent                    <$ string "component"

-- | Parse a header line of the form @# \<keyword\>: \<name\>@ followed by the
--   lines that belong to it, where the keyword is 'groupLevelText' of the
--   requested level.
--
--   The body must contain at least one line. For 'EmojiTestGroup' the body
--   may contain subgroups, entry lines and comment lines; for
--   'EmojiTestSubgroup' only entry lines and comment lines are accepted.
--
--   The resulting 'Group' is tagged with the level that was requested, so
--   subgroups are reported as 'EmojiTestSubgroup'.
emojiTestGroup :: EmojiTestGroupLevel -> Parser EmojiTestEntry
emojiTestGroup maxLevel = do
  _ <- char '#'
  skipSpace

  _ <- string $ groupLevelText maxLevel
  _ <- char ':'
  skipSpace

  name <- takeWhile1 notEol
  -- Also swallows any blank lines that follow the header.
  skipMany endOfLine

  -- Only a top-level group may contain nested subgroups.
  let subgroupParsers =
        [ emojiTestGroup EmojiTestSubgroup | maxLevel == EmojiTestGroup ]

  groupEntries <- many1 . choice $
    subgroupParsers ++ [ emojiTestEntryLine, emojiTestCommentLine ]

  -- Use the level actually parsed instead of a hard-coded 'EmojiTestGroup'.
  pure $ Group maxLevel name groupEntries

-- | Parse a version tag of the form @E\<major\>.\<minor\>@, both parts being
--   decimal numbers.
--
--   A major number of zero yields 'NoEmojiVersion': @E0.0@ carries
--   'Nothing', any other @E0.x@ carries @Just x@. Every other major number
--   yields a regular 'EmojiVersion'.
emojiVersionColumn :: Parser EmojiVersion
emojiVersionColumn =
  toVersion <$> (char 'E' *> decimal) <*> (char '.' *> decimal)
  where
    -- E0.0: pre emoji without specific Unicode Version
    toVersion 0 0 = NoEmojiVersion Nothing
    -- E0.x: pre emoji with Unicode Version
    toVersion 0 minor = NoEmojiVersion (Just minor)
    -- Ex.y: regular Emoji Version
    toVersion major minor = EmojiVersion major minor

-- | Parse one data line into an 'Entry'.
--
--   The columns are read in order: the code points, the literal @"; "@ and
--   the status, the literal @"# "@ followed by everything up to the first
--   @E@ (skipped) and the version tag, and finally the rest of the line.
--   Horizontal whitespace after each of the first three columns and any
--   line breaks after the line are consumed.
emojiTestEntryLine :: Parser EmojiTestEntry
emojiTestEntryLine =
  Entry
    <$> (codePointsColumn <* skipSpace)
    <*> (string "; " *> statusColumn <* skipSpace)
    <*> (string "# " *> skipToVersion *> emojiVersionColumn <* skipSpace)
    <*> (takeWhile1 notEol <* skipMany endOfLine)
  where
    -- Drop everything in front of the first 'E'.
    skipToVersion = skipWhile (notInClass "E")

-- | Parse a @#@ line into a 'Comment' holding the text after the @#@ and
--   any horizontal whitespace following it. Trailing line breaks are
--   consumed.
--
--   Fails when that text starts with @group:@ or @subgroup:@, so that such
--   lines are left to 'emojiTestGroup'.
emojiTestCommentLine :: Parser EmojiTestEntry
emojiTestCommentLine = do
  body <- char '#' *> skipSpace *> takeWhile notEol
  skipMany endOfLine
  let isHeader = any (`T.isPrefixOf` body) ["group:", "subgroup:"]
  if isHeader
    then fail "group, not comment"
    else pure (Comment body)

-- | Parse one or more top-level items. At each position a top-level group
--   is tried first, then an entry line, then a comment line.
emojiTestFile :: Parser EmojiTest
emojiTestFile = many1 topLevelItem
  where
    topLevelItem =
          emojiTestGroup EmojiTestGroup
      <|> emojiTestEntryLine
      <|> emojiTestCommentLine

-- | Count the items of an 'EmojiTest': every group header, comment and
--   entry contributes one, including those nested inside groups.
--   Meant as a sanity check of the parser against the number of lines in
--   @emoji-test.txt@.
countLines :: EmojiTest -> Integer
countLines = sum . map linesOf
  where
    -- A group accounts for its header line plus everything nested in it.
    linesOf (Group _ _ children) = 1 + countLines children
    linesOf (Comment _)          = 1
    linesOf Entry{}              = 1