about summary refs log tree commit diff
path: root/src/Text/Emoji/DataFiles/EmojiTest.hs
blob: 9dd2428d46c794ff1cc1bd4d3b6d90a9eae4e0c5 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
{-# LANGUAGE OverloadedStrings #-}
{-|
  Module:      Text.Emoji.DataFiles.EmojiTest
  Description: Parsers and Utilities related to the @emoji-test.txt@ file.

  This module can be utilized to parse the @emoji-test.txt@ data file
  provided by the Unicode Consortium. This file contains a representation
  of a more or less typical emoji keyboard which means you get:

  * a list of valid emojis (qualified) plus unqualified ones,
    emoji components, you can expect to find in the wild or
    may want to input
  * an organization of those into groups and subgroups (like in a keyboard)
-}


module Text.Emoji.DataFiles.EmojiTest
  ( -- * Representation of the parsed file
    EmojiTest ()
  , EmojiTestEntry (..)
  , EmojiTestGroupLevel (..)
    -- * Attoparsec parsers
  , emojiVersionColumn
  , emojiTestFile
  , manyEmojiTestEntries
  ) where

import Prelude hiding (takeWhile)

import Text.Emoji.Types

import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser (), takeWhile1, takeWhile, string, choice
                            , notInClass, skipWhile, skipMany, isHorizontalSpace
                            , decimal, hexadecimal, char, many1, endOfLine, endOfInput)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Word (Word32)

type EmojiTest = [EmojiTestEntry]

data EmojiTestEntry
  = Group EmojiTestGroupLevel Text [EmojiTestEntry]
  | Entry [Word32] EmojiStatus EmojiVersion Text
  | Comment Text
  deriving (Show, Eq, Ord)

data EmojiTestGroupLevel
  = EmojiTestGroup
  | EmojiTestSubgroup
  deriving (Show, Eq, Ord, Enum)

groupLevelText :: EmojiTestGroupLevel -> Text
groupLevelText EmojiTestGroup = "group"
groupLevelText EmojiTestSubgroup = "subgroup"

notEol :: Char -> Bool
notEol = notInClass "\n"

skipSpace :: Parser ()
skipSpace = skipWhile isHorizontalSpace

codePointsColumn :: Parser [Word32]
codePointsColumn = many1 (hexadecimal <* char ' ')

statusColumn :: Parser EmojiStatus
statusColumn =
  (string "fully-qualified" >> pure (EmojiStatusCharacter FullyQualified)) <|>
  (string "minimally-qualified" >> pure (EmojiStatusCharacter MinimallyQualified)) <|>
  (string "unqualified" >> pure (EmojiStatusCharacter Unqualified)) <|>
  (string "component" >> pure (EmojiStatusComponent))

emojiTestGroup :: EmojiTestGroupLevel -> Parser EmojiTestEntry
emojiTestGroup maxLevel = do
  _ <- char '#'
  skipSpace

  _ <- string $ groupLevelText maxLevel
  _ <- char ':'
  skipSpace

  name <- takeWhile1 notEol
  skipMany endOfLine

  let groupParser =
        if maxLevel == EmojiTestGroup
          then [ emojiTestGroup EmojiTestSubgroup ]
          else []

  groupEntries <- many1 . choice $
    groupParser ++ [ emojiTestEntryLine, emojiTestCommentLine ]

  pure $ Group EmojiTestGroup name groupEntries

-- | Parses the textual representation of an 'EmojiVersion' as found
--   in @emoji-test.txt@.
emojiVersionColumn :: Parser EmojiVersion
emojiVersionColumn = do
  _ <- char 'E'
  major <- decimal
  _ <- char '.'
  minor <- decimal
  pure $ case major of
           0 -> case minor of
                  -- E0.0: pre emoji without specific Unicode Version
                  0 -> NoEmojiVersion Nothing
                  -- E0.x: Pre emoji with Unicode Version
                  _ -> NoEmojiVersion (Just minor)
           -- Ex.y: Regular Emoji Version
           _ -> EmojiVersion major minor

emojiTestEntryLine :: Parser EmojiTestEntry
emojiTestEntryLine = do
  codePoints <- codePointsColumn
  skipSpace

  _ <- string "; "
  status <- statusColumn
  skipSpace

  _ <- string "# "
  skipWhile (notInClass "E")
  version <- emojiVersionColumn
  skipSpace

  shortName <- takeWhile1 notEol
  skipMany endOfLine

  pure $ Entry codePoints status version shortName

emojiTestCommentLine :: Parser EmojiTestEntry
emojiTestCommentLine = do
  _ <- char '#'
  skipSpace
  text <- takeWhile notEol <* skipMany endOfLine
  if "group:" `T.isPrefixOf` text || "subgroup:" `T.isPrefixOf` text
    then fail "group, not comment"
    else pure $ Comment text

-- | Parses an entire @emoji-test.txt@ file and consumes the end of input.
emojiTestFile :: Parser EmojiTest
emojiTestFile = manyEmojiTestEntries <* endOfInput

-- | Parses one or more 'EmojiTestEntries'.
manyEmojiTestEntries :: Parser EmojiTest
manyEmojiTestEntries = many1 $
  emojiTestGroup EmojiTestGroup <|> emojiTestEntryLine <|> emojiTestCommentLine