pty.Parser: parse and filter out CSI escape sequences

Add a parser for CSI escape sequences as specified in ECMA-48 (5th edition). All other escape sequences starting with \033 (like OSC escape sequences) aren't supported yet and will be added to the terminal's buffer as before. No CSI escape sequence generates a dedicated event so far; the only effect of parsing them is that they are omitted from the terminal's output. This makes the output of programs that make heavy use of, e.g., SGR escape sequences much more readable. If the end of input is hit while parsing an escape sequence, we postpone parsing it until the next invocation of parse(). Additionally, we rely on backtracking to deal with unknown and invalid escape sequences: by setting a hint that the sequence is to be ignored and then backtracking, the rest of the parser behaves normally and emits the invalid/unknown sequence as normal text.
author: sterni <sternenseemann@systemli.org> 2021-07-03 16:18:00 +0200
committer: Sören Tempel <soeren+git@soeren-tempel.net> 2021-07-15 13:28:33 +0200
commit: a33f9be15276ebe217fe14fa75f1848533dff4e8 (patch)
tree: fabc9c0bacd4b6e5c0dd1ff84f2481ebb28b8588
parent: 21827d91b3a20480f8c1328e943b060a2c227feb (diff)
1 files changed, 83 insertions, 0 deletions
diff --git a/saneterm/pty.py b/saneterm/pty.py
index 5396d82..d5ed7d3 100644
--- a/saneterm/pty.py
+++ b/saneterm/pty.py
@@ -174,6 +174,30 @@ class PositionedIterator(object):
             self.pos -= 1
             raise StopIteration
 
+def csi_parameter_byte(c):
+    """
+    Return True if the single character c is a CSI parameter byte,
+    i.e. ord(c) is in 0x30..0x3f. See ECMA-48 (5th edition) Section 5.4.
+    """
+    cp = ord(c)
+    return cp >= 0x30 and cp <= 0x3f
+
+def csi_intermediate_byte(c):
+    """
+    Return True if the single character c is a CSI intermediate byte,
+    i.e. ord(c) is in 0x20..0x2f. See ECMA-48 (5th edition) Section 5.4.
+    """
+    cp = ord(c)
+    return cp >= 0x20 and cp <= 0x2f
+
+def csi_final_byte(c):
+    """
+    Return True if the single character c is a CSI final byte,
+    i.e. ord(c) is in 0x40..0x7e. See ECMA-48 (5th edition) Section 5.4.
+    """
+    cp = ord(c)
+    return cp >= 0x40 and cp <= 0x7e
+
 class Parser(object):
     """
     Parses a subset of special control sequences read from
@@ -200,6 +224,13 @@ class Parser(object):
           the bell character '\a' was in the terminal input.
           This usually should trigger the machine to beep
           and/or the window to set the urgent flag.
+
+        Parsed control sequences are guaranteed to never
+        appear in a TEXT event. This is also true for
+        escape sequences which don't cause an event to
+        be generated. This is true for all CSI escape
+        sequences at the moment which are filtered out
+        from saneterm's output in this way.
         """
 
         it = PositionedIterator(self.__leftover + input)
@@ -209,6 +240,11 @@ class Parser(object):
         # we want to emit as a TEXT event
         start = 0
 
+        # this is set by the parser before backtracking if
+        # an ANSI escape sequence should be ignored, e. g.
+        # if we don't support it
+        ignore_esc = False
+
         # we expect a decoded string as input,
         # so we don't need to handle incremental
         # decoding here as well
@@ -228,6 +264,53 @@ class Parser(object):
             if code == '\a':
                 flush_until = it.pos
                 special_ev = (EventType.BELL, None)
+            elif code == '\033':
+                # ignore_esc can be set if we encounter a '\033'
+                # which is followed by a sequence we don't understand.
+                # In that case we'll jump back to the '\033', but just
+                # treat it as if it was an ordinary character.
+                if ignore_esc:
+                    ignore_esc = False
+                else:
+                    flush_until = it.pos
+
+                    # if parsing fails we'll return to this point
+                    it.waypoint()
+
+                    try:
+                        if it.next() == '[':
+                            # CSI sequence
+                            try:
+                                params = it.takewhile_greedy(csi_parameter_byte)
+                                inters = it.takewhile_greedy(csi_intermediate_byte)
+                                final = it.next()
+
+                                assert csi_final_byte(final)
+
+                            except AssertionError:
+                                # invalid CSI sequence, we'll render it as text for now
+                                ignore_esc = True
+
+                        else:
+                            # we only parse CSI sequences for now, all other
+                            # sequences will be rendered as text to the terminal.
+                            # This probably should change in the future since
+                            # we also want to filter out, e. g. OSC sequences
+                            ignore_esc = True
+
+                        # with only backtracks if the end of input is
+                        # reached, so we do need to do it explicitly here.
+                        if ignore_esc:
+                            it.backtrack()
+
+                    except StopIteration:
+                        # the full escape sequence wasn't contained in
+                        # this chunk of input, so we'll parse it next time.
+                        # Since we flush up to the escape sequence, we know
+                        # where it started. The parser loop will exit at the
+                        # end of this iteration because the iterator is
+                        # exhausted.
+                        self.__leftover = it.wrapped[flush_until:]
 
             # at the end of input, flush if we aren't already
             if flush_until == None and it.empty():
author	sterni <sternenseemann@systemli.org>	2021-07-03 16:18:00 +0200
committer	Sören Tempel <soeren+git@soeren-tempel.net>	2021-07-15 13:28:33 +0200
commit	a33f9be15276ebe217fe14fa75f1848533dff4e8 (patch)
tree	fabc9c0bacd4b6e5c0dd1ff84f2481ebb28b8588
parent	21827d91b3a20480f8c1328e943b060a2c227feb (diff)