Source code for parce.tree

# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.


"""
This module defines the tree structure a text is parsed into.

A tree consists of Context and Token objects. (Both inherit from the base
class Node, which defines the shared methods and properties.)

A :class:`Context` is a list containing Tokens and other Contexts. A Context is
created when a lexicon becomes active. A Context knows its parent Context and
its lexicon.

A :class:`Token` represents one parsed piece of text. A Token is created when a
rule in the lexicon matches. A Token knows its parent Context, its position in
the text and the action that was specified in the rule.

A Context is always non-empty, except for the root Context, which represents
the root lexicon and can be empty if the document did not generate a single
token.

The tree structure is easy to navigate, no special objects or iterators are
necessary for that. To find a token at a certain position in a context, use
:meth:`Context.find_token` and its relatives. From every node you can iterate
:meth:`~Node.forward` and :meth:`~Node.backward`. Use the methods like
:meth:`~Node.left_siblings` and :meth:`~Node.right_siblings` to traverse the
current context.

"""


import itertools
import reprlib
import weakref

from parce import util
from parce import query
from parce.lexicon import Lexicon


DUMP_STYLES = {
    "ascii":   (" | ", "   ", " |-", " `-"),
    "round":   (" │ ", "   ", " ├╴", " ╰╴"),
    "square":  (" │ ", "   ", " ├╴", " └╴"),
    "double":  (" ║ ", "   ", " ╠═", " ╚═"),
    "thick":   (" ┃ ", "   ", " ┣╸", " ┗╸"),
    "flat":    ("│", " ", "├", "╰"),
}

DUMP_STYLE_DEFAULT = "round"


[docs]class Node: """Methods that are shared by Token and Context.""" __slots__ = ('__weakref__',) is_token = False is_context = False @property def parent(self): """The parent Context (or None; uses a weak reference).""" return self._parent() @parent.setter def parent(self, parent): """Set the parent (to a Context or None).""" self._parent = weakref.ref(parent) if parent is not None else lambda: None @parent.deleter def parent(self): """Set the parent to None.""" self._parent = lambda: None
[docs] def copy(self, parent=None): """Return a copy of the Node, but with the specified parent.""" raise NotImplementedError
[docs] def dump(self, file=None, style=None, depth=0): """Display a graphical representation of the node and its contents. The file object defaults to stdout, and the style to "round". You can choose any style that's in the ``DUMP_STYLES`` dictionary. """ i = 2 d = DUMP_STYLES[style or DUMP_STYLE_DEFAULT] prefix = [] node = self for _ in range(depth): prefix.append(d[i + int(node.is_last())]) node = node.parent i = 0 print("".join(reversed(prefix)) + repr(self), file=file) if self.is_context: for n in self: n.dump(file, style, depth + 1)
@property def pwd(self): """Show the ancestry, for debugging purposes.""" nodes = [self] nodes.extend(self.ancestors()) nodes.reverse() d = DUMP_STYLES[DUMP_STYLE_DEFAULT] for n, node in enumerate(nodes): print(''.join(( d[1] * max(0, n-1), d[3] if n else '', repr(node), " [{}]".format(nodes[n-1].index(node)) if n else '', )))
[docs] def parent_index(self): """Return our index in the parent. This is recommended above using parent.index(self), because this method finds our index using a binary search on position, while the latter is a linear search, which is certainly slower with a large number of children. """ p = self.parent pos = self.pos lo = 0 hi = len(p) while lo < hi: mid = (lo + hi) // 2 n = p[mid] if n.pos < pos: lo = mid + 1 elif n is self: return mid else: hi = mid return lo
[docs] def root(self): """Return the root node.""" root = self for root in self.ancestors(): pass return root
[docs] def is_last(self): """Return True if this Node is the last child of its parent. Fails if called on the root element. """ return self.parent[-1] is self
[docs] def is_first(self): """Return True if this Node is the first child of its parent. Fails if called on the root element. """ return self.parent[0] is self
[docs] def is_ancestor_of(self, node): """Return True if this Node is an ancestor of the other Node.""" for n in node.ancestors(): if n is self: return True return False
[docs] def ancestors(self, upto=None): """Climb the tree up over the parents. If upto is given and it is one of the ancestors, stop after yielding that ancestor. Otherwise iteration stops at the root node. """ node = self.parent if upto and upto.parent is not None: p = upto.parent while node is not None and node is not p: yield node node = node.parent else: while node is not None: yield node node = node.parent
[docs] def ancestors_with_index(self, upto=None): """Yield the ancestors(upto), and the index of each node in the parent.""" n = self for p in self.ancestors(upto): yield p, n.parent_index() n = p
[docs] def common_ancestor(self, other): """Return the common ancestor with the Context or Token.""" ancestors = [] if self.is_context: ancestors.append(self) ancestors.extend(self.ancestors()) if other.is_context and other in ancestors: return other for n in other.ancestors(): if n in ancestors: return n
[docs] def depth(self): """Return the number of ancestors.""" return sum(1 for n in self.ancestors())
[docs] def left_sibling(self): """Return the left sibling of this node, if any. Does not descend in child nodes or ascend upto the parent. Fails if called on the root node. """ if self.parent[0] is not self: i = self.parent_index() return self.parent[i-1]
[docs] def right_sibling(self): """Return the right sibling of this node, if any. Does not descend in child nodes or ascend upto the parent. Fails if called on the root node. """ if self.parent[-1] is not self: i = self.parent_index() return self.parent[i+1]
[docs] def left_siblings(self): """Yield the left siblings of this node in reverse order, if any. Does not descend in child nodes or ascend upto the parent. Fails if called on the root node. """ if self.parent[0] is not self: i = self.parent_index() yield from self.parent[i-1::-1]
[docs] def right_siblings(self): """Yield the right siblings of this node, if any. Does not descend in child nodes or ascend upto the parent. Fails if called on the root node. """ if self.parent[-1] is not self: i = self.parent_index() yield from self.parent[i+1:]
[docs] def next_token(self): """Return the following Token, if any.""" for t in self.forward(): return t
[docs] def previous_token(self): """Return the preceding Token, if any.""" for t in self.backward(): return t
[docs] def forward(self, upto=None): """Yield all Tokens in forward direction, starting at the right sibling. Descends into child Contexts, and ascends into parent Contexts. If upto is given, does not ascend above that context. """ for parent, index in self.ancestors_with_index(upto): yield from util.tokens(parent[index+1:])
[docs] def backward(self, upto=None): """Yield all Tokens in backward direction, starting at the left sibling. Descends into child Contexts, and ascends into parent Contexts. If upto is given, does not ascend above that context. """ for parent, index in self.ancestors_with_index(upto): if index: yield from util.tokens(parent[:index], True)
@property def query(self): """Query this node in different ways; see the :mod:`~parce.query` module.""" def gen(): yield self return query.Query(gen)
[docs] def delete(self): """Remove this node from its parent. If the parent would become empty, it is removed too. Returns the first non-empty ancestor. """ for parent, index in self.ancestors_with_index(): del parent[index] if len(parent): return parent
[docs]class Token(Node): """A Token instance represents a lexed piece of text. When a pattern rule in a lexicon matches the text, a Token is created. When that rule would create more than one Token from a single regular expression match, GroupToken objects are created instead, carrying the index of the token in the group in the `group` attribute. The `group` attribute is readonly None for normal tokens. GroupTokens are thus always adjacent in the same context. If you want to retokenize text starting at some position, be sure you are at the start of a grouped token, e.g.:: t = ctx.find_token(45) if t.group: for t in t.left_siblings(): if not t.group: break pos = t.pos Alternatively, you can use the `GroupToken.get_group_*` methods. (A GroupToken is just a normal Token otherwise, the reason a subclass was created is that the group attribute is unused in by far the most tokens, so it does not use any memory. You never need to reference the GroupToken class; just test the group attribute if you want to know if a token belongs to a group that originated from a single match.) When iterating over the children of a Context (which may be Context or Token instances), you can use the `is_token` attribute to determine whether the node child is a token, which is easier than to call `isinstance(t, Token)` each time. From a token, you can iterate `forward()` or `backward()` to find adjacent tokens. If you only want to stay in the current context, use the various sibling methods, such as `right_sibling()`. By traversing the `ancestors()` of a token or context, you can find which lexicons created the tokens. You can compare a Token instance with a string. Instead of:: if token.text == "bla": do_something() you can do:: if token == "bla": do_something() You can call `len()` on a token, which returns the length of the token's text attribute, and you can use the string format method to embed the token's text in another string:: s = "blabla {}".format(token) A token always has a parent, and that parent is always a Context instance. """ __slots__ = "_parent", "pos", "text", "action" is_token = True #: Always True for Token def __init__(self, parent, pos, text, action): self.parent = parent #: The Context node to which the token was added self.pos = pos #: The position in the original text self.text = text #: The text of this token self.action = action #: The action specified by the lexicon rule that created the token @property def end(self): """The end position of this token in the original text.""" return self.pos + len(self.text) group = None #: Always None for Token, an integer for :class:`GroupToken`
[docs] def copy(self, parent=None): """Return a copy of the Token, but with the specified parent.""" return type(self)(parent, self.pos, self.text, self.action)
[docs] def equals(self, other): """Return True if the other Token has the same ``text`` and ``action`` attributes and the same context ancestry (see also :meth:`state_matches`). Note that the ``pos`` attribute is not compared. """ return (self.text == other.text and self.action == other.action and self.state_matches(other))
[docs] def state_matches(self, other): """Return True if the other Token has the same lexicons in the ancestors.""" if other is self: return True for c1, c2 in zip(self.ancestors(), other.ancestors()): if c1 is c2: return True elif c1.lexicon is not c2.lexicon: return False return c1.parent is None and c2.parent is None
def __repr__(self): text = reprlib.repr(self.text) return "<Token {} at {}:{} ({})>".format(text, self.pos, self.end, self.action) def __hash__(self): return Node.__hash__(self) def __eq__(self, other): if isinstance(other, str): return other == self.text return other is self def __ne__(self, other): if isinstance(other, str): return other != self.text return other is not self def __format__(self, formatstr): return self.text.__format__(formatstr) def __len__(self): return len(self.text)
[docs] def forward_including(self, upto=None): """Yield all tokens in forward direction, including self.""" yield self yield from self.forward(upto)
[docs] def backward_including(self, upto=None): """Yield all tokens in backward direction, including self.""" yield self yield from self.backward(upto)
[docs] def forward_until_including(self, other): """Yield all tokens starting with us and upto and including the other.""" r = self.range(other) if r: yield from r.tokens()
[docs] def common_ancestor_with_trail(self, other): """Return a three-tuple(context, trail_self, trail_other). The context is the common ancestor such as returned by common_ancestor, if any. trail_self is a tuple of indices from the common ancestor upto self, and trail_other is a tuple of indices from the same ancestor upto the other Token. If there is no common ancestor, all three are None. But normally, all nodes share the root context, so that will normally be the upmost common ancestor. """ if other is self: i = self.parent_index() return self.parent, (i,), (i,) if other.pos > self.pos: s_ancestors, s_indices = zip(*self.ancestors_with_index()) o_indices = [] for n, i in other.ancestors_with_index(): o_indices.append(i) try: s_i = s_ancestors.index(n) except ValueError: continue return n, s_indices[s_i::-1], o_indices[::-1] return None, None, None
[docs] def range(self, other): """Return a :class:`Range` from this token upto and including the other. Returns None if the other :class:`Token` does not belong to the same tree. """ context, start_trail, end_trail = self.common_ancestor_with_trail(other) if context: return Range(context, start_trail, end_trail)
[docs]class GroupToken(Token): """A Token class that allows setting the `group` attribute. For normal Token instances, `group` is a class attribute that is always None. For Tokens that belong to a group, i.e. originated from a single regular expression match, the `group` attribute is the index of the token in the group of tokens that were created together. The last token in the group has a negative value, so it can be recognized as the last. For example, tokens of a three-group have the indices 0, 1 and -2. The methods :meth:`get_group`, :meth:`get_group_start` and :meth:`get_group_end` can only be reliably used when there are no tokens deleted from the tree, and when the tokens really have a parent. """ __slots__ = "group", def __init__(self, group, parent, pos, text, action): self.group = group #: The index of this token in a group (negated for the last token in a group) super().__init__(parent, pos, text, action)
[docs] def copy(self, parent=None): """Return a copy of the Token, but with the specified parent.""" return type(self)(self.group, parent, self.pos, self.text, self.action)
[docs] @classmethod def make_group(cls, parent, lexemes): """Create a tuple of GroupTokens for the lexemes.""" group = tuple(cls(n, parent, *t) for n, t in enumerate(lexemes)) group[-1].group *= -1 return group
[docs] def get_group(self): """Return the whole group this token belongs to as a list.""" p = self.parent i = j = self.parent_index() z = len(p) - 1 if self.group < 0: # we are at the last i += self.group else: i -= self.group j += 1 while j < z and p[j].group > 0: j += 1 return p[i:j+1]
[docs] def get_group_start(self): """Return the first token of the group this token belongs to.""" i = self.parent_index() if self.group < 0: i += self.group else: i -= self.group return self.parent[i]
[docs] def get_group_end(self): """Return the last token of the group this token belongs to.""" p = self.parent i = self.parent_index() z = len(p) - 1 if self.group >= 0: i += 1 while i < z and p[i].group > 0: i += 1 return p[i]
[docs]class Context(list, Node): """A Context represents a list of tokens and contexts. The lexicon that created the tokens is in the `lexicon` attribute. If a pattern rule jumps to another lexicon, a sub-Context is created and tokens are added there. If that lexicon pops back to the current one, new tokens can appear after the sub-context. (So the token that caused the jump to the sub-context normally preceeds the context it created.) A context has a `parent` attribute, which can point to an enclosing context. The root context has `parent` None. When iterating over the children of a Context (which may be Context or Token instances), you can use the `is_context` attribute to determine whether the node child is a context, which is easier than to call `isinstance(node, Context)` each time. You can quickly find tokens in a context, based on text:: if "bla" in context: # etc Or child contexts, based on lexicon:: if MyLanguage.lexicon in context: # etc And if you want to know which token is on a certain position in the text, use e.g.:: context.find_token(45) which, using a bisection algorithm, quickly returns the token, which might be in any sub-context of the current context. """ __slots__ = "lexicon", "_parent" is_context = True #: Always True for Context def __new__(cls, lexicon, parent): return list.__new__(cls) def __init__(self, lexicon, parent): self.lexicon = lexicon #: The lexicon this context was instantiated with. self.parent = parent def __repr__(self): pos, end = self.pos, self.end if pos == end: pos = end = "?" # both are 0 in this case: empty Context name = self.lexicon and repr(self.lexicon) children = "child" if len(self) == 1 else "children" return "<Context {} at {}-{} ({} {})>".format( name, pos, end, len(self), children) def __hash__(self): return Node.__hash__(self) def __eq__(self, other): if isinstance(other, Lexicon): return self.lexicon == other return other is self def __ne__(self, other): if isinstance(other, Lexicon): return self.lexicon != other return other is not self @property def ls(self): """List the contents of this Context, for debugging purposes.""" for i, n in enumerate(self): print("[{}] {}".format(i, repr(n)))
[docs] def copy(self, parent=None): """Return a copy of the context, but with the specified parent.""" # a non-recursive implementation due to Python's recursion limits copy = copy_root = type(self)(self.lexicon, parent) n = self i = 0 while True: z = len(n) while i < z: m = n[i] if m.is_context: copy.append(type(m)(m.lexicon, copy)) copy = copy[-1] i = 0 n = m break else: copy.append(m.copy(copy)) i += 1 else: if copy is copy_root: break n = n.parent copy = copy.parent i = len(copy) return copy_root
@property def pos(self): """Return the position or our first token. Returns 0 if empty.""" try: node = self[0] while node.is_context: node = node[0] return node.pos except IndexError: return 0 @property def end(self): """Return the end position or our last token. Returns 0 if empty.""" try: node = self[-1] while node.is_context: node = node[-1] return node.end except IndexError: return 0
[docs] def is_root(self): """Return True if this Context has no parent node.""" return self.parent is None
[docs] def height(self): """Return the height of the tree (the longest distance to a descendant).""" if not self: return 0 stack = [] height = 0 i = 0 n = self while True: for i in range(i, len(n)): m = n[i] if m.is_context: stack.append(i) height = max(height, len(stack)) i = 0 n = m break else: if stack: n = n.parent i = stack.pop() + 1 else: return height + 1
[docs] def tokens(self, reverse=False): """Yield all Tokens, descending into nested Contexts. If ``reverse`` is set to True, yield all tokens in backward direction. """ children = reversed if reverse else iter stack = [] gen = children(self) while True: for n in gen: if n.is_token: yield n else: stack.append(gen) gen = children(n) break else: if stack: gen = stack.pop() else: break
[docs] def first_token(self): """Return our first Token.""" try: node = self[0] while node.is_context: node = node[0] return node except IndexError: pass
[docs] def last_token(self): """Return our last token.""" try: node = self[-1] while node.is_context: node = node[-1] return node except IndexError: pass
[docs] def find(self, pos): """Return the index of our child at (or to the right of) pos. Returns -1 if there is no such child. """ i = 0 hi = l = len(self) while i < hi: mid = (i + hi) // 2 n = self[mid] if n.end <= pos: i = mid + 1 else: hi = mid return -1 if i == l else i
[docs] def find_context(self, pos): """Return the younghest Context at position (or self).""" node = self i = self.find(pos) if i != -1: n = node[i] while n.is_context and n.pos <= pos: node = n n = n[n.find(pos)] return node
[docs] def find_token(self, pos): """Return the Token at or to the right of position. Returns None if there is no such token. """ i = self.find(pos) if i != -1: n = self[i] while n.is_context: n = n[n.find(pos)] return n
[docs] def find_token_with_trail(self, pos): """Return the Token at or to the right of position, and the trail of indices. The trail is the list of indices where the token was found. Returns (None, None) if there is no such token. Here is an example:: >>> import parce >>> tree = parce.root(parce.find('css'), open('parce/themes/default.css').read()) >>> tree.find_token_with_trail(600) (<Token ' Selected te...ow has focus ' at 566:607 (Comment)>, [21, 0]) >>> tree[21][0] <Token ' Selected te...ow has focus ' at 566:607 (Comment)> """ i = self.find(pos) if i != -1: n = self[i] trail = [i] while n.is_context: i = n.find(pos) n = n[i] trail.append(i) return n, trail return None, None
[docs] def find_left(self, pos): """Return the index of our child at or to the left of pos. Returns -1 if there is no such child. """ i = 0 hi = len(self) while i < hi: mid = (i + hi) // 2 n = self[mid] if n.pos < pos: i = mid + 1 else: hi = mid return i - 1
[docs] def find_token_left(self, pos): """Return the Token at or to the left of position. Returns None if there is no such token. """ i = self.find_left(pos) if i != -1: n = self[i] while n.is_context: n = n[n.find_left(pos)] return n
[docs] def find_token_left_with_trail(self, pos): """Return the Token at or to the left of position, and the trail of indices. Returns (None, None) if there is no such token. """ i = self.find_left(pos) if i != -1: n = self[i] trail = [i] while n.is_context: i = n.find_left(pos) n = n[i] trail.append(i) return n, trail return None, None
[docs] def find_token_after(self, pos): """Return the first token completely right from pos. Returns None if there is no token right from pos. """ node = self while True: i = 0 hi = l = len(node) while i < hi: mid = (i + hi) // 2 n = node[mid] if n.is_context: n = n.last_token() if n.pos < pos: i = mid + 1 else: hi = mid if i >= l: return node = node[i] if node.is_token: return node
[docs] def find_token_before(self, pos): """Return the last token completely left from pos. Returns None if there is no token left from pos. """ node = self while True: i = 0 hi = len(node) while i < hi: mid = (i + hi) // 2 n = node[mid] if n.is_context: n = n.first_token() if pos < n.end: hi = mid else: i = mid + 1 if i == 0: return node = node[i-1] if node.is_token: return node
[docs] def range(self, start=0, end=None): """Return a :class:`Range`. The ancestor of the range is the common ancestor of the tokens found at start and end (or the context itself if start or end fall outside this context). If start is 0 and end is None, the range encompasses the full context. Returns None if this context is empty. """ return Range.from_tree(self, start, end)
[docs]class Range: """A Range denotes a range of a tree structure. A range is defined by an ancestor context and possibly empty lists pointing to the start and end token, if specified. If both trails are not specified, the range encompasses the full context. """ def __init__(self, ancestor, start_trail=None, end_trail=None): self.ancestor = ancestor #: The specified ancestor self.start_trail = start_trail or [] #: The specified start trail (empty list by default) self.end_trail = end_trail or [] #: The specified end trail (empty list by default) def __repr__(self): return "<{} {} [{}:{}]>".format(type(self).__name__, self.ancestor.lexicon, self.pos, self.end) @property def pos(self): """The position of the first token in our range.""" n = self.ancestor for i in self.start_trail: n = n[i] return n.pos @property def end(self): """The end position of the last token in our range.""" n = self.ancestor for i in self.end_trail: n = n[i] return n.end
[docs] @classmethod def from_tree(cls, tree, start=0, end=None): """Create a Range. The ancestor is the common ancestor of the tokens found at start and end (or the tree itself if start or end fall outside the range of the tree). If start is 0 and end is None, the range encompasses the full tree. Returns None if the tree is empty. """ if not tree: return # empty context = tree if end is not None and end < tree.end: if end <= start: return end_trail = tree.find_token_left_with_trail(end)[1] if not end_trail: return else: end_trail = [] if start > 0: start_trail = tree.find_token_with_trail(start)[1] if not start_trail: return if end_trail: # find the youngest common ancestor for n, (i, j) in enumerate(zip(start_trail, end_trail)): if i != j or context[i].is_token: break context = context[i] if n: del start_trail[:n] del end_trail[:n] else: start_trail = [] return cls(context, start_trail, end_trail)
[docs] def slices(self, target_factory=None): """Yield (context, slice) tuples. The yielded slices include the tokens at the end of start and end trail. If you specify a ``target_factory``, it should be a :class:`~.target.TargetFactory` object, and it will be updated along with the yielded slices. """ if self.start_trail: start = self.start_trail[0] if len(self.start_trail) > 1: ancestors = [] n = self.ancestor[start] for i in self.start_trail[1:]: ancestors.append((n, i)) n = n[i] yield ancestors[-1][0], slice(i, None) # include start token for p, i in ancestors[-2::-1]: target_factory and target_factory.pop() yield p, slice(i + 1, None) target_factory and target_factory.pop() start += 1 else: start = 0 if self.end_trail: end = self.end_trail[0] if len(self.end_trail) == 1: yield self.ancestor, slice(start, end + 1) # include end token else: yield self.ancestor, slice(start, end) n = self.ancestor[end] for end in self.end_trail[1:-1]: target_factory and target_factory.push(n.lexicon) yield n, slice(end) n = n[end] target_factory and target_factory.push(n.lexicon) yield n, slice(self.end_trail[-1] + 1) # include end token else: yield self.ancestor, slice(start, None)
[docs] def tokens(self): """Yield all tokens in this range. The first and last tokens may overlap with the start and end positions. """ for context, slice_ in self.slices(): yield from util.tokens(context[slice_])
[docs]def make_tokens(lexemes, parent=None): """Factory returning a tuple of one or more :class:`Token` instances for the lexemes. The ``lexemes`` argument is an iterable of three-tuples like the ``lexemes`` in an :class:`~parce.lexer.Event` namedtuple defined in the :mod:`~parce.lexer` module. If there is more than one lexeme, :class:`GroupToken` instances are created. The specified ``parent`` context is set as parent, if given. """ if len(lexemes) > 1: return GroupToken.make_group(parent, lexemes) else: return Token(parent, *lexemes[0]),