Source code for parce.treebuilderutil

# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.


"""
Helper functions and classes for the :mod:`~parce.treebuilder` module.

"""


import collections
import itertools

from . import util
from .lexer import Event, Lexer
from .tree import Context, Range
from .target import TargetFactory


#: encapsulates the return values of :meth:`TreeBuilder.build_new_tree`
BuildResult = collections.namedtuple("BuildResult", "tree start end offset lexicons")

#: encapsulates the return values of :meth:`TreeBuilder.replace_tree`
ReplaceResult = collections.namedtuple("ReplaceResult", "start end lexicons")
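

# Illustration only (hypothetical values): both result types are plain named
# tuples, so they unpack like ordinary tuples and also allow access by field
# name.
def _result_sketch():
    result = ReplaceResult(start=0, end=10, lexicons=None)
    assert (result.start, result.end) == (0, 10)
    assert result == (0, 10, None)      # ordinary tuple behaviour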


class Changes:
    """Store changes that have to be made to a tree.

    This object is used by
    :meth:`~parce.treebuilder.TreeBuilder.get_changes()`. Calling
    :meth:`add()` merges new changes with the existing changes.

    """
    __slots__ = "text", "root_lexicon", "start", "removed", "added"

    def __init__(self):
        self.text = ""
        self.root_lexicon = False   # meaning no change is requested
        self.start = -1             # meaning no text is altered
        self.removed = 0
        self.added = 0

    def __repr__(self):
        changes = []
        if self.root_lexicon != False:
            changes.append("root_lexicon: {}".format(self.root_lexicon))
        if self.start != -1:
            changes.append("text: {} -{} +{}".format(
                self.start, self.removed, self.added))
        if not changes:
            changes.append("(no changes)")
        return "<Changes {}>".format(', '.join(changes))

    def add(self, text, root_lexicon=False, start=0, removed=None, added=None):
        """Merge new change with existing changes.

        If added and removed are not given, all text after start is
        considered to be replaced.

        """
        if root_lexicon != False:
            self.root_lexicon = root_lexicon
        if removed is None:
            removed = len(self.text) - start
        if added is None:
            added = len(text) - start
        self.text = text
        if self.start == -1:
            # there were no previous changes
            self.start = start
            self.removed = removed
            self.added = added
            return
        # determine the offset for removed and added
        if start + removed < self.start:
            offset = self.start - start - removed
        elif start > self.start + self.added:
            offset = start - self.start - self.added
        else:
            offset = 0
            # determine which part of removed falls inside existing changes
            start = max(start, self.start)
            end = min(start + removed, self.start + self.added)
            offset -= max(0, end - start)
        # set the new values
        self.start = min(self.start, start)
        self.removed += removed + offset
        self.added += added + offset

    def has_changes(self):
        """Return True when there are actually changes."""
        return self.start != -1 or self.root_lexicon != False

    def new_position(self, pos):
        """Return how the current changes would affect an older position."""
        if pos < self.start:
            return pos
        elif pos < self.start + self.removed:
            return self.start + self.added
        return pos - self.removed + self.added
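

# The following sketch is illustrative only (not part of parce's API); the
# texts and the helper name are made up. It shows how a single edit is
# recorded by Changes and how new_position() maps old positions through it.
def _changes_sketch():
    c = Changes()
    # "brave " (6 chars) inserted at position 6:
    # "hello world" -> "hello brave world"
    c.add("hello brave world", start=6, removed=0, added=6)
    assert c.has_changes()
    assert c.new_position(3) == 3       # before the edit: unchanged
    assert c.new_position(10) == 16     # behind the edit: shifted right by 6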


def get_prepared_lexer(tree, text, start, new_tree=False):
    """Get a prepared lexer reading from text, positioned at (or before) start.

    Returns the three-tuple (lexer, events, tokens). The events stream is
    returned separately, because the last Event can be pushed back, so it is
    yielded again. The tokens are the last tokens group that remained the
    same.

    Returns None when no position to start from can be found; in that case,
    just start from the beginning.

    If new_tree is True, does not find a start position if no tokens would
    remain to the left of it. This is useful when restarting a tree build; it
    avoids leaving empty contexts in the built tree that should not be there.

    """
    last_token = start_token = find_token_before(tree, start)
    if not last_token:
        return
    go_back = backward(last_token)
    if last_token.group is not None and last_token.group >= 0:
        # we are in the middle of a group
        for last_token in go_back:
            if last_token.group is None or last_token.group < 0:
                break
        else:
            return
    while start:
        # go back at least 10 tokens, to the beginning of a group; and don't
        # stop at the first token(group) of a context whose lexicon has
        # consume == True
        start = 0
        count = 10
        for start_token in go_back:
            for next_token in go_back:
                if start_token.group is None and not (start_token.is_first()
                        and start_token.parent.lexicon.consume):
                    count -= 1
                    if count == 0:
                        start = start_token.pos
                        break
                start_token = next_token
            break
        if start:
            lexer = get_lexer(start_token)
        elif new_tree:
            return
        else:
            lexer = Lexer([tree.lexicon])
        events = lexer.events(text, start)
        # compare the new events with the old tokens; at least one
        # should be the same; if not, go back further if possible
        old_events = events_with_tokens(start_token, last_token)
        prev = None
        for (old, tokens), new in zip(old_events, events):
            if not same_events(old, new):
                if prev:
                    return lexer, itertools.chain((new,), events), prev
                break
            prev = tokens
        else:
            if prev:
                return lexer, events, prev
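

# Illustrative sketch (not part of parce's API): how a caller such as the
# tree builder might consume the result of get_prepared_lexer(). The helper
# name is hypothetical; tree and text are assumed to come from an earlier
# build.
def _relex_from(tree, text, start):
    result = get_prepared_lexer(tree, text, start)
    if result is None:
        # no usable restart position: lex the whole text from the beginning
        lexer = Lexer([tree.lexicon])
        return lexer, lexer.events(text, 0), None
    return result   # (lexer, events, last unchanged tokens group)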


def events_with_tokens(start_token, last_token):
    r"""Yield (Event, tokens) tuples for start_token until and including
    last_token.

    Events are yielded together with token groups (or single tokens in a
    1-length tuple).

    This function is used by :func:`get_prepared_lexer` to compare an
    existing token structure with events originating from a lexer.

    The start_token must be the first of a group, if it is a GroupToken.
    Additionally, it should not be the first token in a context whose lexicon
    has the ``consume`` flag set. But if the start_token is the very first
    token in a tree, it does not matter if it is not in the root context;
    this case is handled gracefully.

    """
    context, start_trail, end_trail = common_ancestor_with_trail(
        start_token, last_token)
    if context:
        r = Range(context, start_trail, end_trail)

        target = TargetFactory()
        get, push, pop = target.get, target.push, target.pop

        def events(nodes):
            stack = []
            i = 0
            n = nodes
            while True:
                z = len(n)
                while i < z:
                    m = n[i]
                    if m.is_context:
                        push(m.lexicon)
                        stack.append(i)
                        i = 0
                        n = m
                        break
                    else:
                        group = m,
                        if m.group is not None:
                            # find the last token in this group
                            j = i + 1
                            while j < z and n[j].group > 0:
                                j += 1
                            group = n[i:j+1]
                        lexemes = tuple((t.pos, t.text, t.action)
                                        for t in group)
                        yield Event(get(), lexemes), group
                        i += len(group)
                else:
                    if stack:
                        pop()
                        i = stack.pop() + 1
                        n = n.parent if stack else nodes  # slice we started with
                    else:
                        break

        if start_token.is_first() and not start_token.parent.is_root() \
                and not any(backward(start_token)):
            # start token is the very first token, but it is not in the root
            # context. So it is a child of a lexicon with consume, or a
            # context that was jumped to via a default target. Build a target
            # from root.
            lexicons = [p.lexicon for p in start_token.ancestors()]
            push(*lexicons[-2::-1])     # reversed, not root

        for context, slice_ in r.slices(target):
            yield from events(context[slice_])
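

# Illustrative sketch (hypothetical helper): check that an existing token
# range still yields the same events a fresh lexer would produce, mirroring
# the comparison in get_prepared_lexer(). start_token is assumed to satisfy
# the constraints documented above.
def _range_matches(text, start_token, last_token):
    lexer = get_lexer(start_token)
    new_events = lexer.events(text, start_token.pos)
    old_events = events_with_tokens(start_token, last_token)
    return all(same_events(old, new)
               for (old, tokens), new in zip(old_events, new_events))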


def get_lexer(token):
    """Get a Lexer initialized at the token's ancestry."""
    lexicons = [p.lexicon for p in token.ancestors()]
    lexicons.reverse()
    return Lexer(lexicons)


def new_tree(token):
    """Return an empty context (and its root) with the same ancestry as the
    token's.
    """
    c = n = context = Context(token.parent.lexicon, None)
    for p in token.parent.ancestors():
        n = Context(p.lexicon, None)
        c.parent = n
        n.append(c)
        c = n
    return context, n
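

# Illustrative check (hypothetical helper): the context returned by
# new_tree() is empty, but its ancestry mirrors the token's.
def _check_new_tree(token):
    context, root = new_tree(token)
    assert len(context) == 0
    assert context.lexicon is token.parent.lexicon
    assert ([c.lexicon for c in context.ancestors()]
            == [p.lexicon for p in token.parent.ancestors()])
    return context, root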


def find_token_before(node, pos):
    """A version of :meth:`Context.find_token_before()
    <parce.tree.Context.find_token_before>` that can handle empty contexts.

    The new tree built inside :meth:`TreeBuilder.build_new_tree()
    <parce.treebuilder.TreeBuilder.build_new_tree>` can have an empty context
    at the beginning and/or the end.

    Returns None if there is no token left of pos.

    """
    while True:
        i = 0
        hi = len(node)
        while i < hi:
            mid = (i + hi) // 2
            n = node[mid]
            if n.is_context:
                n = n.first_token() or n    # if no first token, just n itself
            if pos < n.end:
                hi = mid
            else:
                i = mid + 1
        if i == 0:
            return
        node = node[i-1]
        if node.is_token:
            return node
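

# Illustrative property (hypothetical helper): the token returned by
# find_token_before(), if any, ends at or before pos.
def _token_at_or_before(tree, pos):
    token = find_token_before(tree, pos)
    assert token is None or token.end <= pos
    return token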


def ancestors_with_index(node):
    """A version of :meth:`Node.ancestors_with_index()
    <parce.tree.Node.ancestors_with_index>` that can handle empty contexts.

    """
    while node.parent:
        index = node.parent.index(node)
        node = node.parent
        yield node, index


def backward(node):
    """A version of :meth:`Node.backward() <parce.tree.Node.backward>` that
    can handle empty contexts.

    """
    for node, index in ancestors_with_index(node):
        if index:
            yield from util.tokens(node[:index], True)
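

# Illustrative sketch (hypothetical helper): backward() yields the tokens
# preceding a node, nearest first; this is how get_prepared_lexer() walks
# back to a safe restart position.
def _nearest_preceding(node, count):
    return list(itertools.islice(backward(node), count))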


def common_ancestor_with_trail(node, other):
    """A version of :meth:`Token.common_ancestor_with_trail()
    <parce.tree.Token.common_ancestor_with_trail>` that can handle empty
    contexts.

    """
    if other is node:
        i = node.parent.index(node)
        return node.parent, (i,), (i,)
    if other.pos > node.pos:
        s_ancestors, s_indices = zip(*ancestors_with_index(node))
        o_indices = []
        for n, i in ancestors_with_index(other):
            o_indices.append(i)
            try:
                s_i = s_ancestors.index(n)
            except ValueError:
                continue
            return n, s_indices[s_i::-1], o_indices[::-1]
    return None, None, None
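

# Illustrative sketch (hypothetical helper): the trails returned here are
# exactly what a Range needs, as used in events_with_tokens() above.
def _range_between(start_token, last_token):
    context, start_trail, end_trail = common_ancestor_with_trail(
        start_token, last_token)
    if context:
        return Range(context, start_trail, end_trail)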


def same_events(e1, e2):
    """Compare Event tuples in a robust way.

    Returns True if the events are completely the same. The lexicon in a
    target is compared on identity instead of equality. This prevents
    different derived lexicons from comparing the same, which is needed when
    rebuilding a tree.

    """
    if e1 != e2:
        return False
    elif e1.target is None or not e1.target.push:
        return True
    # both targets compare equal and have a non-empty push value that
    # compares equal, so now we only need to compare the lexicons on
    # identity.
    return all(l1 is l2 for l1, l2 in zip(e1.target.push, e2.target.push))
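

# A minimal runnable illustration (hypothetical values): events without a
# target fall back to plain tuple comparison; the identity check on pushed
# lexicons only comes into play with real lexicon objects.
def _same_events_demo():
    lexemes = ((0, "hello", None),)
    assert same_events(Event(None, lexemes), Event(None, lexemes))
    assert not same_events(Event(None, lexemes), Event(None, ()))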