# Source code for parce.ruleitem

# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.


"""
This module contains the implementation of the replaceable rule item objects,
and some functions to query and manipulate rules that are used by the Lexicon.

Normally, you don't need this module directly; use the :mod:`~parce.rule`
module for your language definitions.

"""

import operator

from . import util


# Flags returned as the second element of a pre_evaluate() result.
# Bit 0: the result is completely evaluated; bit 1: nothing was modified.
_CHANGED = 0b00      # modified, but still needs evaluation
_COMPLETE = 0b01     # completely evaluated
_UNCHANGED = 0b10    # not modified, still needs evaluation


class _EvaluationError(RuntimeError):
    """Signal that a name could not be found in the evaluation namespace.

    Raised by :meth:`VariableItem.evaluate` and caught by
    :meth:`Item.pre_evaluate`, which then reports the item as unchanged.

    """


class Item:
    """Base class for any replaceable rule item.

    An Item is considered to be immutable; you should never alter the
    attributes after instantiation. When an Item can be partly pre-evaluated a
    copy must be returned by calling ``type(self)(*args)``.

    In some cases you can also return a different Item type when pre-evaluating
    partly succeeds. For example, the :class:`select` type simply returns the
    chosen item if the index already can be evaluated, without evaluating the
    other items.

    """
    __slots__ = ()

    # Function used by __getitem__(); subclasses may provide their own.
    _getitem_func = operator.getitem

    def __getitem__(self, n):
        """Return a new Item that performs item[n]. n is evaluated as well."""
        return call(self._getitem_func, self, n)

    def evaluate(self, ns):
        """Evaluate item in namespace dict ``ns``.

        Must be implemented by subclasses; the base implementation raises
        :exc:`NotImplementedError`.

        """
        raise NotImplementedError

    def pre_evaluate(self, ns):
        """Try to evaluate item in namespace dict ``ns``.

        Return a two-tuple (obj, success).

        Success is a two-bit value indicating whether the result is completely
        evaluated and whether something has changed. Bit 0 is set when the item
        is completely evaluated, and bit 1 is set when there was no
        modification. So all possible return values are:

        | 0: the object has changed but it is not yet completely evaluated
        | 1: the object has changed and now it is fully evaluated
        | 2: the object needs evaluation but is not changed

        The default implementation attempts a full :meth:`evaluate`; when that
        fails because a variable is missing from ``ns``, the item itself is
        returned with the "unchanged" flag.

        Specific items may reimplement this method to return partially
        evaluated items.

        """
        try:
            result = self.evaluate(ns)
        except _EvaluationError:
            # a needed variable is not (yet) available in the namespace
            return self, _UNCHANGED
        return result, _COMPLETE

    def variations(self):
        """Yield the possible results for this item.

        This is used to build a decision tree for a rule, to see which actions
        and targets it could bring.

        The default implementation raises a RuntimeError; only RuleItem
        objects can yield variations.

        """
        raise RuntimeError("Item '{}' can't be used directly in a rule".format(repr(self)))

    def __repr__(self):
        """Return ``ClassName(arg, ...)`` using the values of :meth:`_repr_args`."""
        return "{}({})".format(
            self.__class__.__name__,
            ', '.join(map(repr, self._repr_args())))

    def _repr_args(self):
        """Return the values to show as arguments in the repr (default: none)."""
        return ()


class VariableItem(Item):
    """A named variable that's accessed in the namespace.

    If a getitem_func is given, it is called when this item is accessed using
    the item syntax ``[n]``, with this item as first argument, and then the
    specified index.

    """
    __slots__ = ('_name', '_getitem_func')

    def __init__(self, name, getitem_func=operator.getitem):
        self._name = name
        self._getitem_func = getitem_func

    def evaluate(self, ns):
        """Get the variable from the namespace dict.

        Raises ``_EvaluationError`` (chained to the original ``KeyError``)
        when the variable name is not a key of ``ns``.

        """
        try:
            return ns[self._name]
        except KeyError as e:
            raise _EvaluationError("Can't find variable '{}'".format(self._name)) from e

    def __repr__(self):
        """Return the variable name in upper case."""
        return self._name.upper()


class call(Item):
    """Call predicate with arguments.

    Both the predicate and the arguments may themselves be Items; they are
    evaluated before the call is performed.

    """
    __slots__ = ('_predicate', '_arguments')

    def __init__(self, predicate, *arguments):
        self._predicate = predicate
        self._arguments = arguments

    def evaluate(self, ns):
        """Call predicate with the arguments."""
        predicate = evaluate(self._predicate, ns)
        arguments = evaluate(self._arguments, ns)
        return predicate(*arguments)

    def pre_evaluate(self, ns):
        """Optimize by pre-evaluating what can be pre-evaluated.

        The call is only performed when both the predicate and all arguments
        are completely evaluated. Otherwise either this item is returned
        unchanged, or a copy holding the partly evaluated parts.

        """
        predicate, pred_ok = pre_evaluate(self._predicate, ns)
        arguments, arg_ok = pre_evaluate(self._arguments, ns)
        # a bit stays set only when it is set for predicate *and* arguments
        ok = pred_ok & arg_ok
        if ok & _COMPLETE:
            return predicate(*arguments), _COMPLETE
        if ok & _UNCHANGED:
            return self, _UNCHANGED
        return type(self)(predicate, *arguments), _CHANGED

    def _repr_args(self):
        """Return the predicate followed by the arguments."""
        return (self._predicate, *self._arguments)


class RuleItem(Item):
    """Classes inheriting RuleItem are allowed in toplevel in rules.

    They are evaluated by the lexicon when a rule matches.

    """
    __slots__ = ()


class select(RuleItem):
    """Chooses one of the items.

    If an item is a list, it is unrolled when replacing the item in a rule.

    """
    __slots__ = ('_index', '_items')

    def __init__(self, index, *items):
        self._index = index
        self._items = items

    def evaluate(self, ns):
        """Return items[index].

        Only the chosen item is evaluated; the other items are left alone.

        """
        index = evaluate(self._index, ns)
        item = evaluate(self._items[index], ns)
        return item

    def pre_evaluate(self, ns):
        """Optimize by pre-evaluating what can be pre-evaluated.

        When the index can be evaluated already, the chosen item (pre-evaluated
        as far as possible) replaces this select item. Otherwise all items are
        pre-evaluated and a copy is returned if anything changed.

        """
        index, ok = pre_evaluate(self._index, ns)
        if ok & _COMPLETE:
            item, ok = pre_evaluate(self._items[index], ns)
            # The select item itself is replaced, so the result always counts
            # as changed: keep only the "complete" bit.
            return item, ok & _COMPLETE
        items, items_ok = pre_evaluate(self._items, ns)
        ok &= items_ok
        if ok & _UNCHANGED:
            return self, _UNCHANGED
        return type(self)(index, *items), _CHANGED

    def variations(self):
        """Yield all the items that could be chosen (unevaluated)."""
        yield from self._items

    def _repr_args(self):
        """Return the index followed by the items."""
        return (self._index, *self._items)


class target(RuleItem):
    """target(value, *lexicons)

    Has a special handling: if the value is an integer, it is used as the
    result value (to push/pop contexts).

    If it is a two-tuple(index, argument): The index points to the lexicon and
    the argument is used as lexicon argument.

    """
    __slots__ = ('_value', '_lexicons')

    def __init__(self, value, *lexicons):
        self._value = value
        self._lexicons = lexicons

    def evaluate(self, ns):
        """Return value if integer, otherwise lexicons[value[0]](value[1]).

        When the argument ``value[1]`` is None, the lexicon is returned as is,
        without calling it.

        """
        value = evaluate(self._value, ns)
        if isinstance(value, int):
            return value
        index, arg = value
        lexicon = evaluate(self._lexicons[index], ns)
        return lexicon if arg is None else lexicon(arg)

    def pre_evaluate(self, ns):
        """Optimize by pre-evaluating what can be pre-evaluated.

        If the value can be evaluated already, only the chosen lexicon is
        considered; the other lexicons are dropped. Otherwise all lexicons are
        pre-evaluated.

        """
        value, ok = pre_evaluate(self._value, ns)
        if ok & _COMPLETE:
            if isinstance(value, int):
                return value, _COMPLETE
            index, arg = value
            lexicon, ok = pre_evaluate(self._lexicons[index], ns)
            if ok & _COMPLETE:
                return (lexicon if arg is None else lexicon(arg), _COMPLETE)
            # Keep only the chosen (not yet evaluated) lexicon; it is now the
            # sole lexicon of the new target, hence index 0.
            return type(self)((0, arg), lexicon), _CHANGED
        # pre-evaluate the lexicons
        lexicons, l_ok = pre_evaluate(self._lexicons, ns)
        # the value is not complete here, so ok can only be
        # _CHANGED or _UNCHANGED after combining
        ok &= l_ok
        if ok & _UNCHANGED:
            return self, ok
        return type(self)(value, *lexicons), ok

    def variations(self):
        """Yield our possible variations.

        If the value is evaluated, yield either the value or the chosen
        lexicon. If not, yields ``a_number`` and all lexicons items.

        """
        value = self._value
        if isinstance(value, Item):
            yield a_number
            yield from self._lexicons
        elif isinstance(value, int):
            yield value
        else:
            index, arg = value
            yield self._lexicons[index]

    def _repr_args(self):
        """Return the value followed by the lexicons."""
        return (self._value, *self._lexicons)


class PostponedItem(RuleItem):
    """Mixin base class for items that keep alive after the Lexicon.

    When inheriting from this class, implement the :meth:`evaluate_items`
    method, which lists all values as they were given to the __init__ method.

    If this method returns values, those are evaluated, and a new PostponedItem
    is returned with the contents evaluated.

    """
    __slots__ = ()

    def evaluate(self, ns):
        """Evaluate all values returned by the evaluate_items() method.

        If any value changes, a copy of the Item is returned, otherwise the
        Item itself. If the evaluate_items() method does not yield any value,
        this Item is always returned unchanged.

        """
        items, ok = pre_evaluate(self.evaluate_items(), ns)
        return self if ok & _UNCHANGED else type(self)(*items)

    def pre_evaluate(self, ns):
        """Pre-evaluate all values returned by the evaluate_items() method.

        If any value changes, a copy of the Item is returned, otherwise the
        Item itself.

        Returns a two-tuple (item, success), where success is the flag value
        obtained from pre-evaluating the values.

        """
        items, ok = pre_evaluate(self.evaluate_items(), ns)
        return (self if ok & _UNCHANGED else type(self)(*items)), ok

    def evaluate_items(self):
        """Return a tuple of the values as given to the __init__ method,
        when they need to be evaluated inside this PostponedItem.

        This method should either yield *all* values that were given to the
        __init__ method, or nothing. The default implementation yields nothing,
        so nothing is evaluated or pre-evaluated.

        """
        return ()


class pattern(PostponedItem):
    """Represents a pattern.

    This evaluates its value, but remains alive after building the rule.

    """
    __slots__ = ('_value',)

    def __init__(self, value):
        self._value = value

    @property
    def value(self):
        """Get the pattern value."""
        return self._value

    def evaluate_items(self):
        """Return the pattern value as a one-element tuple."""
        return self._value,

    def variations(self):
        """If the value is evaluated, yield it, otherwise yields ``None`` and ``a_string``."""
        if isinstance(self._value, Item):
            yield None
            yield a_string
        else:
            yield self._value

    def _repr_args(self):
        """Return the pattern value as a one-element tuple."""
        return self._value,


class ActionItem(PostponedItem):
    """Mixin base class for dynamic actions."""
    __slots__ = ()


class SubgroupAction(ActionItem):
    """Yield actions from subgroups in a match.

    A SubgroupAction looks at subgroups in the regular expression match and
    returns the same amount of tokens as there are subgroups, using the specified
    action for every subgroup.

    For example, the rule::

        "(0x)([0-9a-f]+)", SubgroupAction(Number.Prefix, Number.Hexadecimal)

    yields two tokens in case of a match, one for "0x" and the other for the
    other group of the match.

    There should be the same number of subgroups in the regular expression as
    there are action attributes given to __init__().

    """
    __slots__ = ('_actions',)

    def __init__(self, *actions):
        self._actions = actions

    def replace(self, lexer, pos, text, match):
        """Yield the results of ``lexer.filter_actions()`` for every action.

        Each action is paired with a match group; the group numbering starts
        at ``match.lastindex + 1``.

        """
        # NOTE(review): presumably the lexicon wraps every rule pattern in a
        # group of its own, so the pattern's subgroups follow directly after
        # match.lastindex -- confirm against the Lexicon implementation.
        for i, action in enumerate(self._actions, match.lastindex + 1):
            yield from lexer.filter_actions(action, match.start(i), match.group(i), match)

    def variations(self):
        """Yield the possible actions."""
        yield from self._actions

    def _repr_args(self):
        """Return the actions."""
        return self._actions

    def pre_evaluate(self, ns):
        """Reimplemented to only pre-evaluate subgroup actions (evaluating happens in the lexer)."""
        items, ok = pre_evaluate(self._actions, ns)
        return (self if ok & _UNCHANGED else type(self)(*items)), ok


class DelegateAction(ActionItem):
    """This action uses a lexicon to parse the text.

    All tokens are yielded as one group, flattened, ignoring the tree
    structure, so this is not efficient for large portions of text, as the
    whole region is parsed again on every modification.

    But it can be useful when you want to match a not too large text blob first
    that's difficult to capture otherwise, and then lex it with a lexicon that
    does (almost) not enter other lexicons.

    """
    __slots__ = ('_lexicon',)

    def __init__(self, lexicon):
        self._lexicon = lexicon

    def replace(self, lexer, pos, text, match):
        """Use our lexicon to parse the matched text.

        A new lexer of the same type as ``lexer`` is created with our lexicon.
        Every lexeme it produces for ``text`` is yielded as a
        (position, text, action) tuple, with the position offset by ``pos``.

        """
        sublexer = type(lexer)([self._lexicon])
        for e in sublexer.events(text):
            for p, txt, action in e.lexemes:
                yield pos + p, txt, action

    def evaluate_items(self):
        """Return the lexicon specified on init, used by evaluate() and pre_evaluate()."""
        return self._lexicon,

    def variations(self):
        """Yield our lexicon."""
        yield self._lexicon

    def _repr_args(self):
        """Return the lexicon as a one-element tuple."""
        return self._lexicon,


class SkipAction(ActionItem):
    """A DynamicAction that yields nothing.

    A SkipAction() is stored in the module variable ``skip`` and causes the rule
    to silently ignore the matched text.

    """
    # No instance attributes, like the other (immutable) item classes.
    __slots__ = ()

    def replace(self, lexer, pos, text, match):
        """Yield nothing, so no tokens are created for the matched text."""
        yield from ()

    def variations(self):
        """Yield no variations."""
        return
        yield   # unreachable, but makes this method a generator


def evaluate(obj, ns):
    """Evaluate an object, that may or may not be an Item.

    The namespace `ns` is a dictionary containing text, match and/or arg
    variables. A list or a tuple of items is also evaluated and always becomes
    a tuple. Any other object is returned unchanged.

    """
    if isinstance(obj, Item):
        return obj.evaluate(ns)
    # exact type check: subclasses of list or tuple are returned as they are
    if type(obj) in (list, tuple):
        return tuple(evaluate(o, ns) for o in obj)
    return obj


def evaluate_rule(rule, match):
    """Evaluate all RuleItem objects in the rule.

    The specified match object provides the value for the TEXT and MATCH
    variables. Lists and tuples are unrolled.

    This is a generator yielding the resulting items one by one.

    """
    ns = {'text': match.group(), 'match': match}

    def eval_rule_items(objs):
        """Yield the evaluated items of ``objs``, flattening nested lists and tuples."""
        for obj in objs:
            if isinstance(obj, RuleItem):
                yield from util.unroll(obj.evaluate(ns))
            elif type(obj) in (list, tuple):
                yield from eval_rule_items(obj)
            else:
                yield obj

    yield from eval_rule_items(rule)


def pre_evaluate(obj, ns):
    """Pre-evaluate any object, that may or may not be an Item.

    Returns a two-tuple(result, success). The namespace `ns` is a dictionary
    containing text, match and/or arg variables.

    * If the object is an Item, returns ``object.pre_evaluate(ns)``.
    * If the object is a list or tuple, evaluates the contents and returns a
      tuple.
    * If the object is none of the above, simply returns the object unchanged.

    The ``success`` value can be one of the values described in
    :meth:`Item.pre_evaluate`, or it is 3, meaning that the object is returned
    unchanged and needs no evaluation.

    """
    if isinstance(obj, Item):
        return obj.pre_evaluate(ns)
    # exact type check: subclasses of list or tuple are returned as they are
    if type(obj) in (list, tuple):
        # Start with both bits set; a bit is cleared as soon as one member
        # reports "not complete" or "changed".
        objs, success = [], 3
        for o in obj:
            res, ok = pre_evaluate(o, ns)
            objs.append(res)
            success &= ok
        return tuple(objs), success
    return obj, 3


def pre_evaluate_rule(rule, arg):
    """Evaluate all RuleItem objects that can be evaluated in the rule.

    The specified ``arg`` provides the value for the ARG variable. Rule items
    that depend on the match object are not yet evaluated. Lists and tuples are
    unrolled. Returns the rule as a tuple; an empty rule yields an empty tuple.

    """
    ns = {'arg': arg}

    def pre_eval_rule_items(objs):
        """Yield the pre-evaluated items of ``objs``, flattening nested lists and tuples."""
        for obj in objs:
            if isinstance(obj, RuleItem):
                yield from util.unroll(obj.pre_evaluate(ns)[0])
            elif type(obj) in (list, tuple):
                yield from pre_eval_rule_items(obj)
            else:
                yield obj

    result = pre_eval_rule_items(rule)
    # the first item may be a pattern instance; it should be evaluated by now
    # This loop runs at most once: it takes the first item off the generator
    # and returns it together with all remaining items.
    for item in result:
        if isinstance(item, pattern):
            item = item.value
        return (item,) + tuple(result)
    return ()


def needs_evaluation(rule):
    """Return True if there are items in the rule that need evaluating.

    A PostponedItem only counts when the values from its ``evaluate_items()``
    method need evaluating; any other RuleItem always counts. Lists and tuples
    are searched recursively.

    """
    for item in rule:
        if isinstance(item, PostponedItem):
            if needs_evaluation(item.evaluate_items()):
                return True
        elif isinstance(item, RuleItem) or \
             (type(item) in (tuple, list) and needs_evaluation(item)):
            return True
    return False


def variations_tree(rule):
    """Return a tuple with the tree structure of all possible variations.

    Branches (choices) are indicated by a frozenset, which contains
    zero or more tuples.

    An Item with exactly one variation is replaced by the tuple for that
    variation; an Item without any variations is left in place.

    """
    items = tuple(rule)
    for i, item in enumerate(items):
        if isinstance(item, Item):
            branch = [variations_tree(util.unroll(v)) for v in item.variations()]
            if branch:
                # a single variation is no real choice: no frozenset needed
                branch = branch[0] if len(branch) == 1 else frozenset(branch)
                return (*items[:i], branch, *variations_tree(items[i+1:]))
    # no Item with variations was found
    return items


def variations(rule):
    """Yield all possible variations of the rule.

    Every variation is a tuple. The first Item in the rule is replaced by each
    of its variations in turn (recursively expanded), combined with every
    variation of the remainder of the rule. A rule without any Item yields
    itself, as a tuple.

    """
    items = tuple(rule)
    for i, item in enumerate(items):
        if isinstance(item, Item):
            prefix = items[:i]
            for suffix in variations(items[i+1:]):
                for v in item.variations():
                    for l in variations(util.unroll(v)):
                        yield prefix + l + suffix
            # later Items are handled by the recursive call on the suffix
            break
    else:
        yield items


#: sentinel denoting that a variation is any integer
#: (yielded by :meth:`target.variations` when the value is not yet evaluated)
a_number = util.Symbol("a_number")

#: sentinel denoting that a variation is any string
#: (yielded by :meth:`pattern.variations` when the value is not yet evaluated)
a_string = util.Symbol("a_string")