# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
r"""
A Lexicon groups rules to match.

A Lexicon is created by decorating a method that yields rules with the
:attr:`@lexicon <lexicon>` decorator. (Although this actually creates a
LexiconDescriptor. When a LexiconDescriptor is accessed for the first time
via a Language subclass, a Lexicon for that class is created, cached, and
returned each time that attribute is accessed.)

This makes it possible to inherit from a Language class and re-implement only
some lexicons; the others keep working as in the base class.
The Lexicon can parse text according to the rules. When its :func:`parse`
function is called for the first time, the rules function is run with the
Language class as argument, and the rules it yields are cached.

The Lexicon then combines the patterns of the rules into one regular
expression that is used to parse the text, using some smart optimizations.
(For example, when a lexicon has only one pattern rule which turns out to be
an unambiguous string, :meth:`str.find` is used rather than
:py:func:`re.search`.)

Example:

>>> from parce import Language, lexicon
>>>
>>> class MyLang(Language):
... @lexicon
... def numbers(cls):
... yield r'\d+', "A number"
... yield r'\w+', "A word"
...
>>> MyLang.numbers
MyLang.numbers
>>> type(MyLang.numbers)
<class 'parce.lexicon.Lexicon'>
>>> for i in MyLang.numbers.parse("1 a2 d3 4 p 5", 0):
... print(i)
...
(0, '1', <re.Match object; span=(0, 1), match='1'>, 'A number', None)
(2, 'a2', <re.Match object; span=(2, 4), match='a2'>, 'A word', None)
(5, 'd3', <re.Match object; span=(5, 7), match='d3'>, 'A word', None)
(8, '4', <re.Match object; span=(8, 9), match='4'>, 'A number', None)
(10, 'p', <re.Match object; span=(10, 11), match='p'>, 'A word', None)
(12, '5', <re.Match object; span=(12, 13), match='5'>, 'A number', None)
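
Because a Lexicon is created and cached per Language subclass, a subclass
gets its own Lexicon for the same lexicon function (continuing the session
above; ``MySubLang`` is just an illustrative name):

>>> class MySubLang(MyLang):
...     pass
...
>>> MySubLang.numbers is MyLang.numbers
False
>>> MySubLang.numbers
MySubLang.numbers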

Parsing (better: lexing) is done by a :class:`~parce.lexer.Lexer` instance,
which switches Lexicon when a target is encountered.
"""

__all__ = ('Lexicon', 'LexiconDescriptor')

import itertools
import re
import threading

import parce.regex

from . import util
from .target import TargetFactory
from .ruleitem import (
    Item, RuleItem, evaluate_rule, needs_evaluation, pre_evaluate_rule)


class LexiconDescriptor:
    """The LexiconDescriptor creates a Lexicon when called via a class."""
def __init__(self, rules_func,
re_flags=0,
consume=False,
):
"""Initializes with the rules function.
The rules function accepts the Language class as argument, and yields
the pattern, action, target, ... tuples.
"""
self.rules_func = rules_func #: the function yielding the rules
self._re_flags = re_flags
self._consume = consume
self._lexicons = util.caching_dict(lambda owner: Lexicon(self, owner))

    def __get__(self, instance, owner):
"""Called when accessed as a descriptor, via the Language class."""
return self._lexicons[owner]


class Lexicon:
    """A Lexicon parses text according to rules.

    A Lexicon is tied to a particular class, which makes it possible to
    inherit from a Language class and change only some Lexicons.

    .. py:function:: parse(text, pos)

        Start parsing ``text`` from the specified position.
        Yields five-tuples ``(pos, text, matchobj, action, target)``.

        Here, ``pos`` is the position where the match starts, ``text`` is
        the matched text, ``matchobj`` the match object (which can be None
        for default actions), ``action`` the action that was specified in
        the matching rule, and ``target`` either None or a
        :class:`~parce.target.Target` object.

    """
__hash__ = object.__hash__

    def __init__(self, descriptor, language, arg=None):
#: The LexiconDescriptor this Lexicon was created by.
self.descriptor = descriptor
#: The Language class the lexicon belongs to.
self.language = language
#: The re_flags that were set on instantiation.
self.re_flags = descriptor._re_flags
#: Whether this lexicon wants the token(s) that switched to it
self.consume = descriptor._consume
# The argument the lexicon was called with (creating a derived
# Lexicon). None for a normal lexicon.
self._arg = arg
#: The short name (name of the method this Lexicon was defined with)
self.name = descriptor.rules_func.__name__
#: The short name with the Language name prepended, like
#: ``'Language.lexicon'``.
self.fullname = language.__name__ + '.' + self.name
#: The full name with the Language's module prepended, like
#: ``'parce.lang.xml.Xml.root'``.
self.qualname = language.__module__ + '.' + self.fullname
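        # (For the module docstring's example: name is 'numbers', fullname
        # 'MyLang.numbers', and qualname e.g. '__main__.MyLang.numbers'.)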
self.__doc__ = descriptor.rules_func.__doc__
# lock is used when creating the parse() instance function
self._lock_build = threading.Lock()

    @property
    def arg(self):
        """The argument the lexicon was called with (creating a derived
        Lexicon). None for a normal lexicon.
        """
        return self._arg

    def __call__(self, arg=None):
        """Create a derived Lexicon with argument ``arg``.

        The argument should be a simple, hashable singleton object, such as a
        string, an integer or a standard action. The created Lexicon is
        cached. The argument is accessible using special pattern and rule
        item types, so a derived Lexicon can parse text based on rules that
        are defined at parse time, which is useful for things like here
        documents, where you only get to know the end token after the start
        token has been found.

        When comparing Lexicons with ``==``, a derived lexicon compares equal
        with the Lexicon that created it, although they co-exist as separate
        objects. Use ``is`` to compare on identity.

        When yielding the rules from a derived lexicon, the dynamic rule
        items that depend on the lexicon argument are already evaluated. When
        yielding the rules from a vanilla lexicon, they are not evaluated, so
        they adjust themselves to the lexicon they are included in (which
        will then evaluate the rules of course).

        If ``arg`` is None, ``self`` is returned.
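
        For example (continuing the module docstring's session; the argument
        ``1`` is an arbitrary hashable chosen for illustration):

        >>> MyLang.numbers(1)
        MyLang.numbers*
        >>> MyLang.numbers(1) is MyLang.numbers      # a separate object...
        False
        >>> MyLang.numbers(1) == MyLang.numbers      # ...that compares equal
        True
        >>> MyLang.numbers(1) is MyLang.numbers(1)   # derived Lexicons are cached
        True
        >>> MyLang.numbers(None) is MyLang.numbers   # None simply returns self
        True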
"""
if arg is None:
return self
elif self.arg is not None:
vanilla = self.descriptor.__get__(None, self.language)
return vanilla(arg)
return self._derive(arg)

    @util.cached_method
def _derive(self, arg):
"""Factory, called when a derived lexicon needs to be created."""
return Lexicon(self.descriptor, self.language, arg)

    def __eq__(self, other):
        """Return True if we are the same lexicon or derived from the same one."""
if type(other) is type(self):
return self.descriptor is other.descriptor and self.language is other.language
return NotImplemented

    def __ne__(self, other):
        """Return True if we are not the same lexicon nor derived from the same one."""
if type(other) is type(self):
return self.descriptor is not other.descriptor or self.language is not other.language
return NotImplemented

    @util.cached_property
    def _rules(self):
        """Return all rules in a tuple.

        Rule items that depend on the lexicon argument are only evaluated if
        this is a derived lexicon.
        """
rules = self.descriptor.rules_func(self.language) or ()
if self.arg is not None:
rules = (pre_evaluate_rule(rule, self.arg) for rule in rules)
return tuple(rules)

    @util.cached_property
    def rules(self):
        """Return all rules in a tuple.

        Rule items that depend on the lexicon argument are already evaluated.
        """
return tuple(pre_evaluate_rule(rule, self.arg)
for rule in self.descriptor.rules_func(self.language) or ())

    def __iter__(self):
        """Yield the rules.

        Patterns are created when this method is called for the first time.
        If this is a derived lexicon, dynamic rule items that depend on the
        argument are already evaluated.
        """
yield from self._rules

    def __repr__(self):
s = self.fullname
if self.arg is not None:
s += '*'
return s

    def __getattr__(self, name):
        """Called when ``self.parse(text, pos)`` is requested the first time.

        Calls :meth:`_get_parse_function` to build the parse function, and
        stores it, so this method is not called again for ``parse``.
        """
        if name == "parse":
            with self._lock_build:
                try:
                    return object.__getattribute__(self, name)
                except AttributeError:
                    self.parse = self._get_parse_function()
            return object.__getattribute__(self, name)
        # normal behaviour: raise AttributeError for other missing attributes
        return object.__getattribute__(self, name)

    def _get_parse_function(self):
"""Compile the pattern rules and return the parse function."""
patterns = []
rules = []
no_default_action = object()
default_action = no_default_action
default_target = None
make_target = TargetFactory.make
# make lists of pattern, action and possible targets
for pattern, *rule in self.rules:
if pattern is parce.default_action:
default_action = rule[0]
elif pattern is parce.default_target:
default_target = make_target(self, rule)
elif rule and pattern is not None and pattern not in patterns:
# skip rule when the pattern is None or already seen
patterns.append(pattern)
rules.append(rule)
# prepare to handle a dynamic default action
if isinstance(default_action, RuleItem):
def dynamic_default_action(text):
return default_action.evaluate({'text': text})
else:
dynamic_default_action = False
# handle the empty lexicon case
if not patterns:
if dynamic_default_action:
def parse(text, pos):
"""Parse text, using a dynamic default action for unknown text."""
if pos < len(text):
t = text[pos:]
yield pos, t, None, dynamic_default_action(t), None
elif default_action is not no_default_action:
def parse(text, pos):
"""Parse text, using a default action for unknown text."""
if pos < len(text):
yield pos, text[pos:], None, default_action, None
elif default_target:
def parse(text, pos):
"""Parse text, stopping with the default target at unknown text."""
if pos < len(text):
yield pos, "", None, None, default_target
            else:
                # just quit parsing
                def parse(text, pos):
                    """Parse text, skipping unknown text."""
                    # the unreachable yield makes this function a generator
                    return
                    yield
return parse
# if there is only one pattern, and no dynamic action or target,
# see if the pattern is simple enough to just use str.find
if len(patterns) == 1 and not self.re_flags & re.IGNORECASE and \
not needs_evaluation(rules[0]):
needle = parce.regex.to_string(patterns[0])
if needle:
l = len(needle)
action, *rule = rules[0]
target = make_target(self, rule)
if dynamic_default_action:
def parse(text, pos):
"""Parse text, using a dynamic default action for unknown text."""
while True:
i = text.find(needle, pos)
if i > pos:
t = text[pos:i]
yield pos, t, None, dynamic_default_action(t), None
elif i == -1:
break
yield i, needle, None, action, target
pos = i + l
if pos < len(text):
t = text[pos:]
yield pos, t, None, dynamic_default_action(t), None
elif default_action is not no_default_action:
def parse(text, pos):
"""Parse text, using a default action for unknown text."""
while True:
i = text.find(needle, pos)
if i > pos:
yield pos, text[pos:i], None, default_action, None
elif i == -1:
break
yield i, needle, None, action, target
pos = i + l
if pos < len(text):
yield pos, text[pos:], None, default_action, None
elif default_target:
def parse(text, pos):
"""Parse text, stopping with the default target at unknown text."""
while needle == text[pos:pos+l]:
yield pos, needle, None, action, target
pos += l
if pos < len(text):
yield pos, "", None, None, default_target
else:
def parse(text, pos):
"""Parse text, skipping unknown text."""
while True:
i = text.find(needle, pos)
if i == -1:
break
yield i, needle, None, action, target
pos = i + l
return parse
# compile the regexp for all patterns
rx = re.compile("|".join("(?P<g_{0}>{1})".format(i, pattern)
for i, pattern in enumerate(patterns)), self.re_flags)
        # make a fast mapping list from matchobj.lastindex to the rules;
        # rules that contain Item instances go in the dynamic list
indices = sorted(v for k, v in rx.groupindex.items() if k.startswith('g_'))
static = [None] * (indices[-1] + 1)
dynamic = [None] * (indices[-1] + 1)
for i, rule in zip(indices, rules):
if needs_evaluation(rule):
dynamic[i] = rule
else:
action, *target = rule
static[i] = (action, make_target(self, target))
        # if a rule contains no dynamic items, static[i] holds its prepared
        # (action, target); otherwise dynamic[i] holds the rule, which is
        # evaluated for each match via replace()
def token(m):
"""Return pos, text, match, *rule for the match object."""
return (m.start(), m.group(), m, *(static[m.lastindex] or replace(m)))
def replace(m):
"""Recursively replace dynamic rule items in the rule pointed to by match object."""
action, *target = evaluate_rule(dynamic[m.lastindex], m)
return action, make_target(self, target)
if dynamic_default_action:
finditer = rx.finditer
def parse(text, pos):
"""Parse text, using a dynamic default action for unknown text."""
for m in finditer(text, pos):
if m.start() > pos:
t = text[pos:m.start()]
yield pos, t, None, dynamic_default_action(t), None
yield token(m)
pos = m.end()
if pos < len(text):
t = text[pos:]
yield pos, t, None, dynamic_default_action(t), None
elif default_action is not no_default_action:
finditer = rx.finditer
def parse(text, pos):
"""Parse text, using a default action for unknown text."""
for m in finditer(text, pos):
if m.start() > pos:
yield pos, text[pos:m.start()], None, default_action, None
yield token(m)
pos = m.end()
if pos < len(text):
yield pos, text[pos:], None, default_action, None
elif default_target:
match = rx.match
def parse(text, pos):
"""Parse text, stopping with the default target at unknown text."""
while True:
m = match(text, pos)
if m:
yield token(m)
pos = m.end()
else:
if pos < len(text):
yield pos, "", None, None, default_target
break
else:
finditer = rx.finditer
def parse(text, pos):
"""Parse text, skipping unknown text."""
return map(token, finditer(text, pos))
return parse