# Source code for parce.lang.scheme

# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
Scheme.

Tries to adhere to the official Scheme syntax, especially the complicated
number syntax. For more information, see:

* https://www.gnu.org/software/guile/manual/r5rs.html#Formal-syntax
* https://www.scheme.com/tspl4/grammar.html

Besides the :class:`Scheme` language definition and a subclass
:class:`SchemeLily` that is used for Scheme code embedded into LilyPond, there
is a helper function :func:`scheme_number` that parses the Scheme number tokens
into a Python value. This function can be used when transforming/parsing the
Scheme tokens into some data model that accesses the Scheme values.

"""

# Names exported by ``from parce.lang.scheme import *``.
# NOTE(review): scheme_is_indenting_keyword() is defined in this module but is
# not listed here -- confirm whether that omission is intentional.
__all__ = ('Scheme', 'SchemeLily', 'scheme_number', 'scheme_number_from_text')

import re


from parce import Language, lexicon, skip, default_action, default_target
from parce.action import (
    Bracket, Character, Comment, Delimiter, Keyword, Name, Number, Operator,
    Separator, String,
)
from parce.rule import (
    ARG, MATCH, TEXT, bygroup, call, dselect, findmember, ifarg, ifmember,
    pattern,
)


# Zero-width lookahead that must follow a complete token: end of input/line,
# a parenthesis, whitespace, a semicolon, or the "#(" sequence.
RE_SCHEME_RIGHT_BOUND = r"(?=$|[()\s;]|#\()"

# The special (non-alphanumeric) characters that may start an identifier.
RE_SCHEME_ID_SPECIAL_INITIAL = r'!$%&*/:<=>?^_~'

# First character of an identifier: a word character that is not a digit, or
# one of the special initial characters.
RE_SCHEME_ID_INITIAL = r'(?:[^\W\d]|[{}])'.format(RE_SCHEME_ID_SPECIAL_INITIAL)

# Any following character: word characters, the special initials and "@.+-".
RE_SCHEME_ID_SUBSEQUENT = r'[\w{}@.+-]'.format(RE_SCHEME_ID_SPECIAL_INITIAL)

# The "peculiar" identifiers: a lone "+" or "-", or "...".
RE_SCHEME_ID_PECULIAR = r'[-+]|\.{3}'

# A complete identifier: a peculiar identifier, or an initial character
# followed by any number of subsequent characters; in both cases the right
# bound lookahead must match directly after it.
RE_SCHEME_ID = '(?:{peculiar}|{initial}(?:{subsequent})*){bound}'.format(
    peculiar=RE_SCHEME_ID_PECULIAR,
    initial=RE_SCHEME_ID_INITIAL,
    subsequent=RE_SCHEME_ID_SUBSEQUENT,
    bound=RE_SCHEME_RIGHT_BOUND,
)


class Scheme(Language):
    """Language definition for Scheme.

    The :attr:`root` lexicon only yields the rules from :meth:`common`; lists,
    vectors, strings, comments and numbers each have their own lexicon.

    """
    @lexicon
    def root(cls):
        """The root lexicon; contains just the :meth:`common` rules."""
        yield from cls.common()

    @classmethod
    def common(cls, pop=0):
        """Yield common stuff. ``pop`` can be set to -1 for one-arg mode.

        ``pop`` is inserted as the first target of every rule except the
        quote delimiter rule. With the default value 0, an extra rule for a
        lone dot (a dot not followed by a non-whitespace character) is added
        at the end.

        """
        yield r"['`]|,@?", Delimiter.Scheme.Quote
        yield r"\(", Delimiter.OpenParen, pop, cls.list
        yield r"#\(", Delimiter.OpenVector, pop, cls.vector
        yield r'"', String, pop, cls.string
        yield r';', Comment, pop, cls.singleline_comment
        yield r'#!', Comment, pop, cls.multiline_comment

        yield r"#[tTfF]\b", Number.Boolean, pop
        yield r"#\\([a-z]+|.)", Character, pop
        yield RE_SCHEME_ID, cls.get_word_action(), pop

        # Radix prefix (#b, #o, #x or #d), optionally preceded or followed by
        # an exactness prefix (#e or #i). Group 3 is the bare radix letter; it
        # lies inside group 2, hence the ``skip`` action for that group.
        # The radix letter selects the prefix action and the derived number
        # lexicon; the last tuple is presumably the fallback used for "d"
        # (decimal) -- see the parce ``findmember`` documentation.
        _g = lambda action: bygroup(Number.Prefix, action, skip, Number.Prefix)
        yield r'(#[eEiI])?(#([bBoOxXdD]))(#[eEiI])?', findmember(MATCH[3], (
            ('bB', (_g(Number.Prefix.Binary), pop, cls.number(2))),
            ('oO', (_g(Number.Prefix.Octal), pop, cls.number(8))),
            ('xX', (_g(Number.Prefix.Hexadecimal), pop, cls.number(16)))),
               (_g(Number.Prefix.Decimal), pop, cls.number))
        yield r'#[eEiI]', Number.Prefix, pop, cls.number
        # The dot in "inf.0" and "nan.0" is escaped so that only a literal
        # dot matches (an unescaped "." would accept any character there).
        yield r'[-+]inf\.0', Number.Infinity, pop, cls.number
        yield r'[-+]nan\.0', Number.NaN, pop, cls.number
        yield r'[-+]', Operator.Sign, pop, cls.number
        yield r'(\.?)(\d+)', bygroup(Number.Dot, Number.Decimal), pop, cls.number

        if pop == 0:
            # a dot that is not followed by a non-whitespace character
            yield r"\.(?!\S)", Delimiter.Dot

    @lexicon(consume=True)
    def list(cls):
        """A parenthesized list; a closing paren pops back one level."""
        yield r"\)", Delimiter.CloseParen, -1
        yield from cls.common()

    @lexicon(consume=True)
    def vector(cls):
        """A vector (started with ``#(``); a closing paren pops back one level."""
        yield r"\)", Delimiter.CloseVector, -1
        yield from cls.common()

    @classmethod
    def get_word_action(cls):
        """Return a dynamic action that is chosen based on the text.

        The action is ``Keyword`` when the text is a member of
        ``scheme_words.keywords``, and ``Name`` otherwise.

        """
        from . import scheme_words
        return ifmember(TEXT, scheme_words.keywords, Keyword, Name)

    # -------------- Number ---------------------
    @lexicon(consume=True, re_flags=re.I)
    def number(self):
        """Decimal numbers, derive with 2 for binary, 8 for octal, 16 for hexadecimal numbers.

        The lexicon argument (``ARG``) is the radix; without an argument the
        number is decimal, and only then the exponent and dot rules exist.
        Patterns are matched case-insensitively (``re.I``).

        """
        # a token boundary ends the number
        yield RE_SCHEME_RIGHT_BOUND, None, -1
        # character class with the digits that are valid in the radix
        # (10 digits when there is no lexicon argument)
        _pat = lambda radix: '[{}]+'.format('0123456789abcdef'[:radix or 10])
        yield pattern(call(_pat, ARG)), \
            dselect(ARG, {2: Number.Binary, 8: Number.Octal, 16: Number.Hexadecimal}, Number.Decimal)
        # literal dot escaped, just as in common()
        yield r'[-+]inf\.0', Number.Infinity
        yield r'[-+]nan\.0', Number.NaN
        yield r'[-+]', Operator.Sign
        yield 'i', Number.Imaginary
        # exponent marker with optional sign, and decimal dot: decimal only
        yield ifarg(None, '([esfdl])([-+])?'), bygroup(Number.Exponent, Operator.Sign)
        yield ifarg(None, r'\.'), Number.Dot
        yield '@', Separator.Polar
        yield '/', Separator.Fraction
        # one or more "#" characters in place of digits
        yield '#+', Number.Special.UnknownDigit
        yield default_action, Number.Invalid

    # -------------- String ---------------------
    @lexicon(consume=True)
    def string(cls):
        """A double-quoted string; the closing quote pops back one level."""
        yield r'"', String, -1
        yield from cls.string_common()

    @classmethod
    def string_common(cls):
        """Yield the rules for string contents: escapes, all else ``String``."""
        yield r'\\[\\"|afnrtvb]', String.Escape
        yield default_action, String

    # -------------- Comment ---------------------
    @lexicon(consume=True)
    def multiline_comment(cls):
        """A ``#!`` comment; it is ended by ``!#``."""
        yield r'!#', Comment, -1
        # NOTE(review): comment_common() is not defined in this class;
        # presumably it is inherited from Language -- verify.
        yield from cls.comment_common()

    @lexicon(re_flags=re.MULTILINE, consume=True)
    def singleline_comment(cls):
        """A ``;`` comment; it is ended at the end of the line."""
        yield from cls.comment_common()
        # with re.MULTILINE, "$" also matches at the end of every line
        yield r'$', Comment, -1


class SchemeLily(Scheme):
    """Scheme used with LilyPond."""
    @lexicon(consume=True)
    def scheme(cls):
        """Pick one thing and pop back.

        Whitespace is skipped. The common rules get the :attr:`argument`
        lexicon as their first target; when nothing matches, the default
        target pops back one level.

        """
        yield r'\s+', skip
        yield from cls.common(cls.argument)
        yield default_target, -1

    @lexicon(consume=True)
    def argument(cls):
        """One Scheme expression.

        This lexicon has no rules of its own: its default target immediately
        pops two levels.

        """
        yield default_target, -2

    @classmethod
    def common(cls, pop=0):
        """Yield the :meth:`Scheme.common` rules, preceded by a rule for ``#{``.

        The ``#{`` rule gets the ``Bracket.LilyPond.Start`` action and
        targets ``pop`` followed by the ``schemelily`` lexicon of the
        LilyPond language definition.

        """
        # imported here and not at module level
        # (presumably to avoid a circular import -- TODO confirm)
        from . import lilypond
        yield r"#{", Bracket.LilyPond.Start, pop, lilypond.LilyPond.schemelily
        yield from super().common(pop)


def scheme_number(tokens):
    """Return the Python value of the Scheme number in the specified tokens
    iterable.

    All ``tokens`` that can be in the :meth:`Scheme.number` context are
    supported. Supports all features: nan, +/- inf, fractions, exactness,
    complex numbers and polar coordinates.

    Depending on the input, the returned value is an ``int``, a ``float``, a
    :class:`fractions.Fraction` or a ``complex``.

    Raises ValueError or ZeroDivisionError on faulty input.

    Usage example::

        >>> text = '123'
        >>> from parce import root
        >>> for n in root(Scheme.root, text):
        ...     if n == Scheme.number:
        ...         value = scheme_number(n)
        ...

    """

    import cmath
    import fractions
    import math
    from parce.util import split_list

    # radix prefix letter -> (radix, action of the digit tokens in that radix)
    _radix_map = {
        'b': (2, Number.Binary),
        'o': (8, Number.Octal),
        'd': (10, Number.Decimal),
        'x': (16, Number.Hexadecimal),
    }

    # These three are read by the helper functions below; the main body may
    # rebind them while reading the prefixes, before any helper is called.
    mantisse_action, radix = Number.Decimal, 10
    exact = None    # True after #e, False after #i, None without such prefix

    def get_uint(tokens):
        """Get an unsigned integer from the tokens.

        Returns a float when there were unknown digits (``#``) and there was
        no exact prefix (``#e``)

        """
        if not tokens:
            raise ValueError("expecting unsigned integer (radix: {})".format(radix))
        v = 0
        for t in tokens:
            if t.action is mantisse_action:
                v = int(t.text, radix)
            elif t.action is Number.Special.UnknownDigit:
                # Every "#" stands for one unknown digit, counted as zero, so
                # the value shifts one position in the radix per "#".
                # get_decimal10() does the same by inserting one "0" per "#".
                v *= radix ** len(t.text)
                return float(v) if not exact else v
            else:
                raise ValueError("unknown token in radix {}: {}".format(radix, repr(t.text)))
        return v

    def get_decimal10(tokens):
        """Get a decimal10 value from the tokens. Only called in decimal mode.

        Returns an int when there was no dot or exponent (a float when the
        ``#i`` prefix was given); otherwise a float, or a Fraction when the
        ``#e`` prefix was given.

        """
        v = []              # the pieces of the text to convert
        integral = True     # False as soon as a dot or an exponent was seen
        i, z = 0, len(tokens)
        while i < z:
            t = tokens[i]
            if t.action is Number.Decimal:
                v.append(t.text)
            elif t.action is Number.Special.UnknownDigit:
                v.append('0' * len(t.text))
            elif integral and t.action is Number.Dot:
                v.append('.')
                integral = False
            elif t.action is Number.Exponent:
                integral = False
                # Python only understands "e" as exponent marker
                v.append('e')
                i += 1
                # read optional sign(s) and the digits of the exponent
                while i < z:
                    t = tokens[i]
                    if t.action is Operator.Sign:
                        v.append(t.text)
                    elif t.action is Number.Decimal:
                        v.append(t.text)
                        break
                    else:
                        raise ValueError("unknown token in exponent: {}".format(repr(t.text)))
                    i += 1
                else:
                    raise ValueError("missing exponent")
                # tokens after the exponent digits are not looked at
                break
            else:
                raise ValueError("unknown token in decimal 10: {}".format(repr(t.text)))
            i += 1
        s = ''.join(v)
        if s:
            if integral:
                return float(s) if exact is False else int(s)
            return fractions.Fraction(s) if exact else float(s)
        raise ValueError("expecting decimal value")

    def get_real(tokens):
        """Return a real value from the tokens (can be int, float or Fraction.)."""
        # get a sign, inf or nan
        i, z = 0, len(tokens)
        sign = 1
        while i < z:
            t = tokens[i]
            if t.action is Operator.Sign:
                if t == '-':
                    sign *= -1
            elif t.action is Number.Infinity:
                return math.inf if t.text[0] == '+' else -math.inf
            elif t.action is Number.NaN:
                return math.nan
            else:
                break
            i += 1
        # now, get either uint, uint/uint or decimal10
        tokens, *fract = split_list(tokens[i:], '/')
        if fract:
            numerator = get_uint(tokens)
            denominator = get_uint(fract[0])
            # true division when a float is involved or #i was given,
            # otherwise keep the exact fraction
            if isinstance(numerator, float) or isinstance(denominator, float) or exact is False:
                v = numerator / denominator
            else:
                v = fractions.Fraction(numerator, denominator)
        elif radix == 10:
            v = get_decimal10(tokens)
        else:
            v = get_uint(tokens)
        return sign * v

    def get_complex(tokens):
        """Return a complex value from the tokens.

        The last token is expected to be the ``i``; it is never looked at.

        """
        # find the imaginary part, searching backwards from the token before
        # the final "i"
        i = len(tokens) - 2
        while i >= 0:
            t = tokens[i]
            if t.action in (Number.Infinity, Number.NaN):
                imag = get_real(tokens[i:-1])
                break
            elif t.action is Operator.Sign and t.group is None:
                # (for a -/+ sign after an exponent, t.group is -1)
                if i == len(tokens) - 2:
                    imag = 1 if t == '+' else -1    # the +i or -i case
                else:
                    imag = get_real(tokens[i:-1])
                break
            i -= 1
        else:
            raise ValueError("invalid complex number")
        # all tokens before the imaginary part form the real part
        real = get_real(tokens[:i]) if i else 0
        return complex(real, imag)

    ### main function body
    tokens = list(tokens)

    # get the prefixes
    i, z = 0, len(tokens)
    while i < z:
        t = tokens[i]
        if t.action in Number.Prefix:
            # the prefix text is "#" followed by one letter
            p = t.text[1].lower()
            if p == 'i':
                exact = False
            elif p == 'e':
                exact = True
            else:
                radix, mantisse_action = _radix_map[p]
        else:
            break
        i += 1

    # a "@" separates the magnitude from the angle of a polar number
    tokens, *polar = split_list(tokens[i:], '@')

    if polar:
        return cmath.rect(get_real(tokens), get_real(polar[0]))
    elif tokens and tokens[-1].text.lower() == 'i':
        return get_complex(tokens)
    return get_real(tokens)


def scheme_number_from_text(text):
    """Proof-of-concept/test function parsing Scheme/Guile number syntax.

    The ``text`` is parsed using the :class:`Scheme` root lexicon, and the
    first child of the resulting tree that is a :meth:`Scheme.number` context
    is converted to a Python value with :func:`scheme_number`.

    Usage::

        >>> from parce.lang.scheme import scheme_number_from_text
        >>> scheme_number_from_text('123')
        123
        >>> scheme_number_from_text('123+3i')
        (123+3j)
        >>> scheme_number_from_text('#x123')
        291
        >>> scheme_number_from_text('#o13')
        11
        >>> scheme_number_from_text('1/3')
        Fraction(1, 3)
        >>> scheme_number_from_text('#i1/3')
        0.3333333333333333
        >>> scheme_number_from_text('#xdead/beef')
        Fraction(57005, 48879)

    Raises ValueError or ZeroDivisionError on invalid input; a ValueError is
    also raised when the text yields no number context at all.

    """
    from parce import root
    for n in root(Scheme.root, text):
        # only the first number context is used
        if n == Scheme.number:
            return scheme_number(n)
    raise ValueError("invalid number: {}".format(repr(text)))


def scheme_is_indenting_keyword(text):
    """Tell whether the keyword ``text`` makes the next line indent normally.

    Returns True when the line following the keyword should get a normal
    indent instead of being aligned with the previous line, False otherwise.

    This is the case for every word starting with "def" (such as "define")
    and for the words listed below; they do not follow the standard Scheme
    indentation patterns.

    Both the "def" rule and the word list come from the GNU Emacs source
    code, which sets the standard for GNU Guile Scheme indentation.

    See: https://git.savannah.gnu.org/cgit/emacs.git/tree/lisp/progmodes/scheme.el
    See also: http://community.schemewiki.org/?scheme-style

    """
    # "define", "define-syntax", "defmacro", ...
    if text.startswith('def'):
        return True
    return text in {
        'begin',
        'call-with-input-file',
        'call-with-output-file',
        'call-with-values',
        'case',
        'delay',
        'do',
        'dynamic-wind',
        'lambda',
        'let',
        'let*',
        'letrec',
        'letrec*',
        'letrec-syntax',
        'let-syntax',
        'let*-values',
        'let-values',
        'library',
        'parameterize',
        'receive',
        'sequence',
        'syntax-case',
        'syntax-rules',
        'unless',
        'when',
        'with-input-from-file',
        'with-input-from-port',
        'with-output-to-file',
        'with-output-to-port',
    }