# Source code for parce.lang.python

# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
Parse Python.

"""

__all__ = ('Python', 'PythonConsole')

import re

from parce import lexicon, Language, skip, default_action, default_target
from parce.action import (
    Character, Comment, Data, Delimiter, Escape, Invalid, Keyword, Literal,
    Name, Number, Operator, String, Whitespace)
from parce.rule import (
    ARG, MATCH, TEXT, arg, bygroup, call, derive, dselect, findmember, ifarg,
    ifeq, ifgroup, ifmember, pattern, select, words)

from . import python_words


# Regular expression fragments that are interpolated into the rule patterns
# below; every fragment has a descriptive name and a short private alias.

# An identifier: one word character that is not a digit, followed by any
# number of word characters.
RE_PYTHON_IDENTIFIER = _I_ = r'[^\W\d]\w*'
# One whitespace character that is not a newline.
RE_PYTHON_HORIZ_SPACE = _S_ = r'[^\S\n]'
# A backslash immediately followed by a newline.
RE_PYTHON_LINE_CONTINUATION = _N_ = r'\\\n'
# Either one horizontal space character or one line continuation.
_SN_ = fr'(?:{_S_}|{_N_})'


# Shorthand for the action used for bytes literals.
Bytes = Data.Bytes


class Python(Language):
    """Lexicons for parsing Python source text."""
    @lexicon(re_flags=re.MULTILINE)
    def root(cls):
        """The root lexicon: statements at any indentation level."""
        # group 1 only participates when nothing but a comment or the end of
        # the line follows the whitespace; ifgroup uses it to choose between
        # the Whitespace and Whitespace.Indent actions
        yield fr'^{_S_}+($|(?=#))?', ifgroup(1, Whitespace, Whitespace.Indent)
        yield r'@', Name.Decorator, cls.decorator
        yield fr'(class\b){_S_}*({_I_})', bygroup(Keyword,
            ifmember(MATCH[2], python_words.keywords, Invalid, Name.Class.Definition)), cls.classdef
        yield fr'(def\b){_S_}*({_I_})', bygroup(Keyword,
            ifmember(MATCH[2], python_words.keywords, Invalid, Name.Function.Definition)), cls.funcdef
        yield fr':(?={_S_}*(?:$|#))', Delimiter.Indent
        # the (?!=) prevents "name == value" from being read as an assignment
        yield fr'({_I_})\s*(=)(?!=)', bygroup(
            select(call(str.isupper, TEXT),
                   select(call(isclassname, TEXT), Name.Variable, Name.Class),
                   Name.Constant),
            Operator.Assignment)
        yield from cls.common()

    @classmethod
    def common(cls):
        """Yield the rules shared by the root lexicon and the nested lexicons."""
        yield r'#', Comment, cls.comment
        yield fr'({_N_})(\s*)', bygroup(Escape, Whitespace)
        yield r'\[', Delimiter, cls.list
        yield r'\(', Delimiter, cls.tuple
        yield r'\{', Delimiter, cls.dict

        ## string literals
        yield from cls.find_string_literals()
        yield from cls.find_bytes_literals()

        ## numerical values
        yield '0[oO](?:_?[0-7])+', Number.Octal
        yield '0[bB](?:_?[01])+', Number.Binary
        yield '0[xX](?:_?[0-9a-fA-F])+', Number.Hexadecimal
        # the sign of the exponent is optional: 1e10 is a valid float literal
        yield r'(?:\.\d(?:_?\d)*|\d(?:_?\d)*(?:\.(?:\d(?:_?\d)*)?)?)(?:[eE][-+]?\d(?:_?\d)*)?[jJ]?', Number

        ## keywords, variables, functions
        yield words(python_words.keywords, prefix=r'\b', suffix=r'\b'), Keyword
        yield words(python_words.constants, prefix=r'\b', suffix=r'\b'), Name.Constant
        # self or cls, optionally followed by an opening ( or [; the bracket
        # gets its own Delimiter token, like in the two rules below
        yield fr'\b(self|cls)\b(?:{_SN_}*([\[\(]))?', \
            bygroup(Name.Variable.Special, Delimiter), \
            dselect(MATCH[2], {'(': cls.call, '[': cls.item})
        # method, class or attribute (keywords after a . are also caught)
        yield fr'(\.){_SN_}*\b({_I_})\b(?:{_SN_}*([\[\(]))?', \
            bygroup(
                Delimiter,
                ifmember(MATCH[2], python_words.keywords,
                    Keyword,
                    dselect(MATCH[3], {'(': select(call(isclassname, TEXT), Name.Method, Name.Class)},
                         select(call(str.isupper, TEXT),
                             select(call(isclassname, TEXT), Name.Attribute, Name.Class),
                             Name.Constant))),
                Delimiter), \
            dselect(MATCH[3], {'(': cls.call, '[': cls.item})
        # function, class or variable
        yield fr'\b({_I_})\b(?:{_SN_}*([\[\(]))?', \
            bygroup(
                findmember(MATCH[1],
                    ((python_words.builtins, Name.Builtin),
                     (python_words.exceptions, Name.Exception)),
                    select(call(str.isupper, TEXT),
                        select(call(isclassname, TEXT),
                            dselect(MATCH[2], {'(': Name.Function}, Name.Variable),
                            Name.Class),
                        Name.Constant)),
                Delimiter), \
            dselect(MATCH[2], {'(': cls.call, '[': cls.item})

        ## delimiters, operators
        yield r'\.\.\.', Delimiter.Special.Ellipsis
        # this rule precedes the comparison operators, so it must not match
        # the first = of ==; the (?!=) leaves == to the Operator rule below
        yield r'(?:\*\*|//|<<|>>|[-+*/%@&|^:])?=(?!=)', Operator.Assignment
        yield r'\*\*|//|<<|>>|[<>=!]=|[-+*/%@&|^~<>]', Operator
        yield r'[.;,:]', Delimiter

    @lexicon(re_flags=re.MULTILINE)
    def decorator(cls):
        """A decorator."""
        yield _I_, Name.Decorator
        yield r'\[', Delimiter, cls.item
        yield r'\(', Delimiter, cls.call
        yield r'\.', Delimiter
        # the end of the line ends the decorator
        yield '$', None, -1
        yield r'\\\n', Escape
        yield r'#', Comment, -1, cls.comment

    @lexicon
    def funcdef(cls):
        """A function definition."""
        yield r'\(', Delimiter, cls.signature
        yield r'->', Delimiter.Annotation
        # the colon ends the definition header
        yield r':', Delimiter.Indent, -1
        yield r'#', Comment, -1, cls.comment
        yield from cls.common()

    @lexicon
    def signature(cls):
        """A function signature."""
        yield r'\)', Delimiter, -1
        yield r':', Delimiter.Annotation
        yield from cls.common()

    @lexicon
    def classdef(cls):
        """A class definition."""
        yield r'\(', Delimiter, cls.bases
        # the colon ends the definition header
        yield ":", Delimiter.Indent, -1
        yield r'#', Comment, -1, cls.comment
        yield from cls.common()

    @lexicon
    def bases(cls):
        """The base classes in a class definition."""
        yield r'\)', Delimiter, -1
        yield from cls.common()

    ## ------ expressions -----------
    @lexicon
    def item(cls):
        """Stuff between xxx[ and ] (getitem)."""
        yield r'\]', Delimiter, -1
        yield from cls.common()

    @lexicon
    def call(cls):
        """Stuff between xxx( and ) (call)."""
        yield r'\)', Delimiter, -1
        yield from cls.common()

    ## ----- item types -------------
    @lexicon
    def list(cls):
        """Stuff between [ and ] (list display)."""
        yield r'\]', Delimiter, -1
        yield ',', Delimiter
        yield from cls.common()

    @lexicon
    def tuple(cls):
        """Stuff between ( and ) (tuple or parenthesized expression)."""
        yield r'\)', Delimiter, -1
        yield ',', Delimiter
        yield from cls.common()

    @lexicon
    def dict(cls):
        """Stuff between { and } (dict or set display)."""
        yield r'\}', Delimiter, -1
        yield '[,:]', Delimiter
        yield from cls.common()

    ## ------- strings --------------
    @classmethod
    def find_string_literals(cls, target=None, allow_newlines=None):
        """Find string literals.

        ``target`` is placed before the derived string lexicon in the target
        of every rule. When it is None, the ``string`` lexicon derived with
        ``allow_newlines`` as argument is used.

        """
        # short strings not closed on the same line are invalid
        yield r'''[rRuUfF]{,2}["']$''', String.Invalid

        if target is None:
            target = cls.string(allow_newlines)

        # long strings
        yield r'(\b[rR])("""|'r"''')", \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.long_string_raw, MATCH[2])
        # the \b applies to both orders of the two-letter prefix
        yield r'(\b(?:[fF][rR]|[rR][fF]))("""|'r"''')", \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.long_string_raw_format, MATCH[2])
        yield r'(\b[uU])?("""|'r"''')", \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.long_string, MATCH[2])
        yield r'(\b[fF])("""|'r"''')", \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.long_string_format, MATCH[2])

        # short strings
        yield r'''(\b[rR])(['"])''', \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.short_string_raw, MATCH[2])
        # the \b applies to both orders of the two-letter prefix
        yield r'''(\b(?:[fF][rR]|[rR][fF]))(['"])''', \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.short_string_raw_format, MATCH[2])
        yield r'''(\b[uU])?(['"])''', \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.short_string, MATCH[2])
        yield r'''(\b[fF])(['"])''', \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.short_string_format, MATCH[2])

    @lexicon
    def string(cls):
        """All strings end here, check [slice] notation and concatenated literals."""
        yield _N_, Escape
        yield ifarg(r'\s+', r'[ \t]+'), skip    # allow newline inside arglists, tuples, etc
        # target 0: a following literal is lexed while staying in this lexicon
        yield from cls.find_string_literals(0)
        yield r'\[', Delimiter, cls.item
        yield default_target, -1

    @lexicon(re_flags=re.MULTILINE)
    def short_string(cls):
        """A short string; the lexicon argument is the quote character."""
        yield from cls.string_escape()
        yield from cls.short_string_common()

    @lexicon(re_flags=re.MULTILINE)
    def short_string_raw(cls):
        """A short raw string; the lexicon argument is the quote character."""
        yield from cls.short_string_raw_common()

    @lexicon(re_flags=re.MULTILINE)
    def short_string_format(cls):
        """A short formatted string; the lexicon argument is the quote character."""
        yield from cls.string_formatstring()
        yield from cls.string_escape()
        yield from cls.short_string_common()

    @lexicon(re_flags=re.MULTILINE)
    def short_string_raw_format(cls):
        """A short raw formatted string; the lexicon argument is the quote character."""
        yield from cls.string_formatstring()
        yield from cls.short_string_raw_common()

    @classmethod
    def short_string_common(cls):
        """Yield the rules ending a short string, regularly or at a line end."""
        yield arg(), String.End, -1
        # text up to the end of the line without a closing quote is invalid
        yield pattern(ifeq(ARG, "'", r"[^']*?$", r'[^"]*?$')), String.Invalid, -1
        yield default_action, String

    @classmethod
    def short_string_raw_common(cls):
        """Yield the rules ending a short raw string, regularly or at a line end."""
        yield arg(), String.End, -1
        yield r'\\\\', String
        yield pattern(ifeq(ARG, "'", fr"([^\\']*?|\\'{_S_}*)$", fr'([^\\"]*?|\\"{_S_}*)$')), String.Invalid, -1
        yield arg(prefix=r'\\'), String  # escape quote, but the \ remains
        yield default_action, String

    @lexicon
    def long_string(cls):
        """A long string; the lexicon argument is the triple quote."""
        yield from cls.string_escape()
        yield from cls.long_string_common()

    @lexicon
    def long_string_raw(cls):
        """A long raw string; the lexicon argument is the triple quote."""
        yield arg(prefix=r'\\'), String  # escape quote, but the \ remains
        yield from cls.long_string_common()

    @lexicon
    def long_string_format(cls):
        """A long formatted string; the lexicon argument is the triple quote."""
        yield from cls.string_formatstring()
        yield from cls.string_escape()
        yield from cls.long_string_common()

    @lexicon
    def long_string_raw_format(cls):
        """A long raw formatted string; the lexicon argument is the triple quote."""
        yield arg(prefix=r'\\'), String  # escape quote, but the \ remains
        yield from cls.string_formatstring()
        yield from cls.long_string_common()

    @classmethod
    def long_string_common(cls):
        """Yield the rules common to all long strings."""
        yield arg(), String.End, -1
        yield default_action, String

    # ------ stuff common for short and long strings ---------
    @classmethod
    def string_escape(cls):
        """Yield the escape sequences of non-raw strings."""
        yield from cls.bytes_escape(String.Escape)
        yield r'\\N\{[^\}]+\}', String.Escape
        yield r'\\u[0-9a-fA-F]{4}', String.Escape
        yield r'\\U[0-9a-fA-F]{8}', String.Escape

    @classmethod
    def string_formatstring(cls):
        """Yield the rules for doubled braces and replacement fields."""
        yield r'\{\{|\}\}', String.Escape
        yield r'\{', Delimiter.Template, cls.string_format_expr

    @lexicon
    def string_format_expr(cls):
        """The expression in a replacement field of a formatted string."""
        yield '![sra]', Character
        yield ':', Delimiter, cls.string_format_spec
        yield r'\}', Delimiter.Template, -1
        yield from cls.common()

    @lexicon
    def string_format_spec(cls):
        """The format specification after the colon in a replacement field."""
        yield r'\{', Delimiter, cls.string_format_spec_nested
        # the closing brace also leaves the enclosing string_format_expr
        yield r'\}', Delimiter.Template, -2
        yield from cls.common() # TODO maybe really parse format strings

    @lexicon
    def string_format_spec_nested(cls):
        """A nested replacement field inside a format specification."""
        yield r'\}', Delimiter, -1
        yield from cls.common()

    # ----------------- bytes --------------------
    @classmethod
    def find_bytes_literals(cls, target=None, allow_newlines=None):
        """Find bytes literals.

        ``target`` is placed before the derived bytes lexicon in the target
        of every rule. When it is None, the ``bytes`` lexicon derived with
        ``allow_newlines`` as argument is used.

        """
        # short bytes not closed on the same line are invalid
        yield r'''[rRbB]{,2}["']$''', Bytes.Invalid

        if target is None:
            target = cls.bytes(allow_newlines)

        # long bytes
        # the \b applies to both orders of the two-letter prefix
        yield r'(\b(?:[bB][rR]|[rR][bB]))("""|'r"''')", \
            bygroup(Bytes.Prefix, Bytes.Start), \
            target, derive(cls.long_bytes_raw, MATCH[2])
        yield r'(\b[bB])("""|'r"''')", \
            bygroup(Bytes.Prefix, Bytes.Start), \
            target, derive(cls.long_bytes, MATCH[2])

        # short bytes
        # the \b applies to both orders of the two-letter prefix
        yield r'''(\b(?:[bB][rR]|[rR][bB]))(['"])''', \
            bygroup(Bytes.Prefix, Bytes.Start), \
            target, derive(cls.short_bytes_raw, MATCH[2])
        yield r'''(\b[bB])(['"])''', \
            bygroup(Bytes.Prefix, Bytes.Start), \
            target, derive(cls.short_bytes, MATCH[2])

    @lexicon
    def bytes(cls):
        """All bytes end here, check [slice] notation and concatenated literals."""
        yield _N_, Escape
        yield ifarg(r'\s+', r'[ \t]+'), skip    # allow newline inside arglists, tuples, etc
        # target 0: a following literal is lexed while staying in this lexicon
        yield from cls.find_bytes_literals(0)
        yield r'\[', Delimiter, cls.item
        yield default_target, -1

    @lexicon(re_flags=re.MULTILINE)
    def short_bytes(cls):
        """A short bytes literal; the lexicon argument is the quote character."""
        yield from cls.bytes_escape()
        yield from cls.short_bytes_common()

    @lexicon(re_flags=re.MULTILINE)
    def short_bytes_raw(cls):
        """A short raw bytes literal; the lexicon argument is the quote character."""
        yield from cls.short_bytes_raw_common()

    @lexicon
    def long_bytes(cls):
        """A long bytes literal; the lexicon argument is the triple quote."""
        yield from cls.bytes_escape()
        yield from cls.long_bytes_common()

    @lexicon
    def long_bytes_raw(cls):
        """A long raw bytes literal; the lexicon argument is the triple quote."""
        yield from cls.long_bytes_common()

    @classmethod
    def short_bytes_common(cls):
        """Yield the rules ending a short bytes literal, regularly or at a line end."""
        yield arg(), Bytes.End, -1
        # text up to the end of the line without a closing quote is invalid
        yield pattern(ifeq(ARG, "'", r"[^']*?$", r'[^"]*?$')), Bytes.Invalid, -1
        yield default_action, Bytes

    @classmethod
    def short_bytes_raw_common(cls):
        """Yield the rules ending a short raw bytes literal, regularly or at a line end."""
        yield r'\\\\', Bytes
        yield pattern(ifeq(ARG, "'", fr"([^\\']*?|\\'{_S_}*)$", fr'([^\\"]*?|\\"{_S_}*)$')), Bytes.Invalid, -1
        yield arg(prefix=r'\\'), Bytes  # escape quote, but the \ remains
        yield from cls.long_bytes_common()

    @classmethod
    def long_bytes_common(cls):
        """Yield the rules common to all long bytes literals."""
        yield arg(), Bytes.End, -1
        yield default_action, Bytes

    @classmethod
    def bytes_escape(cls, action=Bytes.Escape):
        """Yield the escape sequences shared by bytes and string literals.

        ``action`` is the action given to every escape sequence.

        """
        yield r'''\\[\n\\'"abfnrtv]''', action
        # an octal escape consists of one to three octal digits (0-7)
        yield r'\\[0-7]{1,3}', action
        yield r'\\x[0-9a-fA-F]{2}', action

    ## ------- comments -------------
    @lexicon(re_flags=re.MULTILINE)
    def comment(cls):
        """A comment, running to the end of the line."""
        # comment_common is not defined in this class; presumably it is
        # inherited from Language
        yield from cls.comment_common()
        yield r'$', Comment, -1



# A ">>>" or "..." prompt at the start of a line, followed by a space or
# by the end of the line.
RE_PYTHON_PROMPT = r'(?:(?<=\n)|^)(?:>>>|\.\.\.)(?: |$|(?=\n))'
# A complete line that does not start with ">>>" or "..." (relies on
# re.MULTILINE, which the root lexicon that uses this pattern sets).
RE_PYTHON_NO_PROMPT = r'^((?!^(>>>|\.\.\.)).)*$'
# Like RE_PYTHON_PROMPT, but only matching the "..." continuation prompt.
RE_PYTHON_CONTINUATION_PROMPT = r'(?:(?<=\n)|^)\.\.\.(?: |$|(?=\n))'


class PythonConsole(Python):
    """Python console input and output with prompt."""
    @lexicon(re_flags=re.MULTILINE)
    def root(cls):
        """The root lexicon: tracebacks, output lines and prompted input."""
        # the lookahead consumes no text; the traceback lexicon handles the
        # traceback itself
        yield r'(?=^Traceback \(most recent call last\):)', Literal.Error, cls.traceback
        # a line that does not start with a prompt is output
        yield RE_PYTHON_NO_PROMPT, Literal.Output
        yield from super().root

    @classmethod
    def common(cls):
        """Yield the prompt rule, followed by the common rules of Python."""
        yield RE_PYTHON_PROMPT, Literal.Prompt
        yield from super().common()

    @classmethod
    def long_string_common(cls):
        """Yield the long string rules, preceded by a continuation prompt rule."""
        yield RE_PYTHON_CONTINUATION_PROMPT, Literal.Prompt
        yield from super().long_string_common()

    @classmethod
    def long_bytes_common(cls):
        """Yield the long bytes rules, preceded by a continuation prompt rule."""
        yield RE_PYTHON_CONTINUATION_PROMPT, Literal.Prompt
        yield from super().long_bytes_common()

    @lexicon(re_flags=re.MULTILINE)
    def traceback(cls):
        """A traceback; ends right before the next ">>>" at a line start."""
        yield r'(?=^>>>)', Literal.Prompt, -1
        yield default_action, Literal.Error


def isclassname(text):
    """Tell whether the first character after any leading underscores is uppercase.

    Returns False when text is empty or consists of underscores only.

    """
    # slicing instead of indexing yields '' when nothing is left after
    # stripping, and ''.isupper() is False
    initial = text.lstrip('_')[:1]
    return initial.isupper()