Source code for parce.lang.python

# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
Parse Python.

"""

__all__ = ('Python', 'PythonConsole')

import re

from parce import lexicon, Language, skip, default_action, default_target
from parce.action import (
    Character, Comment, Data, Delimiter, Escape, Invalid, Keyword, Literal,
    Name, Number, Operator, String, Whitespace)
from parce.rule import (
    ARG, MATCH, TEXT, arg, bygroup, call, derive, dselect, findmember, ifarg,
    ifeq, ifgroup, ifmember, pattern, select, words)

from . import python_words


# Regular expression fragments shared by the lexicons below.  Each one is
# available under a descriptive public name and a short private alias that
# keeps the f-string patterns readable.

# an identifier: a word character that is not a digit, then any word characters
RE_PYTHON_IDENTIFIER = _I_ = r'[^\W\d]\w*'
# one whitespace character that is not a newline
RE_PYTHON_HORIZ_SPACE = _S_ = r'[^\S\n]'
# a backslash directly followed by a newline
RE_PYTHON_LINE_CONTINUATION = _N_ = r'\\\n'
# either of the two above: horizontal space or a backslash-newline
_SN_ = fr'(?:{_S_}|{_N_})'


# shorthand for the action used for all bytes literal tokens
Bytes = Data.Bytes


class Python(Language):
    """Lexicons for Python source text.

    ``root`` handles statement-level constructs (indent, decorators,
    ``class``/``def`` headers, simple assignments) and then falls back to the
    rules from :meth:`common`, which are shared by all nested expression
    lexicons.

    """
    @lexicon(re_flags=re.MULTILINE)
    def root(cls):
        """The top-level lexicon."""
        # leading horizontal space; group 1 is set when the line is blank or
        # holds only a comment
        yield fr'^{_S_}+($|(?=#))?', ifgroup(1, Whitespace, Whitespace.Indent)
        yield r'@', Name.Decorator, cls.decorator
        # a keyword used as class or function name is marked Invalid
        yield fr'(class\b){_S_}*({_I_})', bygroup(Keyword,
            ifmember(MATCH[2], python_words.keywords, Invalid, Name.Class.Definition)), cls.classdef
        yield fr'(def\b){_S_}*({_I_})', bygroup(Keyword,
            ifmember(MATCH[2], python_words.keywords, Invalid, Name.Function.Definition)), cls.funcdef
        # a colon that is the last thing on the line (a comment may follow)
        yield fr':(?={_S_}*(?:$|#))', Delimiter.Indent
        # name = ...; the (?!=) prevents "name ==" from being taken as an
        # assignment
        yield fr'({_I_})\s*(=)(?!=)', bygroup(
            select(call(str.isupper, TEXT),
                select(call(isclassname, TEXT), Name.Variable, Name.Class),
                Name.Constant),
            Operator.Assignment)
        yield from cls.common()

    @classmethod
    def common(cls):
        """Yield the rules that are shared by root and all expression lexicons."""
        yield r'#', Comment, cls.comment
        yield fr'({_N_})(\s*)', bygroup(Escape, Whitespace)
        yield r'\[', Delimiter, cls.list
        yield r'\(', Delimiter, cls.tuple
        yield r'\{', Delimiter, cls.dict

        ## string literals
        yield from cls.find_string_literals()
        yield from cls.find_bytes_literals()

        ## numerical values
        yield '0[oO](?:_?[0-7])+', Number.Octal
        yield '0[bB](?:_?[01])+', Number.Binary
        yield '0[xX](?:_?[0-9a-fA-F])+', Number.Hexadecimal
        # decimal integer, float or imaginary number; the sign of the exponent
        # is optional (1e5 is a valid float)
        yield r'(?:\.\d(?:_?\d)*|\d(?:_?\d)*(?:\.(?:\d(?:_?\d)*)?)?)(?:[eE][-+]?\d(?:_?\d)*)?[jJ]?', Number

        ## keywords, variables, functions
        yield words(python_words.keywords, prefix=r'\b', suffix=r'\b'), Keyword
        yield words(python_words.constants, prefix=r'\b', suffix=r'\b'), Name.Constant
        # self or cls, optionally followed by ( or [; the bracket gets its own
        # Delimiter token, just like in the two rules below
        yield fr'\b(self|cls)\b(?:{_SN_}*([\[\(]))?', \
            bygroup(Name.Variable.Special, Delimiter), \
            dselect(MATCH[2], {'(': cls.call, '[': cls.item})
        # method, class or attribute (keywords after a . are also caught)
        yield fr'(\.){_SN_}*\b({_I_})\b(?:{_SN_}*([\[\(]))?', \
            bygroup(
                Delimiter,
                ifmember(MATCH[2], python_words.keywords,
                    Keyword,
                    dselect(MATCH[3], {'(': select(call(isclassname, TEXT), Name.Method, Name.Class)},
                        select(call(str.isupper, TEXT),
                            select(call(isclassname, TEXT), Name.Attribute, Name.Class),
                            Name.Constant))),
                Delimiter), \
            dselect(MATCH[3], {'(': cls.call, '[': cls.item})
        # function, class or variable
        yield fr'\b({_I_})\b(?:{_SN_}*([\[\(]))?', \
            bygroup(
                findmember(MATCH[1], (
                    (python_words.builtins, Name.Builtin),
                    (python_words.exceptions, Name.Exception)),
                    select(call(str.isupper, TEXT),
                        select(call(isclassname, TEXT),
                            dselect(MATCH[2], {'(': Name.Function}, Name.Variable),
                            Name.Class),
                        Name.Constant)),
                Delimiter), \
            dselect(MATCH[2], {'(': cls.call, '[': cls.item})

        ## delimiters, operators
        yield r'\.\.\.', Delimiter.Special.Ellipsis
        # plain, augmented and walrus assignment; the (?!=) leaves "==" to the
        # comparison operators in the next rule
        yield r'(?:\*\*|//|<<|>>|[-+*/%@&|^:])?=(?!=)', Operator.Assignment
        yield r'\*\*|//|<<|>>|[<>=!]=|[-+*/%@&|^~<>]', Operator
        yield r'[.;,:]', Delimiter

    @lexicon(re_flags=re.MULTILINE)
    def decorator(cls):
        """A decorator."""
        yield _I_, Name.Decorator
        yield r'\[', Delimiter, cls.item
        yield r'\(', Delimiter, cls.call
        yield r'\.', Delimiter
        yield '$', None, -1
        yield r'\\\n', Escape
        yield r'#', Comment, -1, cls.comment

    @lexicon
    def funcdef(cls):
        """A function definition."""
        yield r'\(', Delimiter, cls.signature
        yield r'->', Delimiter.Annotation
        yield r':', Delimiter.Indent, -1
        yield r'#', Comment, -1, cls.comment
        yield from cls.common()

    @lexicon
    def signature(cls):
        """A function signature."""
        yield r'\)', Delimiter, -1
        yield r':', Delimiter.Annotation
        yield from cls.common()

    @lexicon
    def classdef(cls):
        """A class definition."""
        yield r'\(', Delimiter, cls.bases
        yield ":", Delimiter.Indent, -1
        yield r'#', Comment, -1, cls.comment
        yield from cls.common()

    @lexicon
    def bases(cls):
        """The base classes in a class definition."""
        yield r'\)', Delimiter, -1
        yield from cls.common()

    ## ------ expressions -----------
    @lexicon
    def item(cls):
        """Stuff between xxx[ and ] (getitem)."""
        yield r'\]', Delimiter, -1
        yield from cls.common()

    @lexicon
    def call(cls):
        """Stuff between xxx( and ) (call)."""
        yield r'\)', Delimiter, -1
        yield from cls.common()

    ## ----- item types -------------
    @lexicon
    def list(cls):
        """Stuff between [ and ] (list display)."""
        yield r'\]', Delimiter, -1
        yield ',', Delimiter
        yield from cls.common()

    @lexicon
    def tuple(cls):
        """Stuff between ( and ) (tuple or parenthesized expression)."""
        yield r'\)', Delimiter, -1
        yield ',', Delimiter
        yield from cls.common()

    @lexicon
    def dict(cls):
        """Stuff between { and } (dict or set display)."""
        yield r'\}', Delimiter, -1
        yield '[,:]', Delimiter
        yield from cls.common()

    ## ------- strings --------------
    @classmethod
    def find_string_literals(cls, target=None, allow_newlines=None):
        """Find string literals.

        If ``target`` is None, ``cls.string(allow_newlines)`` is used as the
        first target of every rule; the second target is the string lexicon
        proper, derived with the matched quote(s) as argument.

        """
        # short strings not closed on the same line are invalid
        yield r'''[rRuUfF]{,2}["']$''', String.Invalid
        if target is None:
            target = cls.string(allow_newlines)
        # long strings
        yield r'(\b[rR])("""|'r"''')", \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.long_string_raw, MATCH[2])
        # the \b applies to both the fr and the rf spelling
        yield r'(\b(?:[fF][rR]|[rR][fF]))("""|'r"''')", \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.long_string_raw_format, MATCH[2])
        yield r'(\b[uU])?("""|'r"''')", \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.long_string, MATCH[2])
        yield r'(\b[fF])("""|'r"''')", \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.long_string_format, MATCH[2])
        # short strings
        yield r'''(\b[rR])(['"])''', \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.short_string_raw, MATCH[2])
        yield r'''(\b(?:[fF][rR]|[rR][fF]))(['"])''', \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.short_string_raw_format, MATCH[2])
        yield r'''(\b[uU])?(['"])''', \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.short_string, MATCH[2])
        yield r'''(\b[fF])(['"])''', \
            bygroup(String.Prefix, String.Start), \
            target, derive(cls.short_string_format, MATCH[2])

    @lexicon
    def string(cls):
        """All strings end here, check [slice] notation and concatenated literals."""
        yield _N_, Escape
        yield ifarg(r'\s+', r'[ \t]+'), skip   # allow newline inside arglists, tuples, etc
        yield from cls.find_string_literals(0)
        yield r'\[', Delimiter, cls.item
        yield default_target, -1

    @lexicon(re_flags=re.MULTILINE)
    def short_string(cls):
        """A single-quoted or double-quoted string on one line."""
        yield from cls.string_escape()
        yield from cls.short_string_common()

    @lexicon(re_flags=re.MULTILINE)
    def short_string_raw(cls):
        """A short string with the r prefix."""
        yield from cls.short_string_raw_common()

    @lexicon(re_flags=re.MULTILINE)
    def short_string_format(cls):
        """A short string with the f prefix."""
        yield from cls.string_formatstring()
        yield from cls.string_escape()
        yield from cls.short_string_common()

    @lexicon(re_flags=re.MULTILINE)
    def short_string_raw_format(cls):
        """A short string with the fr or rf prefix."""
        yield from cls.string_formatstring()
        yield from cls.short_string_raw_common()

    @classmethod
    def short_string_common(cls):
        """Yield the end, invalid and default rules for a short string."""
        yield arg(), String.End, -1
        # text up to the end of the line without the closing quote
        yield pattern(ifeq(ARG, "'", r"[^']*?$", r'[^"]*?$')), String.Invalid, -1
        yield default_action, String

    @classmethod
    def short_string_raw_common(cls):
        """Yield the end, invalid and default rules for a short raw string."""
        yield arg(), String.End, -1
        yield r'\\\\', String
        yield pattern(ifeq(ARG, "'",
            fr"([^\\']*?|\\'{_S_}*)$",
            fr'([^\\"]*?|\\"{_S_}*)$')), String.Invalid, -1
        yield arg(prefix=r'\\'), String     # escape quote, but the \ remains
        yield default_action, String

    @lexicon
    def long_string(cls):
        """A triple-quoted string."""
        yield from cls.string_escape()
        yield from cls.long_string_common()

    @lexicon
    def long_string_raw(cls):
        """A triple-quoted string with the r prefix."""
        yield arg(prefix=r'\\'), String     # escape quote, but the \ remains
        yield from cls.long_string_common()

    @lexicon
    def long_string_format(cls):
        """A triple-quoted string with the f prefix."""
        yield from cls.string_formatstring()
        yield from cls.string_escape()
        yield from cls.long_string_common()

    @lexicon
    def long_string_raw_format(cls):
        """A triple-quoted string with the fr or rf prefix."""
        yield arg(prefix=r'\\'), String     # escape quote, but the \ remains
        yield from cls.string_formatstring()
        yield from cls.long_string_common()

    @classmethod
    def long_string_common(cls):
        """Yield the end and default rules for a long string."""
        yield arg(), String.End, -1
        yield default_action, String

    # ------ stuff common for short and long strings ---------
    @classmethod
    def string_escape(cls):
        """Yield the escape sequences for non-raw strings."""
        yield from cls.bytes_escape(String.Escape)
        yield r'\\N\{[^\}]+\}', String.Escape
        yield r'\\u[0-9a-fA-F]{4}', String.Escape
        yield r'\\U[0-9a-fA-F]{8}', String.Escape

    @classmethod
    def string_formatstring(cls):
        """Yield the rules for doubled braces and replacement fields in f-strings."""
        yield r'\{\{|\}\}', String.Escape
        yield r'\{', Delimiter.Template, cls.string_format_expr

    @lexicon
    def string_format_expr(cls):
        """The expression part of a replacement field in an f-string."""
        yield '![sra]', Character
        yield ':', Delimiter, cls.string_format_spec
        yield r'\}', Delimiter.Template, -1
        yield from cls.common()

    @lexicon
    def string_format_spec(cls):
        """The format specifier after the colon in a replacement field."""
        yield r'\{', Delimiter, cls.string_format_spec_nested
        yield r'\}', Delimiter.Template, -2
        yield from cls.common()  # TODO maybe really parse format strings

    @lexicon
    def string_format_spec_nested(cls):
        """Stuff between { and } inside a format specifier."""
        yield r'\}', Delimiter, -1
        yield from cls.common()

    # ----------------- bytes --------------------
    @classmethod
    def find_bytes_literals(cls, target=None, allow_newlines=None):
        """Find bytes literals.

        If ``target`` is None, ``cls.bytes(allow_newlines)`` is used as the
        first target of every rule; the second target is the bytes lexicon
        proper, derived with the matched quote(s) as argument.

        """
        # short bytes not closed on the same line are invalid
        yield r'''[rRbB]{,2}["']$''', Bytes.Invalid
        if target is None:
            target = cls.bytes(allow_newlines)
        # long bytes
        # the \b applies to both the br and the rb spelling
        yield r'(\b(?:[bB][rR]|[rR][bB]))("""|'r"''')", \
            bygroup(Bytes.Prefix, Bytes.Start), \
            target, derive(cls.long_bytes_raw, MATCH[2])
        yield r'(\b[bB])("""|'r"''')", \
            bygroup(Bytes.Prefix, Bytes.Start), \
            target, derive(cls.long_bytes, MATCH[2])
        # short bytes
        yield r'''(\b(?:[bB][rR]|[rR][bB]))(['"])''', \
            bygroup(Bytes.Prefix, Bytes.Start), \
            target, derive(cls.short_bytes_raw, MATCH[2])
        yield r'''(\b[bB])(['"])''', \
            bygroup(Bytes.Prefix, Bytes.Start), \
            target, derive(cls.short_bytes, MATCH[2])

    @lexicon
    def bytes(cls):
        """All bytes end here, check [slice] notation and concatenated literals."""
        yield _N_, Escape
        yield ifarg(r'\s+', r'[ \t]+'), skip   # allow newline inside arglists, tuples, etc
        yield from cls.find_bytes_literals(0)
        yield r'\[', Delimiter, cls.item
        yield default_target, -1

    @lexicon(re_flags=re.MULTILINE)
    def short_bytes(cls):
        """A single-quoted or double-quoted bytes literal on one line."""
        yield from cls.bytes_escape()
        yield from cls.short_bytes_common()

    @lexicon(re_flags=re.MULTILINE)
    def short_bytes_raw(cls):
        """A short bytes literal with the br or rb prefix."""
        yield from cls.short_bytes_raw_common()

    @lexicon
    def long_bytes(cls):
        """A triple-quoted bytes literal."""
        yield from cls.bytes_escape()
        yield from cls.long_bytes_common()

    @lexicon
    def long_bytes_raw(cls):
        """A triple-quoted bytes literal with the br or rb prefix."""
        yield from cls.long_bytes_common()

    @classmethod
    def short_bytes_common(cls):
        """Yield the end, invalid and default rules for short bytes."""
        yield arg(), Bytes.End, -1
        # text up to the end of the line without the closing quote
        yield pattern(ifeq(ARG, "'", r"[^']*?$", r'[^"]*?$')), Bytes.Invalid, -1
        yield default_action, Bytes

    @classmethod
    def short_bytes_raw_common(cls):
        """Yield the invalid, end and default rules for short raw bytes."""
        yield r'\\\\', Bytes
        yield pattern(ifeq(ARG, "'",
            fr"([^\\']*?|\\'{_S_}*)$",
            fr'([^\\"]*?|\\"{_S_}*)$')), Bytes.Invalid, -1
        yield arg(prefix=r'\\'), Bytes     # escape quote, but the \ remains
        yield from cls.long_bytes_common()

    @classmethod
    def long_bytes_common(cls):
        """Yield the end and default rules for long bytes."""
        yield arg(), Bytes.End, -1
        yield default_action, Bytes

    @classmethod
    def bytes_escape(cls, action=Bytes.Escape):
        """Yield the escape sequences valid in bytes, using the given action."""
        yield r'''\\[\n\\'"abfnrtv]''', action
        # an octal escape has one to three octal digits (8 and 9 are excluded)
        yield r'\\[0-7]{1,3}', action
        yield r'\\x[0-9a-fA-F]{2}', action

    ## ------- comments -------------
    @lexicon(re_flags=re.MULTILINE)
    def comment(cls):
        """A comment that runs to the end of the line."""
        yield from cls.comment_common()
        yield r'$', Comment, -1
# Patterns used by PythonConsole; they rely on re.MULTILINE being set by the
# lexicon that uses them.

# a ">>>" or "..." prompt at the start of a line, followed by a space or the
# end of the line
RE_PYTHON_PROMPT = r'(?:(?<=\n)|^)(?:>>>|\.\.\.)(?: |$|(?=\n))'
# a whole line that does not start with a prompt
RE_PYTHON_NO_PROMPT = r'^((?!^(>>>|\.\.\.)).)*$'
# only the "..." prompt
RE_PYTHON_CONTINUATION_PROMPT = r'(?:(?<=\n)|^)\.\.\.(?: |$|(?=\n))'
class PythonConsole(Python):
    """Python console input and output with prompt."""
    @lexicon(re_flags=re.MULTILINE)
    def root(cls):
        """The top-level lexicon for a console session."""
        # a line starting a traceback switches to the traceback lexicon; the
        # pattern is a lookahead only, so it consumes no text itself
        yield r'(?=^Traceback \(most recent call last\):)', Literal.Error, cls.traceback
        # a line that does not start with a prompt is output
        yield RE_PYTHON_NO_PROMPT, Literal.Output
        yield from super().root

    @classmethod
    def common(cls):
        """Yield the prompt rule, followed by the inherited common rules."""
        yield RE_PYTHON_PROMPT, Literal.Prompt
        yield from super().common()

    @classmethod
    def long_string_common(cls):
        """Yield the continuation prompt rule, then the inherited long string rules."""
        yield RE_PYTHON_CONTINUATION_PROMPT, Literal.Prompt
        yield from super().long_string_common()

    @classmethod
    def long_bytes_common(cls):
        """Yield the continuation prompt rule, then the inherited long bytes rules."""
        yield RE_PYTHON_CONTINUATION_PROMPT, Literal.Prompt
        yield from super().long_bytes_common()

    @lexicon(re_flags=re.MULTILINE)
    def traceback(cls):
        """Traceback text, up to the next line that starts with ``>>>``."""
        yield r'(?=^>>>)', Literal.Prompt, -1
        yield default_action, Literal.Error
def isclassname(text):
    """Return True if text starts with uppercase letter.

    Starting underscores are skipped. An empty string, or a string that
    consists of underscores only, yields False.

    """
    # The first character after the leading underscores decides; the slice
    # (rather than an index) makes the empty case yield '' and thus False.
    return text.lstrip('_')[:1].isupper()