# Source code for parce.lang.xml

# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
Parse XML.

"""

__all__ = ('Dtd', 'Xml')

import re

from parce import Language, lexicon, skip, default_action, docio, root
from parce.action import (
    Bracket, Comment, Data, Delimiter, Escape, Invalid, Keyword, Name,
    Operator, String, Text, Whitespace)
from parce.rule import (
    MATCH, TEXT, bygroup, call, dselect, ifgroup, select, words)


# Building blocks for regular expressions matching XML names.
#
# The two *_CHAR constants are character-class *fragments*: they are meant to
# be interpolated between "[" and "]", not used as patterns on their own.
#
# source: https://www.w3.org/TR/xml/#NT-NameStartChar
RE_XML_NAME_START_CHAR = (
    '_:A-Za-z\xC0-\xD6\xD8-\xF6'
    '\xF8-\u02FF\u0370-\u037D\u037F-\u1FFF'
    '\u200C\u200D\u2070-\u218F\u2C00-\u2FEF'
    '\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
    '\U00010000-\U000EFFFF'
)
# NameChar: the leading "-" must stay first so that it is a literal hyphen
# inside the character class instead of denoting a range.
RE_XML_NAME_CHAR = '-.0-9\xB7\u0300-\u036F\u203F-\u2040' + RE_XML_NAME_START_CHAR
# Name ::= NameStartChar (NameChar)*
RE_XML_NAME = _N_ = fr'[{RE_XML_NAME_START_CHAR}][{RE_XML_NAME_CHAR}]*'
# Nmtoken ::= (NameChar)+
# At least one character is required; with "*" the pattern would also match
# the empty string at every position.
RE_XML_NAME_TOKEN = _T_ = fr'[{RE_XML_NAME_CHAR}]+'

class _XmlBase(Language):
    """Lexicons and rule helpers shared by the Xml and Dtd languages."""
    @lexicon
    def dqstring(cls):
        """Contents of a double-quoted string, up to the closing quote."""
        yield from (
            (r'&\S*?;', String.Escape),
            (default_action, String.Double),
            (r'"', String.Double.End, -1),
        )

    @lexicon
    def sqstring(cls):
        """Contents of a single-quoted string, up to the closing quote."""
        yield from (
            (r'&\S*?;', String.Escape),
            (default_action, String.Single),
            (r"'", String.Single.End, -1),
        )

    @lexicon
    def comment(cls):
        """Contents of a ``<!-- -->`` comment."""
        yield from (
            (r'-->', Comment.End, -1),
            # a double hyphen that does not end the comment is marked invalid
            (r'--', Comment.Invalid),
        )
        yield from cls.comment_common()

    @classmethod
    def common_defs(cls):
        """Yield rules shared by DOCTYPE, ENTITY and similar declarations.

        These are the string rules from :meth:`find_strings`, followed by a
        rule for a ``%name;`` reference.

        """
        yield from cls.find_strings()
        yield (fr'%{_N_};', Name.Entity.Escape)

    @classmethod
    def find_strings(cls):
        """Yield the rules that start a double- or single-quoted string."""
        yield from (
            (r'"', String.Double.Start, cls.dqstring),
            (r"'", String.Single.Start, cls.sqstring),
        )

    @classmethod
    def find_comments(cls):
        """Yield the rule that starts a ``<!--`` comment."""
        yield (r'<!--', Comment.Start, cls.comment)


class Xml(_XmlBase):
    """Parse XML.

    ``root`` holds the rules for document content; ``tag`` reuses those rules
    for the content of an element. The other lexicons handle CDATA sections,
    processing instructions, the DOCTYPE declaration (with its internal DTD
    subset) and the attributes of a start tag.

    """
    @lexicon(re_flags=re.IGNORECASE)
    def root(cls):
        """Document content: comments, CDATA, DOCTYPE, PIs, tags and text."""
        yield from cls.find_comments()
        yield r'(<!\[)(CDATA)(\[)', bygroup(Delimiter, Data.Definition, Delimiter), cls.cdata
        # the name after DOCTYPE is optional in the pattern
        yield fr'(<!)(DOCTYPE)\b(?:\s*({_N_}))?', \
            bygroup(Delimiter, Keyword, Name.Tag.Definition), cls.doctype
        yield fr'(<\?)({_N_})?', bygroup(Bracket.Preprocessed.Start, Name.Tag.Preprocessed), \
            cls.processing_instruction
        # subclasses may override tag_action() to get another action for names
        tag_action = cls.tag_action()
        # a closing tag leaves the current context
        yield fr'(<\s*?/)\s*({_N_})\s*(>)', bygroup(Delimiter, tag_action, Delimiter), -1
        # an opening tag; the third group is ">", "/>" or missing
        yield fr'(<)\s*({_N_})(?:\s*((?:/\s*)?>))?', \
            bygroup(Delimiter, tag_action, Delimiter), dselect(MATCH[3], {
                None: cls.attrs,        # no ">" or "/>": go to attrs
                ">": cls.tag,           # a ">": go to tag
            })                          # by default ("/>"): stay in context
        yield r'&\S*?;', Escape
        # text consisting of only whitespace gets Whitespace, other text Text
        yield default_action, select(call(str.isspace, TEXT), Text, Whitespace)

    @lexicon
    def tag(cls):
        """Content of an element; uses the same rules as :meth:`root`."""
        yield from cls.root()

    @lexicon
    def cdata(cls):
        """Contents of a CDATA section, up to the closing ``]]>``."""
        yield default_action, Data
        yield r'\]\]>', Delimiter, -1

    @lexicon
    def processing_instruction(cls):
        """Contents of a processing instruction, up to the closing ``?>``."""
        # a name only counts as an attribute when "=" and a quote follow it
        yield fr'({_N_})\s*?(=)(?=\s*?["\'])', bygroup(Name.Attribute, Operator)
        yield from cls.find_strings()
        yield r'&\S*?;', Escape
        yield r'\?>', Bracket.Preprocessed.End, -1
        yield default_action, Text.Preprocessed

    @lexicon
    def doctype(cls):
        """Contents of a ``<!DOCTYPE`` declaration, up to the closing ``>``."""
        yield words(("SYSTEM", "PUBLIC", "NDATA")), Keyword
        yield _N_, Name
        yield from cls.common_defs()
        # "[" starts the internal DTD subset
        yield r'\[', Bracket, cls.internal_dtd
        yield r'>', Delimiter, -1

    @lexicon
    def internal_dtd(cls):
        """Internal DTD subset between ``[`` and ``]``; uses the Dtd rules."""
        # the closing bracket comes first, followed by the rules of Dtd.root
        yield r'\]', Bracket, -1
        yield from Dtd.root

    @lexicon
    def attrs(cls):
        """Attributes in a start tag, up to the closing ``>`` or ``/>``."""
        yield _N_, Name.Attribute
        yield r'=', Operator
        yield from cls.find_strings()
        # "/>" ends a self-closing tag: just leave this context
        yield r'/\s*>', Delimiter, -1
        # ">" ends the start tag: leave this context and enter the tag contents
        yield r'>', Delimiter, -1, cls.tag
        yield r'\s+', skip
        yield default_action, Invalid

    @classmethod
    def tag_action(cls):
        """Return the action for a tag name.

        The default implementation returns the Name.Tag standard action, but
        alternate implementations may use the TEXT placeholder.

        """
        return Name.Tag


class Dtd(_XmlBase):
    """Parse a DTD (Document Type Definition).

    ``root`` recognizes comments, the ENTITY, ELEMENT, ATTLIST and NOTATION
    declarations and ``%name;`` references; every declaration has its own
    lexicon that ends at the closing ``>``.

    """
    @lexicon
    def root(cls):
        """Top level of a DTD: comments, declarations and ``%name;``."""
        yield from cls.find_comments()
        # both the "%" and the entity name are optional in the pattern
        yield fr'(<!)(ENTITY)\b(?:\s*(%))?(?:\s*({_N_}))?', \
            bygroup(Delimiter, Keyword, Keyword, Name.Entity.Definition), cls.entity
        # the keyword selects the lexicon; NOTATION takes the default one
        yield fr'(<!)(ELEMENT|ATTLIST|NOTATION)\b(?:\s*({_N_}))?', \
            bygroup(Delimiter, Keyword, Name.Element.Definition), \
            dselect(MATCH[2], {"ELEMENT": cls.element, "ATTLIST": cls.attlist}, cls.notation)
        yield fr'%{_N_};', Name.Entity.Escape
        # text consisting of only whitespace is skipped, other text gets Text
        yield default_action, select(call(str.isspace, TEXT), Text, skip)

    @lexicon
    def entity(cls):
        """Contents of an ``<!ENTITY`` declaration, up to the closing ``>``."""
        yield words(("SYSTEM", "PUBLIC", "NDATA")), Keyword
        yield _N_, Name.Entity
        yield from cls.common_defs()
        yield r'>', Delimiter, -1

    @lexicon
    def element(cls):
        """Contents of an ``<!ELEMENT`` declaration, up to the closing ``>``."""
        yield r'\(', Bracket, cls.element_contents
        yield words(("ANY", "EMPTY")), Name.Keyword
        yield r'[,|?+*]', Operator
        yield from cls.common_defs()
        yield r'>', Delimiter, -1

    @lexicon
    def element_contents(cls):
        """Content definition inside a <!ELEMENT > declaration."""
        yield r'#PCDATA', Name.Builtin
        yield from cls.enumerate(r'[,|?+*]', Name.Element)

    @lexicon
    def attlist(cls):
        """Contents of an ``<!ATTLIST`` declaration, up to the closing ``>``."""
        yield words(("#REQUIRED", "#IMPLIED", "#FIXED"), suffix=r'\b'), Name.Builtin
        yield words(('CDATA', 'ID', 'IDREF', 'IDREFS', 'ENTITY', 'ENTITIES',
            'NMTOKEN', 'NMTOKENS'), prefix=r'\b', suffix=r'\b'), Name.Type
        # NOTATION is only matched here together with its opening parenthesis
        yield r'\b(NOTATION)\b(?:\s+(\())', bygroup(Name.Type, Bracket), \
            ifgroup(2, cls.attlist_notation)
        yield _N_, Name.Attribute.Definition
        yield r'\(', Bracket, cls.attlist_enumeration
        yield from cls.common_defs()
        yield r'>', Delimiter, -1

    @lexicon
    def attlist_enumeration(cls):
        """Enumerated attribute values between ``(`` and ``)``."""
        yield from cls.enumerate(r'\|', Data)

    @lexicon
    def attlist_notation(cls):
        """Notation names between ``(`` and ``)`` after ``NOTATION``."""
        yield from cls.enumerate(r'\|', Name.Type)

    @lexicon
    def notation(cls):
        """Contents of a ``<!NOTATION`` declaration, up to the closing ``>``."""
        yield words(("SYSTEM", "PUBLIC")), Keyword
        yield from cls.common_defs()
        yield r'>', Delimiter, -1

    @classmethod
    def enumerate(cls, operators=r'\|', nametype=Name.Type):
        """Find names between ( ), and operators, string and parameter entities.

        ``operators`` is the regexp for the operators, ``nametype`` the action
        for the found names.

        """
        # "(" uses target 1 and ")" target -1, so nested parentheses balance
        yield r'\(', Bracket, 1
        yield r'\)', Bracket, -1
        yield operators, Operator
        yield _T_, nametype
        yield from cls.common_defs()


class XmlIO(docio.IO):
    """Document I/O helper for XML text."""
    def find_encoding(self, text):
        """Return the encoding named in a processing instruction in ``text``.

        The text is parsed with ``Xml.root``. The result is the text of the
        first token in the double-quoted string to the right of an
        ``encoding`` attribute in a processing instruction. ``None`` is
        returned when no such token is found.

        """
        tree = root(Xml.root, text)
        instructions = tree.query.children(Xml.processing_instruction)
        encoding_attrs = instructions.children.action(Name.Attribute)('encoding')
        # NOTE(review): only double-quoted values are looked at; confirm
        # whether single-quoted values (Xml.sqstring) should be found as well.
        values = encoding_attrs.right_siblings(Xml.dqstring)[0]
        for token in values:
            # the first result wins
            return token.text
        return None