# Source code for parce.lang.xml

# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
Parse XML.

"""

__all__ = ('Dtd', 'Xml')

import re

from parce import Language, lexicon, skip, default_action, docio, root
from parce.action import (
    Bracket, Comment, Data, Delimiter, Escape, Invalid, Keyword, Name,
    Operator, String, Text, Whitespace)
from parce.rule import (
    MATCH, TEXT, bygroup, call, dselect, ifgroup, select, words)


# Character sets taken from the XML 1.0 grammar, see
# https://www.w3.org/TR/xml/#NT-NameStartChar
#
# RE_XML_NAME_START_CHAR and RE_XML_NAME_CHAR are *contents* of a regular
# expression character class (they are interpolated between "[" and "]"
# below); they are not usable as stand-alone patterns.
RE_XML_NAME_START_CHAR = (
    '_:A-Za-z\xC0-\xD6\xD8-\xF6'
    '\xF8-\u02FF\u0370-\u037D\u037F-\u1FFF'
    '\u200C\u200D\u2070-\u218F\u2C00-\u2FEF'
    '\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
    '\U00010000-\U000EFFFF'
)
# The leading "-" must stay first so that it is a literal hyphen inside the
# character class instead of forming a range.
RE_XML_NAME_CHAR = '-.0-9\xB7\u0300-\u036F\u203F-\u2040' + RE_XML_NAME_START_CHAR

# Name ::= NameStartChar (NameChar)*
RE_XML_NAME = _N_ = fr'[{RE_XML_NAME_START_CHAR}][{RE_XML_NAME_CHAR}]*'

# Nmtoken ::= (NameChar)+
# A name token needs at least one character; using "*" here would let the
# pattern match the empty string at every position.
RE_XML_NAME_TOKEN = _T_ = fr'[{RE_XML_NAME_CHAR}]+'

class _XmlBase(Language):
    """Lexicons and rule helpers shared by the Xml and Dtd languages."""

    @lexicon
    def dqstring(cls):
        """Contents of a double-quoted string, up to the closing quote."""
        entity_ref = r'&\S*?;'
        yield entity_ref, String.Escape
        yield r'"', String.Double.End, -1
        # everything that is not an entity reference or the closing quote
        yield default_action, String.Double

    @lexicon
    def sqstring(cls):
        """Contents of a single-quoted string, up to the closing quote."""
        entity_ref = r'&\S*?;'
        yield entity_ref, String.Escape
        yield r"'", String.Single.End, -1
        # everything that is not an entity reference or the closing quote
        yield default_action, String.Single

    @lexicon
    def comment(cls):
        """Contents of a ``<!-- -->`` comment."""
        # "-->" has to be listed before "--": a bare double hyphen that is
        # not part of the terminator is flagged as invalid.
        yield r'-->', Comment.End, -1
        yield r'--', Comment.Invalid
        yield from cls.comment_common()

    @classmethod
    def common_defs(cls):
        """Yield the rules shared by DOCTYPE, ENTITY and similar declarations.

        These are the quoted-string rules from :meth:`find_strings`, followed
        by a rule for ``%name;`` references.

        """
        yield from cls.find_strings()
        yield fr'%{_N_};', Name.Entity.Escape

    @classmethod
    def find_strings(cls):
        """Yield the rules that open a double- or single-quoted string."""
        yield r'"', String.Double.Start, cls.dqstring
        yield r"'", String.Single.Start, cls.sqstring

    @classmethod
    def find_comments(cls):
        """Yield the rule that opens a ``<!--`` comment."""
        yield r'<!--', Comment.Start, cls.comment


class Xml(_XmlBase):
    """Parse XML."""

    @lexicon(re_flags=re.IGNORECASE)
    def root(cls):
        """Document content: comments, declarations, tags and text.

        The patterns of this lexicon are compiled with ``re.IGNORECASE``.

        """
        yield from cls.find_comments()
        yield (
            r'(<!\[)(CDATA)(\[)',
            bygroup(Delimiter, Data.Definition, Delimiter),
            cls.cdata,
        )
        yield (
            fr'(<!)(DOCTYPE)\b(?:\s*({_N_}))?',
            bygroup(Delimiter, Keyword, Name.Tag.Definition),
            cls.doctype,
        )
        yield (
            fr'(<\?)({_N_})?',
            bygroup(Bracket.Preprocessed.Start, Name.Tag.Preprocessed),
            cls.processing_instruction,
        )

        # The action for tag names can be customised, see tag_action().
        name_action = cls.tag_action()
        tag_tokens = bygroup(Delimiter, name_action, Delimiter)

        # closing tag: "</name>"
        yield fr'(<\s*?/)\s*({_N_})\s*(>)', tag_tokens, -1

        # opening tag; group 3 (if it matched at all) is ">" or "/>"
        after_open_tag = dselect(MATCH[3], {
            None: cls.attrs,    # no ">" or "/>": go to attrs
            ">": cls.tag,       # a ">": go to tag
        })                      # by default ("/>"): stay in context
        yield (
            fr'(<)\s*({_N_})(?:\s*((?:/\s*)?>))?',
            bygroup(Delimiter, name_action, Delimiter),
            after_open_tag,
        )

        yield r'&\S*?;', Escape
        # Remaining text: whitespace-only text gets Whitespace, other text
        # gets Text (str.isspace() yields the index into the two actions).
        yield default_action, select(call(str.isspace, TEXT), Text, Whitespace)

    @lexicon
    def tag(cls):
        """Contents of an element; uses the same rules as :meth:`root`."""
        yield from cls.root()

    @lexicon
    def cdata(cls):
        """Contents of a ``<![CDATA[ ]]>`` section."""
        yield r'\]\]>', Delimiter, -1
        yield default_action, Data

    @lexicon
    def processing_instruction(cls):
        """Contents of a ``<? ?>`` processing instruction."""
        # an attribute name and "=", only when a quote character follows
        yield (
            fr'({_N_})\s*?(=)(?=\s*?["\'])',
            bygroup(Name.Attribute, Operator),
        )
        yield from cls.find_strings()
        yield r'&\S*?;', Escape
        yield r'\?>', Bracket.Preprocessed.End, -1
        yield default_action, Text.Preprocessed

    @lexicon
    def doctype(cls):
        """Contents of a ``<!DOCTYPE >`` declaration."""
        yield words(("SYSTEM", "PUBLIC", "NDATA")), Keyword
        yield _N_, Name
        yield from cls.common_defs()
        yield r'\[', Bracket, cls.internal_dtd
        yield r'>', Delimiter, -1

    @lexicon
    def internal_dtd(cls):
        """DTD rules between ``[`` and ``]`` inside a DOCTYPE declaration."""
        yield r'\]', Bracket, -1
        yield from Dtd.root

    @lexicon
    def attrs(cls):
        """Attributes of an opening tag, up to ``>`` or ``/>``."""
        yield _N_, Name.Attribute
        yield r'=', Operator
        yield from cls.find_strings()
        yield r'/\s*>', Delimiter, -1
        yield r'>', Delimiter, -1, cls.tag
        yield r'\s+', skip
        # anything else is not expected inside a tag
        yield default_action, Invalid

    @classmethod
    def tag_action(cls):
        """Return the action to use for a tag name.

        This default implementation returns the standard action ``Name.Tag``;
        alternate implementations may use the TEXT placeholder.

        """
        return Name.Tag
class Dtd(_XmlBase):
    """Parse a DTD (Document Type Definition)."""

    @lexicon
    def root(cls):
        """Top level of a DTD: comments, declarations and entity references."""
        yield from cls.find_comments()
        yield (
            fr'(<!)(ENTITY)\b(?:\s*(%))?(?:\s*({_N_}))?',
            bygroup(Delimiter, Keyword, Keyword, Name.Entity.Definition),
            cls.entity,
        )

        # The declaration keyword (group 2) selects the lexicon to enter;
        # NOTATION is handled by the fall-back value.
        declaration_target = dselect(
            MATCH[2],
            {"ELEMENT": cls.element, "ATTLIST": cls.attlist},
            cls.notation,
        )
        yield (
            fr'(<!)(ELEMENT|ATTLIST|NOTATION)\b(?:\s*({_N_}))?',
            bygroup(Delimiter, Keyword, Name.Element.Definition),
            declaration_target,
        )

        yield fr'%{_N_};', Name.Entity.Escape
        # Remaining text: whitespace-only text is skipped, other text gets
        # the Text action.
        yield default_action, select(call(str.isspace, TEXT), Text, skip)

    @lexicon
    def entity(cls):
        """Contents of a ``<!ENTITY >`` declaration."""
        yield words(("SYSTEM", "PUBLIC", "NDATA")), Keyword
        yield _N_, Name.Entity
        yield from cls.common_defs()
        yield r'>', Delimiter, -1

    @lexicon
    def element(cls):
        """Contents of a ``<!ELEMENT >`` declaration."""
        yield r'\(', Bracket, cls.element_contents
        yield words(("ANY", "EMPTY")), Name.Keyword
        yield r'[,|?+*]', Operator
        yield from cls.common_defs()
        yield r'>', Delimiter, -1

    @lexicon
    def element_contents(cls):
        """Content definition inside a <!ELEMENT > declaration."""
        yield r'#PCDATA', Name.Builtin
        yield from cls.enumerate(r'[,|?+*]', Name.Element)

    @lexicon
    def attlist(cls):
        """Contents of a ``<!ATTLIST >`` declaration."""
        default_keywords = ("#REQUIRED", "#IMPLIED", "#FIXED")
        attribute_types = (
            'CDATA', 'ID', 'IDREF', 'IDREFS', 'ENTITY', 'ENTITIES',
            'NMTOKEN', 'NMTOKENS',
        )
        yield words(default_keywords, suffix=r'\b'), Name.Builtin
        yield words(attribute_types, prefix=r'\b', suffix=r'\b'), Name.Type
        yield (
            r'\b(NOTATION)\b(?:\s+(\())',
            bygroup(Name.Type, Bracket),
            ifgroup(2, cls.attlist_notation),
        )
        yield _N_, Name.Attribute.Definition
        yield r'\(', Bracket, cls.attlist_enumeration
        yield from cls.common_defs()
        yield r'>', Delimiter, -1

    @lexicon
    def attlist_enumeration(cls):
        """Enumerated values between ``( )`` in an ATTLIST declaration."""
        yield from cls.enumerate(r'\|', Data)

    @lexicon
    def attlist_notation(cls):
        """Notation names between ``( )`` after the NOTATION keyword."""
        yield from cls.enumerate(r'\|', Name.Type)

    @lexicon
    def notation(cls):
        """Contents of a ``<!NOTATION >`` declaration."""
        yield words(("SYSTEM", "PUBLIC")), Keyword
        yield from cls.common_defs()
        yield r'>', Delimiter, -1

    @classmethod
    def enumerate(cls, operators=r'\|', nametype=Name.Type):
        """Yield rules for names, operators, strings and parameter entities
        between ( ).

        ``operators`` is the regular expression matching the operators;
        ``nametype`` is the action given to the names that are found.

        An opening parenthesis uses the target ``1`` and a closing one the
        target ``-1``.

        """
        yield r'\(', Bracket, 1
        yield r'\)', Bracket, -1
        yield operators, Operator
        yield _T_, nametype
        yield from cls.common_defs()
class XmlIO(docio.IO):
    """I/O handling for XML."""

    def find_encoding(self, text):
        """Find the encoding named in an XML processing instruction.

        The text is parsed with ``Xml.root`` and the resulting tree is queried
        for an ``encoding`` attribute inside a processing instruction. The
        ``text`` of the first query result is returned; ``None`` is returned
        when the query yields nothing.

        """
        tree = root(Xml.root, text)
        # NOTE(review): only Xml.dqstring contexts are selected here, so a
        # value written as encoding='...' (single quotes) is presumably not
        # found -- confirm whether that is intended.
        encoding_values = (
            tree.query
            .children(Xml.processing_instruction)
            .children.action(Name.Attribute)('encoding')
            .right_siblings(Xml.dqstring)[0]
        )
        for value in encoding_values:
            return value.text
        return None