Source code for parce.regex

# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.


"""
Utility module with functions to construct or manipulate regular expressions.
"""


import operator
import re
import unicodedata


[docs]def words2regexp(words):
    """Convert the ``words`` iterable to an optimized regular expression.

    Example::

        >>> import parce.regex
        >>> parce.regex.words2regexp(['opa', 'oma', 'mama', 'papa'])
        '(?:mam|pap|o[mp])a'
        >>> parce.regex.words2regexp(['car', 'cdr', 'caar', 'cadr', 'cdar', 'cddr'])
        'c[ad]{1,2}r'

    """
    words, suffix = common_suffix(words)
    root = make_trie(words)
    r = trie_to_regexp_tuple(root)
    if suffix:
        r += (suffix,)
    return build_regexp(r)


[docs]def make_charclass(chars):
    """Return a string with adjacent characters grouped.

    Example::

        >>> parce.regex.make_charclass(('a', 'd', 'b', 'f', 'c'))
        'a-df'

    Supplying a string is also supported::

        >>> parce.regex.make_charclass("abcdefghjklmnop")
        'a-hj-p'

    Special characters are properly escaped.

    """
    buf = []
    for c in sorted(map(ord, set(chars))):
        if buf and buf[-1][1] == c - 1:
            buf[-1][1] = c
        else:
            buf.append([c, c])
    return ''.join(re.escape(chr(a)) if a == b else
                   re.escape(chr(a) + chr(b)) if a == b - 1 else
                   re.escape(chr(a)) + '-' + re.escape(chr(b))
                   for a, b in buf)


[docs]def common_suffix(words):
    """Return (words, suffix), where suffix is the common suffix.

    If there is no common suffix, words is returned unchanged, and suffix is an
    empty string. If there is a common suffix, that is chopped off the returned
    words. Example::

        >>> parce.regex.common_suffix(['opa', 'oma', 'mama', 'papa'])
        (['op', 'om', 'mam', 'pap'], 'a')

    """
    # make sure words is not a generator, othw we maybe can't iterate it twice
    if not isinstance(words, (list, tuple, set, frozenset)):
        words = tuple(words)
    suffix = []
    for s in map(set, zip(*map(reversed, words))):
        if len(s) != 1:
            break
        suffix.extend(s)
    suffix = ''.join(reversed(suffix))
    if suffix:
        chop = operator.itemgetter(slice(-len(suffix)))
        words = tuple(map(chop, words))
    return words, suffix


[docs]def to_string(expr):
    r"""Convert an unambiguous regexp to a plain string.

    If the regular expression is unambiguous and can be converted to a plain
    string, return it. Otherwise, None is returned.

    The returned string can be used with :meth:`str.find`, which would be
    faster than using :py:func:`re.search`. Examples::

        >>> parce.regex.to_string(r"a.e")
        >>> parce.regex.to_string(r"a\.e")
        'a.e'
        >>> parce.regex.to_string(r"a\ne")
        'a\ne'

    The first returns None, because the dot can match multiple characters.

    """
    if set(re.sub(r'\\(?:N\{.*?\}|.)', '', expr)) & set("^$|.()[]{}+*?"):
        return  # there are unescaped special characters like (, [, ? etc.
    # handle all escapes, there may be fails, in that case we can't use the expr as string
    pat = (r'\\(?:'
        r'x([0-9a-fA-F]{2})'        # 1 hex
        r'|([afnrtv])'              # 2 normal escaped character like \n
        r'|(0[0-7]{0,2}|[0-7]{3})'  # 3 octal
        r'|([1-9][0-9]{0,2})'       # 4 back ref, fail
        r'|u([0-9a-fA-F]{4})'       # 5 \uxxxx
        r'|U([0-9a-fA-F]{8})'       # 6 \Uxxxxxxxx
        r'|([\^\$\|\.\(\)\[\]\{\}\+\*\?\\])'  # 7 special re char that was escaped
        r'|N\{(.*?)\}'          # 8 named unicode character (since python 3.8)
        r'|)')                  # fail if stray '\' is encountered
    repl = (
        lambda s: chr(int(s, 16)),          # hex
        lambda s: chr({'a':7, 'f':12, 'n':10, 'r':13, 't':9, 'v':11}[s]),   # normal escape
        lambda s: chr(int(s, 8)),           # octal
        lambda s: None,                     # backref, fail on that
        lambda s: chr(int(s, 16)),          # \uxxxx
        lambda s: chr(int(s, 16)),          # \Uxxxxxxxx
        lambda s: s,                        # escaped re char
        lambda s: unicodedata.lookup(s),    # named unicode, can raise KeyError
    )
    def replace_escapes(m):
        i = m.lastindex     # TypeError is raised if None, or if repl returns None
        return repl[i-1](m.group(i))
    try:
        s = re.sub(pat, replace_escapes, expr)
    except (TypeError, KeyError):
        return
    assert re.fullmatch(expr, s)
    return s


[docs]def make_trie(words, reverse=False):
    """Return a dict-based radix trie structure from a list of words.

    End-points are denoted by a None key, set to True. If reverse is set to
    True, the trie is made in backward direction, from the end of the words.

    Example::

        >>> from parce.regex import make_trie
        >>> r = make_trie(["aaaa", "aaab", "aabb", "abbb", "abbbb"])
        >>> r   # output formatted nicely :-)
        {
            "a": {
                "a": {
                    "a": {
                        "a": {
                            None: True
                        },
                        "b": {
                            None: True
                        }
                    },
                    "bb": {
                        None: True
                    }
                },
                "bbb": {
                    None: True,
                    "b": {
                        None: True
                    }
                }
            }
        }

    """
    if reverse:
        chars = lambda word: word[::-1]
        add = lambda k1, k2: k2 + k1
    else:
        chars = lambda word: word
        add = lambda k1, k2: k1 + k2

    root = {}
    for w in words:
        d = root
        for c in chars(w):
            d = d.setdefault(c, {})
        d[None] = True  # end

    # merge characters that are the only child with their parents
    def merge(node):
        for key, node in node.items():
            if key:
                while len(node) == 1:
                    k, n = next(iter(node.items()))
                    if k:
                        key = add(key, k)
                        node = n
                    else:
                        break
                else:
                    node = dict(merge(node))
            yield key, node

    return dict(merge(root))


[docs]def trie_to_regexp_tuple(node, reverse=False):
    """Converts the trie node to a tuple of regular expression parts.

    A part is either a plain string expression or a frozenset instance.
    A frozenset instance denotes a group of alternative expressions, and
    consists of plain string expressions or other tuples. If None is also
    present in the frozenset, the expression is optional.

    Example::

        >>> from parce.regex import *
        >>> r = make_trie(["aaaa", "aaab", "aabb", "abbb", "abbbb"])
        >>> trie_to_regexp_tuple(r)
        (
            'a',
            frozenset({
                (
                    'a',
                    frozenset({
                        'bb',
                        (
                            'a',
                            frozenset({
                                'a',
                                'b'
                            })
                        )
                    })
                ),
                (
                    'bbb',
                    frozenset({
                        None,
                        'b'
                    })
                )
            })
        )

    This function also recognizes common suffixes within alternative
    expressions::

        >>> r = make_trie("aaaa aaba aaca abca".split())
        >>> r
        {'a':
            {'a':
                {'aa': {None: True},
                 'ba': {None: True},
                 'ca': {None: True}},
             'bca': {None: True}}}
        >>> t = trie_to_regexp_tuple(r)
        >>> t
        ('a',
         frozenset({'bca',
                    ('a',
                     frozenset({'c',
                                'a',
                                'b'}),
                     'a')}))

    (Note that the toplevel common suffix is handled by the
    :func:`common_suffix` function, which is called from :func:`words2regexp`.)


    """
    if reverse:
        combine = lambda r1, r2: r2 + r1
    else:
        combine = lambda r1, r2: r1 + r2

    if len(node) == 1:
        for k, n in node.items():
            if k:
                return combine((k,), trie_to_regexp_tuple(n, reverse))
            return ()
    else:
        seen = []
        keys = []
        groups = set()

        # group the nodes if they have the same leaf node
        for k, n in node.items():
            if k:
                try:
                    i = seen.index(n)
                except ValueError:
                    i = len(seen)
                    seen.append(n)
                    keys.append([k])
                else:
                    keys[i].append(k)
            else:
                groups.add(None)    # means optional group, may end here

        for keys, node in zip(keys, seen):
            if len(keys) == 1:
                if not any(node):
                    groups.add(keys[0])
                else:
                    groups.add(combine((keys[0],), trie_to_regexp_tuple(node, reverse)))
            else:
                if not reverse:
                    # try to optimize the keys backwards
                    r = trie_to_regexp_tuple(make_trie(keys, True), True)
                    if r == (frozenset(keys),) and not any(node):
                        groups.update(keys)
                        continue
                elif not any(node):
                    groups.update(keys)
                    continue
                else:
                    r = (frozenset(keys),)
                groups.add(combine(r, trie_to_regexp_tuple(node, reverse)))
        return groups.pop() if len(groups) == 1 else (frozenset(groups),)


[docs]def build_regexp(r):
    """Convert a tuple to a full regular expression pattern string.

    The tuple is described in the :func:`trie_to_regexp_tuple` function doc
    string.

    Example::

        >>> from parce.regex import *
        >>> r = make_trie(["aaaa", "aaab", "aabb", "abbb", "abbbb"])
        >>> t = trie_to_regexp_tuple(r)
        >>> build_regexp(t)
        'a(?:a(?:bb|a[ab])|bbbb?)'

    The main function :func:`words2regexp` uses this function internally,
    adding an extra optimization to look for a common suffix.

    """
    def get_items(r):
        """Yield regexp items from tuple r in tuples (item, mincount, maxcount).

        An item is either a string like "aa", or a two-tuple(exprs,
        tuples), where exprs is a set of plain strings, and tuples a set of
        tuples that were inside a frozenset. The mincount and maxcount are
        integers and describe the minimal or maximal required count of
        matches.

        """
        for item in r:
            if isinstance(item, str):
                yield item, 1, 1
            else:
                # item is a frozenset
                mincount = 1
                exprs = set()
                tuples = set()
                for k in item:
                    if isinstance(k, str):
                        exprs.add(k)
                    elif isinstance(k, tuple):
                        tuples.add(k)
                    elif k is None:
                        mincount = 0
                # just one expression and no subgroups?
                if len(exprs) == 1 and not tuples:
                    yield exprs.pop(), mincount, 1
                else:
                    item = (exprs, tuples)
                    # optimize for the case of only one subgroup
                    # remove otherwise empty parent group if possible
                    if not exprs and len(tuples) == 1:
                        # there is only one subexpression in the group
                        r = next(iter(tuples))
                        items = merge_items(r)
                        # if our group has no qualifier, just yield the subgroup
                        if mincount:
                            yield from items
                        # if we are optional, check if the qualifier of the subgroup
                        # can be altered. Possible when mincount <= 1.
                        elif len(items) == 1 and items[0][1] <= 1:
                            item, _, maxcount = items[0]
                            yield item, 0, maxcount
                        else:
                            yield item, mincount, 1
                    else:
                        yield item, mincount, 1

    def merge_items(r):
        """Read items-tuples such as yielded by get_items().

        Returns a list of the same items, merging where possible adjacent
        items that are the same. Every list entry is a three-tuple(item,
        mincount, maxcount); where an item is either a string like "aa", or
        a two-tuple(exprs, tuples).

        """
        items = []
        for item, mincount, maxcount in get_items(r):
            if items and items[-1][0] == item:
                items[-1][1] += mincount
                items[-1][2] += maxcount
            else:
                items.append([item, mincount, maxcount])
        return items

    items = merge_items(r)

    # now really construct the regexp string for each item
    result = []
    for item, mincount, maxcount in items:
        # qualifier to use
        if mincount == 1 and maxcount == 1:
            qualifier = ''
        elif mincount == 0 and maxcount == 1:
            qualifier = "?"
        elif mincount == maxcount:
            qualifier = "{{{0}}}".format(maxcount)
        else:
            qualifier = "{{{0},{1}}}".format(mincount or '', maxcount or '')
        # make the rx
        if isinstance(item, str):
            rx = re.escape(item)
            enclose = len(item) > 1 and qualifier
            # replace x{1,2} with xx?, looks better and is 3 chars shorter
            if len(rx) == 1 and mincount == 1 and maxcount == 2:
                rx += rx
                qualifier = "?"
        else:
            exprs, tuples = item
            # separate single characters from longer strings
            chars, strings = set(), set()
            for k in exprs:
                (chars if len(k) == 1 else strings).add(k)
            group = []
            if chars:
                if len(chars) == 1:
                    group.append(re.escape(next(iter(chars))))
                else:
                    group.append('[' + make_charclass(chars) + ']')
            if strings:
                group.extend(map(re.escape, sorted(strings)))
            if tuples:
                group.extend(map(build_regexp, tuples))
            if chars and not strings and not tuples:
                rx = group[0]
                enclose = False
            else:
                rx = '|'.join(group)
                enclose = len(items) > 1 or qualifier
        if enclose:
            rx = '(?:' + rx + ')'
        result.append(rx + qualifier)
    return ''.join(result)