# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2019-2020 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
Utility module with functions to construct or manipulate regular expressions.
"""
import operator
import re
import unicodedata
[docs]def words2regexp(words):
"""Convert the ``words`` iterable to an optimized regular expression.
Example::
>>> import parce.regex
>>> parce.regex.words2regexp(['opa', 'oma', 'mama', 'papa'])
'(?:mam|pap|o[mp])a'
>>> parce.regex.words2regexp(['car', 'cdr', 'caar', 'cadr', 'cdar', 'cddr'])
'c[ad]{1,2}r'
"""
words, suffix = common_suffix(words)
root = make_trie(words)
r = trie_to_regexp_tuple(root)
if suffix:
r += (suffix,)
return build_regexp(r)
[docs]def make_charclass(chars):
"""Return a string with adjacent characters grouped.
Example::
>>> parce.regex.make_charclass(('a', 'd', 'b', 'f', 'c'))
'a-df'
Supplying a string is also supported::
>>> parce.regex.make_charclass("abcdefghjklmnop")
'a-hj-p'
Special characters are properly escaped.
"""
buf = []
for c in sorted(map(ord, set(chars))):
if buf and buf[-1][1] == c - 1:
buf[-1][1] = c
else:
buf.append([c, c])
return ''.join(re.escape(chr(a)) if a == b else
re.escape(chr(a) + chr(b)) if a == b - 1 else
re.escape(chr(a)) + '-' + re.escape(chr(b))
for a, b in buf)
[docs]def common_suffix(words):
"""Return (words, suffix), where suffix is the common suffix.
If there is no common suffix, words is returned unchanged, and suffix is an
empty string. If there is a common suffix, that is chopped off the returned
words. Example::
>>> parce.regex.common_suffix(['opa', 'oma', 'mama', 'papa'])
(['op', 'om', 'mam', 'pap'], 'a')
"""
# make sure words is not a generator, othw we maybe can't iterate it twice
if not isinstance(words, (list, tuple, set, frozenset)):
words = tuple(words)
suffix = []
for s in map(set, zip(*map(reversed, words))):
if len(s) != 1:
break
suffix.extend(s)
suffix = ''.join(reversed(suffix))
if suffix:
chop = operator.itemgetter(slice(-len(suffix)))
words = tuple(map(chop, words))
return words, suffix
[docs]def to_string(expr):
r"""Convert an unambiguous regexp to a plain string.
If the regular expression is unambiguous and can be converted to a plain
string, return it. Otherwise, None is returned.
The returned string can be used with :meth:`str.find`, which would be
faster than using :py:func:`re.search`. Examples::
>>> parce.regex.to_string(r"a.e")
>>> parce.regex.to_string(r"a\.e")
'a.e'
>>> parce.regex.to_string(r"a\ne")
'a\ne'
The first returns None, because the dot can match multiple characters.
"""
if set(re.sub(r'\\(?:N\{.*?\}|.)', '', expr)) & set("^$|.()[]{}+*?"):
return # there are unescaped special characters like (, [, ? etc.
# handle all escapes, there may be fails, in that case we can't use the expr as string
pat = (r'\\(?:'
r'x([0-9a-fA-F]{2})' # 1 hex
r'|([afnrtv])' # 2 normal escaped character like \n
r'|(0[0-7]{0,2}|[0-7]{3})' # 3 octal
r'|([1-9][0-9]{0,2})' # 4 back ref, fail
r'|u([0-9a-fA-F]{4})' # 5 \uxxxx
r'|U([0-9a-fA-F]{8})' # 6 \Uxxxxxxxx
r'|([\^\$\|\.\(\)\[\]\{\}\+\*\?\\])' # 7 special re char that was escaped
r'|N\{(.*?)\}' # 8 named unicode character (since python 3.8)
r'|)') # fail if stray '\' is encountered
repl = (
lambda s: chr(int(s, 16)), # hex
lambda s: chr({'a':7, 'f':12, 'n':10, 'r':13, 't':9, 'v':11}[s]), # normal escape
lambda s: chr(int(s, 8)), # octal
lambda s: None, # backref, fail on that
lambda s: chr(int(s, 16)), # \uxxxx
lambda s: chr(int(s, 16)), # \Uxxxxxxxx
lambda s: s, # escaped re char
lambda s: unicodedata.lookup(s), # named unicode, can raise KeyError
)
def replace_escapes(m):
i = m.lastindex # TypeError is raised if None, or if repl returns None
return repl[i-1](m.group(i))
try:
s = re.sub(pat, replace_escapes, expr)
except (TypeError, KeyError):
return
assert re.fullmatch(expr, s)
return s
[docs]def make_trie(words, reverse=False):
"""Return a dict-based radix trie structure from a list of words.
End-points are denoted by a None key, set to True. If reverse is set to
True, the trie is made in backward direction, from the end of the words.
Example::
>>> from parce.regex import make_trie
>>> r = make_trie(["aaaa", "aaab", "aabb", "abbb", "abbbb"])
>>> r # output formatted nicely :-)
{
"a": {
"a": {
"a": {
"a": {
None: True
},
"b": {
None: True
}
},
"bb": {
None: True
}
},
"bbb": {
None: True,
"b": {
None: True
}
}
}
}
"""
if reverse:
chars = lambda word: word[::-1]
add = lambda k1, k2: k2 + k1
else:
chars = lambda word: word
add = lambda k1, k2: k1 + k2
root = {}
for w in words:
d = root
for c in chars(w):
d = d.setdefault(c, {})
d[None] = True # end
# merge characters that are the only child with their parents
def merge(node):
for key, node in node.items():
if key:
while len(node) == 1:
k, n = next(iter(node.items()))
if k:
key = add(key, k)
node = n
else:
break
else:
node = dict(merge(node))
yield key, node
return dict(merge(root))
[docs]def trie_to_regexp_tuple(node, reverse=False):
"""Converts the trie node to a tuple of regular expression parts.
A part is either a plain string expression or a frozenset instance.
A frozenset instance denotes a group of alternative expressions, and
consists of plain string expressions or other tuples. If None is also
present in the frozenset, the expression is optional.
Example::
>>> from parce.regex import *
>>> r = make_trie(["aaaa", "aaab", "aabb", "abbb", "abbbb"])
>>> trie_to_regexp_tuple(r)
(
'a',
frozenset({
(
'a',
frozenset({
'bb',
(
'a',
frozenset({
'a',
'b'
})
)
})
),
(
'bbb',
frozenset({
None,
'b'
})
)
})
)
This function also recognizes common suffixes within alternative
expressions::
>>> r = make_trie("aaaa aaba aaca abca".split())
>>> r
{'a':
{'a':
{'aa': {None: True},
'ba': {None: True},
'ca': {None: True}},
'bca': {None: True}}}
>>> t = trie_to_regexp_tuple(r)
>>> t
('a',
frozenset({'bca',
('a',
frozenset({'c',
'a',
'b'}),
'a')}))
(Note that the toplevel common suffix is handled by the
:func:`common_suffix` function, which is called from :func:`words2regexp`.)
"""
if reverse:
combine = lambda r1, r2: r2 + r1
else:
combine = lambda r1, r2: r1 + r2
if len(node) == 1:
for k, n in node.items():
if k:
return combine((k,), trie_to_regexp_tuple(n, reverse))
return ()
else:
seen = []
keys = []
groups = set()
# group the nodes if they have the same leaf node
for k, n in node.items():
if k:
try:
i = seen.index(n)
except ValueError:
i = len(seen)
seen.append(n)
keys.append([k])
else:
keys[i].append(k)
else:
groups.add(None) # means optional group, may end here
for keys, node in zip(keys, seen):
if len(keys) == 1:
if not any(node):
groups.add(keys[0])
else:
groups.add(combine((keys[0],), trie_to_regexp_tuple(node, reverse)))
else:
if not reverse:
# try to optimize the keys backwards
r = trie_to_regexp_tuple(make_trie(keys, True), True)
if r == (frozenset(keys),) and not any(node):
groups.update(keys)
continue
elif not any(node):
groups.update(keys)
continue
else:
r = (frozenset(keys),)
groups.add(combine(r, trie_to_regexp_tuple(node, reverse)))
return groups.pop() if len(groups) == 1 else (frozenset(groups),)
[docs]def build_regexp(r):
"""Convert a tuple to a full regular expression pattern string.
The tuple is described in the :func:`trie_to_regexp_tuple` function doc
string.
Example::
>>> from parce.regex import *
>>> r = make_trie(["aaaa", "aaab", "aabb", "abbb", "abbbb"])
>>> t = trie_to_regexp_tuple(r)
>>> build_regexp(t)
'a(?:a(?:bb|a[ab])|bbbb?)'
The main function :func:`words2regexp` uses this function internally,
adding an extra optimization to look for a common suffix.
"""
def get_items(r):
"""Yield regexp items from tuple r in tuples (item, mincount, maxcount).
An item is either a string like "aa", or a two-tuple(exprs,
tuples), where exprs is a set of plain strings, and tuples a set of
tuples that were inside a frozenset. The mincount and maxcount are
integers and describe the minimal or maximal required count of
matches.
"""
for item in r:
if isinstance(item, str):
yield item, 1, 1
else:
# item is a frozenset
mincount = 1
exprs = set()
tuples = set()
for k in item:
if isinstance(k, str):
exprs.add(k)
elif isinstance(k, tuple):
tuples.add(k)
elif k is None:
mincount = 0
# just one expression and no subgroups?
if len(exprs) == 1 and not tuples:
yield exprs.pop(), mincount, 1
else:
item = (exprs, tuples)
# optimize for the case of only one subgroup
# remove otherwise empty parent group if possible
if not exprs and len(tuples) == 1:
# there is only one subexpression in the group
r = next(iter(tuples))
items = merge_items(r)
# if our group has no qualifier, just yield the subgroup
if mincount:
yield from items
# if we are optional, check if the qualifier of the subgroup
# can be altered. Possible when mincount <= 1.
elif len(items) == 1 and items[0][1] <= 1:
item, _, maxcount = items[0]
yield item, 0, maxcount
else:
yield item, mincount, 1
else:
yield item, mincount, 1
def merge_items(r):
"""Read items-tuples such as yielded by get_items().
Returns a list of the same items, merging where possible adjacent
items that are the same. Every list entry is a three-tuple(item,
mincount, maxcount); where an item is either a string like "aa", or
a two-tuple(exprs, tuples).
"""
items = []
for item, mincount, maxcount in get_items(r):
if items and items[-1][0] == item:
items[-1][1] += mincount
items[-1][2] += maxcount
else:
items.append([item, mincount, maxcount])
return items
items = merge_items(r)
# now really construct the regexp string for each item
result = []
for item, mincount, maxcount in items:
# qualifier to use
if mincount == 1 and maxcount == 1:
qualifier = ''
elif mincount == 0 and maxcount == 1:
qualifier = "?"
elif mincount == maxcount:
qualifier = "{{{0}}}".format(maxcount)
else:
qualifier = "{{{0},{1}}}".format(mincount or '', maxcount or '')
# make the rx
if isinstance(item, str):
rx = re.escape(item)
enclose = len(item) > 1 and qualifier
# replace x{1,2} with xx?, looks better and is 3 chars shorter
if len(rx) == 1 and mincount == 1 and maxcount == 2:
rx += rx
qualifier = "?"
else:
exprs, tuples = item
# separate single characters from longer strings
chars, strings = set(), set()
for k in exprs:
(chars if len(k) == 1 else strings).add(k)
group = []
if chars:
if len(chars) == 1:
group.append(re.escape(next(iter(chars))))
else:
group.append('[' + make_charclass(chars) + ']')
if strings:
group.extend(map(re.escape, sorted(strings)))
if tuples:
group.extend(map(build_regexp, tuples))
if chars and not strings and not tuples:
rx = group[0]
enclose = False
else:
rx = '|'.join(group)
enclose = len(items) > 1 or qualifier
if enclose:
rx = '(?:' + rx + ')'
result.append(rx + qualifier)
return ''.join(result)