Source code for pattern_clustering.regexp

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Functions used to craft automata from regular expressions."""

__author__ = "Marc-Olivier Buob"
__maintainer__ = "Marc-Olivier Buob"
__email__ = "marc-olivier.buob@nokia-bell-labs.com"
__copyright__ = "Copyright (C) 2020, Nokia"
__license__ = "BSD-3"

from string import printable
from pybgl.automaton import Automaton, add_edge, set_final
from pybgl.regexp import compile_dfa


def make_re_hex_digit(lower_case: bool = True, upper_case: bool = True) -> str:
    """
    Builds the regular expression catching a single hexadecimal digit.

    The result is a character class that always contains ``0-9`` and,
    depending on the flags, the ``a-f`` and/or ``A-F`` ranges.

    Args:
        lower_case (bool): Pass ``False`` to discard lower case values.
        upper_case (bool): Pass ``False`` to discard upper case values.
    Returns:
        The string storing the regular expression. If both flags are
        ``False``, this is ``"[0-9]"``.
    """
    lower_range = "a-f" if lower_case else ""
    upper_range = "A-F" if upper_case else ""
    return r"[0-9%s%s]" % (lower_range, upper_range)


def make_re_ipv6(lower_case: bool = True, upper_case: bool = True) -> str:
    """
    Builds the regular expression catching IPv6 addresses.

    Note this is not an exact match contrary to ``make_re_ipv6_strict``, but
    the resulting automaton is significantly faster (and should be accurate
    enough for most practical use cases).

    The expression has the shape ``((H)?(:H)+:H)`` where ``H`` matches from
    zero to four hexadecimal digits.

    Args:
        lower_case (bool): Pass ``False`` to discard lower case values.
        upper_case (bool): Pass ``False`` to discard upper case values.
    Raises:
        AssertionError: If both ``lower_case`` and ``upper_case`` are
            ``False`` (only checked when assertions are enabled).
    Returns:
        The string storing the regular expression.
    """
    assert lower_case or upper_case
    letter_ranges = ("a-f" if lower_case else "") + ("A-F" if upper_case else "")
    # A group of 0 to 4 hexadecimal digits (empty groups allow "::").
    hex4 = "[" + letter_ranges + "0-9]{0,4}"
    ipv6_sep = ":"
    return "((%s)?(%s%s)+%s%s)" % (hex4, ipv6_sep, hex4, ipv6_sep, hex4)


# Avoid using it (slow to compile, slow to compute language_density)
def make_re_ipv6_strict(*args, **kwargs) -> str:
    """
    Builds the regular expression catching IPv6 addresses.

    The expression is the alternation of the possible layouts of an IPv6
    address (full form, ``::``-compressed forms, link-local addresses with
    a zone index, and forms embedding an IPv4 address).

    It relies on the module-level ``RE_IPV4`` constant, which is resolved
    when the function is called.

    Args:
        args: Positional arguments, see ``make_re_hex_digit``.
        kwargs: Keyword arguments, see ``make_re_hex_digit``.
    Returns:
        The string storing the regular expression.
    """
    # One IPv6 segment: from 1 to 4 hexadecimal digits.
    re_seg = make_re_hex_digit(*args, **kwargs) + r"{1,4}"
    return "(%s)" % "|".join([
        "(" + re_seg + ":){7,7}" + re_seg,  # 1:2:3:4:5:6:7:8
        "(" + re_seg + ":){1,7}:",  # 1::                                 1:2:3:4:5:6:7::
        "(" + re_seg + ":){1,6}:" + re_seg,  # 1::8               1:2:3:4:5:6::8   1:2:3:4:5:6::8
        "(" + re_seg + ":){1,5}(:" + re_seg + "){1,2}",  # 1::7:8             1:2:3:4:5::7:8   1:2:3:4:5::8
        "(" + re_seg + ":){1,4}(:" + re_seg + "){1,3}",  # 1::6:7:8           1:2:3:4::6:7:8   1:2:3:4::8
        "(" + re_seg + ":){1,3}(:" + re_seg + "){1,4}",  # 1::5:6:7:8         1:2:3::5:6:7:8   1:2:3::8
        "(" + re_seg + ":){1,2}(:" + re_seg + "){1,5}",  # 1::4:5:6:7:8       1:2::4:5:6:7:8   1:2::8
        re_seg + ":((:" + re_seg + "){1,6})",  # 1::3:4:5:6:7:8     1::3:4:5:6:7:8   1::8
        ":((:" + re_seg + "){1,7}|:)",  # ::2:3:4:5:6:7:8    ::2:3:4:5:6:7:8  ::8       ::
        # fe80::7:8%eth0 fe80::7:8%1  (link-local IPv6 addresses with zone index)
        "fe80:(:" + re_seg + "){0,4}%[0-9a-zA-Z]{1,}",
        # ::255.255.255.255  ::ffff:255.255.255.255  ::ffff:0:255.255.255.255
        # (IPv4-mapped IPv6 addresses and IPv4-translated addresses)
        "::(ffff(:0{1,4}){0,1}:){0,1}" + RE_IPV4,
        #  2001:db8:3:4::192.0.2.33  64:ff9b::192.0.2.33 (IPv4-Embedded IPv6 Address)
        # The segment pattern must be interpolated here: a literal "re_seg"
        # inside the string would only match the text "re_seg".
        "(" + re_seg + ":){1,4}:" + RE_IPV4,
    ])


# --- Bounded integer ranges (parenthesized so they can be embedded) ---
RE_0_32 = r"(3[0-2]|[0-2]?[0-9])"
RE_0_128 = r"(12[0-8]|1[0-1][0-9]|([0-9]{1,2}))"
RE_0_255 = r"(25[0-5]|(2[0-4]|[0-1]{0,1}[0-9]){0,1}[0-9])"

# --- Elementary patterns ---
RE_ALNUM = r"[a-zA-Z0-9]+"
# The metacharacter "." is not yet supported in pybgl, hence this
# "space or non-space" workaround.
RE_ANY = r"(\S|\s)+"
RE_BOOL = r"0|1"
RE_DELIMITER = r"[-]+|[+]+|[=]+|[@]+|[~]+|[#]+"
RE_SIGN = r"(-|[+])?"
RE_UINT = r"[0-9]+"

# --- Numbers (optional sign, then digits, then optional fractional part) ---
RE_INT = "%s%s" % (RE_SIGN, RE_UINT)
RE_FLOAT = "%s%s([.]%s)?" % (RE_SIGN, RE_UINT, RE_UINT)
RE_HEXA = "%s+" % make_re_hex_digit()

# --- Network addresses ---
# Four dot-separated bytes.
RE_IPV4 = "((%s[.]){3}%s)" % (RE_0_255, RE_0_255)
# The relaxed expression is used; make_re_ipv6_strict() is the alternative.
RE_IPV6 = make_re_ipv6()
RE_LETTERS = r"[a-zA-Z]+"
# Address, slash, prefix length.
RE_NET_IPV4 = RE_IPV4 + "/" + RE_0_32
RE_NET_IPV6 = RE_IPV6 + "/" + RE_0_128

# --- Miscellaneous ---
RE_PATH = r"(/[-/:._a-zA-Z0-9]+)"
RE_SPACES = r"\s+"
RE_WORD = r"\S+"

# Default pattern collection: maps each pattern name to its regular expression.
MAP_NAME_RE = {
    "alnum":     RE_ALNUM,
    "any":       RE_ANY,
    "bool":      RE_BOOL,
    "delimiter": RE_DELIMITER,
    "float":     RE_FLOAT,
    "hexa":      RE_HEXA,
    "int":       RE_INT,
    "ipv4":      RE_IPV4,
    "ipv6":      RE_IPV6,
    "letters":   RE_LETTERS,
    "net_ipv4":  RE_NET_IPV4,
    "net_ipv6":  RE_NET_IPV6,
    "path":      RE_PATH,
    "uint":      RE_UINT,
    "spaces":    RE_SPACES,
    "word":      RE_WORD,
}


def get_pattern_names() -> list:
    """
    Retrieves the list of patterns involved in the default pattern collection.

    Returns:
        A list of strings, where each string corresponds to a pattern name
        involved in the default pattern collection (``MAP_NAME_RE``), in the
        order in which the names are stored in that dictionary.
    """
    # Iterating a dict yields its keys, so no explicit .keys() is needed.
    return list(MAP_NAME_RE)


def make_map_name_dfa(
    map_name_re: dict = None,
    names: iter = None
) -> dict:
    """
    Builds a dictionary that maps each selected pattern name to the
    automaton returned by ``pybgl.regexp.compile_dfa`` for the corresponding
    regular expression.

    Args:
        map_name_re (dict): Maps each pattern name (``str``) with the corresponding
            regular expression (``str``). Defaults to ``None`` for the default
            pattern collection (``MAP_NAME_RE``).
        names (list): A list of strings, where each string identifies a pattern
            name. If ``None`` or empty, every key of ``map_name_re`` is considered.
    Raises:
        Exception: If a name is missing from ``map_name_re`` or if its regular
            expression cannot be compiled. The original error is attached as
            the cause of the raised exception.
    Returns:
        The dictionary mapping each pattern name to its automaton.
    """
    if map_name_re is None:
        map_name_re = MAP_NAME_RE
    if not names:
        # The default must apply to the *selected* collection, including a
        # caller-supplied one; otherwise iterating ``None`` would fail below.
        names = list(map_name_re)
    map_name_dfa = dict()
    for name in names:
        try:
            regex = map_name_re[name]
            map_name_dfa[name] = compile_dfa(regex)
        except Exception as e:
            # Keep the generic exception type, but chain the root cause so
            # the original traceback is not lost.
            raise Exception("Error when processing %r: %s" % (name, e)) from e
    return map_name_dfa


def make_dfa_empty() -> Automaton:
    """
    Builds the ``Automaton`` corresponding to the empty language.

    The automaton is created with ``Automaton(1)`` and its state ``0`` is
    explicitly flagged as non-final, so that no word is accepted.

    Returns:
        The corresponding ``Automaton``.
    """
    dfa_empty = Automaton(1)
    # Explicitly mark the only state as non-final.
    set_final(0, dfa_empty, False)
    return dfa_empty


def make_dfa_any(alphabet: iter = None, separator_alphabet: iter = None) -> Automaton:
    """
    Builds the DFA accepting any non-empty sequence of non-separator
    characters.

    The automaton has two states: state ``0`` and the final state ``1``.
    Every non-separator character labels a ``0 -> 1`` transition and a
    ``1 -> 1`` loop.

    Args:
        alphabet (iter): The characters involved in the alphabet.
            Defaults to ``string.printable`` if ``None`` or empty.
        separator_alphabet (iter): The characters corresponding to separators.
            Defaults to ``{" ", "\\t", "\\n"}`` if ``None`` or empty.
    Returns:
        The corresponding ``Automaton``.
    """
    if not alphabet:
        alphabet = set(printable)
    if not separator_alphabet:
        separator_alphabet = {" ", "\t", "\n"}
    dfa_any = Automaton(2)
    set_final(1, dfa_any)
    # Sorted so that edges are always inserted in the same order.
    restricted_alphabet = sorted(set(alphabet) - set(separator_alphabet))
    for a in restricted_alphabet:
        add_edge(0, 1, a, dfa_any)  # First character: leave the initial state.
        add_edge(1, 1, a, dfa_any)  # Following characters: stay in the final state.
    return dfa_any