#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This file is part of the pattern-clustering project.
# https://github.com/nokia/pattern-clustering
"""A ``PatternAutomaton`` represents a string at the pattern level."""
__author__ = "Marc-Olivier Buob, Maxime Raynal"
__maintainer__ = "Marc-Olivier Buob, Maxime Raynal"
__email__ = "marc-olivier.buob@nokia-bell-labs.com, maxime.raynal@nokia.com"
__copyright__ = "Copyright (C) 2022, Nokia"
__license__ = "BSD-3"
from collections import defaultdict
from functools import partial
from pybgl.automaton import *
from pybgl.deterministic_inclusion import deterministic_inclusion
from pybgl.dijkstra_shortest_paths import dijkstra_shortest_path, make_path
from pybgl.property_map import make_assoc_property_map, make_func_property_map
from .multi_grep import *
# A PatternAutomaton may drop some arcs, e.g., "spaces" arcs.
class PatternAutomaton(Automaton):
"""
    A ``PatternAutomaton`` models a string at the pattern level using an
    automaton-like structure where each vertex corresponds to a string index
    and each arc corresponds to an infix and the pattern it matches.
"""
def __init__(
self,
word: str,
map_name_dfa: dict,
make_mg: callable = None,
        filtered_patterns: set = None
):
"""
        Constructs the ``PatternAutomaton`` related to an input word, according
        to a collection of patterns and to a ``multi_grep`` strategy.

        Args:
            word (str): The input string.
            map_name_dfa (dict): The pattern collection, mapping each pattern
                name (``str``) to its corresponding ``Automaton`` instance.
                The ``"any"`` pattern is always ignored.
            make_mg (callable): A factory returning the functor passed to
                ``multi_grep``. Defaults to ``MultiGrepFunctorLargest``.
            filtered_patterns (set): A subset (possibly empty) of
                ``map_name_dfa.keys()`` listing the pattern names that must be
                caught by ``multi_grep``, but that must not appear as arcs of
                the resulting ``PatternAutomaton``. It may be used, for
                instance, to drop spaces and obtain a smaller
                ``PatternAutomaton``, at the price of losing the positions of
                the spaces in the original line.
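
        Example:
            A minimal sketch; ``make_map_name_dfa`` is a hypothetical helper
            building the ``name -> Automaton`` collection (e.g., from regular
            expressions), not provided by this module::

                # Hypothetical helper, shown only to illustrate the call.
                map_name_dfa = make_map_name_dfa(["int", "spaces", "word"])
                pa = PatternAutomaton(
                    "eth0 up 42",
                    map_name_dfa,
                    filtered_patterns={"spaces"}
                )
                for e in edges(pa):
                    print(pa.get_slice(e), label(e, pa), pa.get_infix(e))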
"""
if filtered_patterns is None:
filtered_patterns = set()
if not make_mg:
make_mg = MultiGrepFunctorLargest
        _map_name_dfa = {k: v for (k, v) in map_name_dfa.items() if k != "any"}
mg = make_mg()
# Add vertices
n = len(word)
super().__init__(n + 1)
set_final(n, self)
self.w = word
# Add edges
        multi_grep(word, _map_name_dfa, mg)
if mg.indices():
vertices_with_successors = set()
vertices_with_predecessors = set()
for (name, jks) in mg.indices().items():
if name in filtered_patterns:
continue
for (j, k) in jks:
add_edge(j, k, name, self)
vertices_with_successors.add(j)
vertices_with_predecessors.add(k)
to_keep = sorted(vertices_with_successors | vertices_with_predecessors | {0, n})
# Remove isolated vertices
for u in range(n):
if u not in to_keep:
remove_vertex(u, self)
# Add missing "any" edges
for (i, u) in enumerate(to_keep):
if u != 0 and u not in vertices_with_predecessors:
add_edge(to_keep[i - 1], u, "any", self)
if u != n and u not in vertices_with_successors:
add_edge(u, to_keep[i + 1], "any", self)
else:
# The PatternAutomaton involves a single "any" arc
to_remove = set()
for u in vertices(self):
if u not in {0, n}:
to_remove.add(u)
for u in to_remove:
remove_vertex(u, self)
if len(word):
add_edge(0, len(word), "any", self)
    def get_slice(self, e: EdgeDescriptor) -> tuple:
"""
Retrieves the slice (pair of uint indices delimiting a substring) related to an edge.
Args:
e (EdgeDescriptor): The queried edge identifier.
Returns:
            The slice ``(j, k)`` of ``self.w`` related to the edge ``e``.
"""
j = source(e, self)
k = target(e, self)
return (j, k)
    def get_infix(self, e: EdgeDescriptor) -> str:
"""
Retrieves the infix (substring) related to an edge.
Args:
e (EdgeDescriptor): The queried edge identifier.
Returns:
            The infix ``self.w[j:k]`` related to the edge ``e``.
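
        Example:
            A sketch assuming ``pa`` is a ``PatternAutomaton`` and ``e`` one of
            its edges (e.g., obtained through pybgl's ``edges(pa)``)::

                (j, k) = pa.get_slice(e)
                assert pa.get_infix(e) == pa.w[j:k]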
"""
(j, k) = self.get_slice(e)
return self.w[j:k]
def __eq__(self, pa) -> bool:
"""
Equality operator.
        This implementation assumes that both ``PatternAutomaton`` instances
        are deterministic and minimal, e.g., built using the
        ``MultiGrepFunctorLargest`` functor in ``PatternAutomaton.__init__``.
Args:
pa (PatternAutomaton): The ``PatternAutomaton`` instance compared to ``self``.
Returns:
True iff ``self`` matches ``pa``.
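
        Example:
            A sketch, reusing the hypothetical ``make_map_name_dfa`` helper
            (not provided by this module): two lines sharing the same
            pattern-level decomposition compare equal::

                map_name_dfa = make_map_name_dfa(["int", "spaces", "word"])
                pa1 = PatternAutomaton("port 8080", map_name_dfa)
                pa2 = PatternAutomaton("port 22", map_name_dfa)
                # True iff both lines yield the same pattern-level decomposition.
                print(pa1 == pa2)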
"""
if num_vertices(self) != num_vertices(pa) or num_edges(self) != num_edges(pa):
            # MultiGrepFunctorLargest guarantees that two PatternAutomaton
            # instances can only be equal if they have the same size, because
            # PatternAutomaton instances are always minimal.
return False
return deterministic_inclusion(self, pa) == 0
def pattern_automaton_edge_weight(
e: EdgeDescriptor,
g: PatternAutomaton,
map_name_density: dict = None
) -> float:
"""
Retrieves an edge weight (density).
Args:
e (EdgeDescriptor): The queried edge identifier.
g (PatternAutomaton): The queried PatternAutomaton instance.
        map_name_density (dict): Maps each pattern name (``str``) to its
            corresponding density (``float``). Names missing from this
            dictionary, or a ``None`` dictionary, default to a density of ``1.0``.
Returns:
The corresponding density.
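
    Example:
        A sketch assuming ``g`` is a ``PatternAutomaton`` built beforehand::

            map_name_density = {"int": 0.1, "word": 0.5}
            for e in edges(g):
                # Arcs labeled "any" (or any unmapped name) get weight 1.0.
                print(label(e, g), pattern_automaton_edge_weight(e, g, map_name_density))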
"""
a = label(e, g)
return map_name_density.get(a, 1.0) if map_name_density else 1.0
def pattern_automaton_to_path(g: PatternAutomaton, **kwargs) -> list:
"""
Computes the path having the lowest density (and hence describing
the best pattern-based decomposition) of an input ``PatternAutomaton``.
    Args:
        g (PatternAutomaton): The queried ``PatternAutomaton`` instance.
        kwargs (dict): Optional arguments forwarded to ``dijkstra_shortest_path``
            (e.g., ``pmap_vpreds``, ``pmap_vdist``, ``pmap_eweight``). Either
            ``pmap_eweight`` or ``map_name_density`` must be passed.
Returns:
The path minimizing the density.
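
    Example:
        A sketch, reusing the hypothetical ``make_map_name_dfa`` helper::

            map_name_dfa = make_map_name_dfa(["int", "spaces", "word"])
            map_name_density = {"int": 0.2, "spaces": 0.1, "word": 0.8, "any": 1.0}
            g = PatternAutomaton("eth0 up 42", map_name_dfa)
            best_path = pattern_automaton_to_path(g, map_name_density=map_name_density)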
"""
s = initial(g)
f = finals(g)
assert len(f) == 1
t = f.pop()
pmap_vpreds = kwargs.pop("pmap_vpreds", None)
if pmap_vpreds is None:
map_vpreds = defaultdict(set)
pmap_vpreds = make_assoc_property_map(map_vpreds)
pmap_vdist = kwargs.pop("pmap_vdist", None)
if pmap_vdist is None:
map_vdist = defaultdict()
pmap_vdist = make_assoc_property_map(map_vdist)
pmap_eweight = kwargs.pop("pmap_eweight", None)
map_name_density = kwargs.pop("map_name_density", None)
assert pmap_eweight or map_name_density
if pmap_eweight is None:
pmap_eweight = make_func_property_map(
partial(
pattern_automaton_edge_weight,
g=g,
map_name_density=map_name_density
)
)
dijkstra_shortest_path(
g, s, t,
pmap_eweight,
pmap_vpreds,
pmap_vdist,
**kwargs
)
return make_path(g, s, t, pmap_vpreds)