Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Max weighted matching #107

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
239 changes: 217 additions & 22 deletions recordlinkage/network.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import networkx as nx
jpweytjens marked this conversation as resolved.
Show resolved Hide resolved
import pandas as pd

from recordlinkage.types import is_pandas_2d_multiindex
from recordlinkage.types import is_pandas_multiindex
from recordlinkage.types import (
is_pandas_2d_multiindex,
is_pandas_like,
is_pandas_multiindex,
)


class OneToOneLinking(object):
Expand All @@ -16,8 +19,7 @@ class OneToOneLinking(object):
Parameters
----------
method : str
The method to solve the problem. Only 'greedy' is supported at
the moment.
The method to solve the problem. The options are 'greedy' and "max_weighted". The "max_weighted" option solves the assignment problem, i.e. it finds the one to one matching with the greatest combined weight of all links. The matching is done with the Blossom algorithm by Jack Edmonds as implemented in networkx. For more details, see https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.matching.max_weight_matching.html.

Note
----
Expand All @@ -26,11 +28,82 @@ class OneToOneLinking(object):

"""

def __init__(self, method="greedy"):
    """Store the linking method.

    Parameters
    ----------
    method : str
        Name of the one-to-one linking strategy. The value is only stored
        here; it is checked when the linking is computed.
    """
    super(OneToOneLinking, self).__init__()

    self.method = method

def add_weights(self, links, features=None, classifier=None, method="weights"):
    """Add match weights to the candidate matches.

    Parameters
    ----------
    links : pandas.MultiIndex
        The candidate matches as calculated by a recordlinkage classifier.
    features : pandas.DataFrame
        The dataframe with similarity weights as calculated by a
        recordlinkage.Compare object. Required by every method, because
        the rows that correspond to ``links`` are the ones being weighted.
    classifier : recordlinkage.base.Classifier
        The classifier used to classify the records in matches and
        non-matches. Required for 'log_weights' and 'probabilities'.
    method : str
        The method to assign weights to the candidate matches. The
        options are 'weights', 'log_weights' and 'probabilities'.

        'weights' adds the sum of the similarity weights from
        ``features`` to the links.

        'log_weights' adds the matching weight as defined in the
        Fellegi-Sunter framework, looked up in ``classifier.log_weights``.
        These weights can be negative, but the "max_weighted" linking
        strategy can't handle negative weights. When the smallest total
        weight is negative, all weights are shifted by it so that every
        weight is greater than or equal to 0. This method is only
        available for the ECM and NaiveBayes classifier.

        'probabilities' adds the probability that the record pair is a
        match, as returned by ``classifier.prob``.

    Returns
    -------
    pandas.Series
        For 'weights' and 'log_weights': a Series named "weight", indexed
        by the candidate matches. For 'probabilities': whatever
        ``classifier.prob`` returns for the selected rows.

    Raises
    ------
    ValueError
        If ``method`` is unknown, if ``features`` is missing, or if
        ``classifier`` is missing for a method that needs it.

    Example
    -------

    Consider a MultiIndex with record pairs constructed from datasets A
    and B. The candidate matches are determined with a classifier. To
    link a candidate match from A to at most one record of B with the
    "max_weighted" method, weights need to be added to the candidate
    matches. Given the following set up:

    > indexer = Index()
    > indexer.full()
    > record_pairs = indexer.index(A, B)
    > comparator = Compare(compare.String("A_string", "B_string"))
    > features = comparator.compute(record_pairs, A, B)
    > ecm = ECMClassifier()
    > candidates = ecm.fit_predict(features)
    > one_to_one = OneToOneLinking(method="max_weighted")

    Weights can be added with the following syntax:

    > candidates_weights = one_to_one.add_weights(candidates, features=features, method="weights")
    > candidates_log_weights = one_to_one.add_weights(candidates, features=features, classifier=ecm, method="log_weights")

    """
    if method not in ("weights", "log_weights", "probabilities"):
        raise ValueError("unknown weighting method {}".format(method))
    if features is None:
        raise ValueError("features are required to add weights to the links")
    if method != "weights" and classifier is None:
        raise ValueError(
            "a classifier is required for the '{}' method".format(method)
        )

    # Keep only the rows of the candidate matches; a boolean mask preserves
    # the original row order of ``features``.
    features = features.loc[features.index.isin(links)]

    if method == "probabilities":
        return classifier.prob(features)

    if method == "weights":
        weight = features.sum(axis=1)
    else:
        # Total log weight per row: look up every comparison value in the
        # classifier's per-column weight table and add the results.
        weight = pd.Series(0, index=features.index)
        for column, weights in classifier.log_weights.items():
            weight = weight + features[column].apply(
                lambda value, table=weights: table[value]
            )

        # Offset negative values so the smallest weight becomes 0.
        min_weight = weight.min()
        if min_weight < 0:
            weight = weight - min_weight

    # Returning the Series directly (instead of squeezing a one-column
    # frame) keeps a single candidate match a Series rather than a scalar.
    return weight.rename("weight")

@classmethod
def _bool_duplicated(cls, links, level):

Expand All @@ -50,32 +123,157 @@ def _compute_greedy(self, links):

return pd.MultiIndex.from_tuples(result)

def _compute_max_weighted(self, links):
    """Compute a one to one linking by maximizing the total similarity weight.

    The weighted links are turned into a bipartite graph, reduced to the
    maximum weight matching, and converted back to a Series.
    """
    bipartite = self._to_weighted_bipartite_graph(links)
    matched = self._max_weighted_graph(bipartite)
    return self._to_max_weighted_series(matched)

def _max_weighted_graph(self, graph):
    """Calculate the maximally weighted bipartite graph with the Blossom algorithm by Edmonds.

    Returns a new graph that holds only the matched edges, each carrying
    the weight it had in ``graph``.
    """
    matching = nx.algorithms.matching.max_weight_matching(graph)

    # the matching does not keep the (left, right) orientation of the edges
    oriented = self._order_max_weighted_bipartite_graph(graph, matching)

    left_nodes = [edge[0] for edge in oriented]
    right_nodes = [edge[1] for edge in oriented]
    edge_weights = [graph[u][v]["weight"] for u, v in oriented]

    matched_graph = nx.Graph()
    matched_graph.add_nodes_from(left_nodes, bipartite=0)
    matched_graph.add_nodes_from(right_nodes, bipartite=1)
    matched_graph.add_weighted_edges_from(
        list(zip(left_nodes, right_nodes, edge_weights))
    )

    return matched_graph

def _compute(self, links):
    """Validate ``links`` for the configured method and run that method.

    Parameters
    ----------
    links : pandas.MultiIndex or pandas.Series
        A two-level MultiIndex for the 'greedy' method; a pandas object
        whose index is a two-level MultiIndex for 'max_weighted'.

    Raises
    ------
    TypeError
        If ``links`` is not of the type the method expects.
    ValueError
        If the MultiIndex does not have two levels, or if ``self.method``
        is not a known method.
    """
    if self.method == "greedy":
        if not is_pandas_multiindex(links):
            raise TypeError("expected pandas.MultiIndex")
        if not is_pandas_2d_multiindex(links):
            raise ValueError(
                "pandas.MultiIndex has incorrect number of "
                "levels (expected 2 levels)"
            )

        return self._compute_greedy(links)

    elif self.method == "max_weighted":
        if not is_pandas_like(links):
            raise TypeError(
                "expected pandas.Series with a MultiIndex and weights as values"
            )
        if not is_pandas_2d_multiindex(links.index):
            raise ValueError(
                "pandas.MultiIndex has incorrect number of "
                "levels (expected 2 levels)"
            )
        return self._compute_max_weighted(links)

    else:
        raise ValueError("unknown matching method {}".format(self.method))

def _order_max_weighted_bipartite_graph(self, graph, max_weighted_edges):
"""Swaps the order of edges that are swapped after max weight matching."""

edges_left = list(set(edge[0] for edge in graph.edges))

max_weighted_left = [edge[0] for edge in max_weighted_edges]
max_weighted_right = [edge[1] for edge in max_weighted_edges]

for i, value in enumerate(max_weighted_left):
if value not in edges_left:
max_weighted_left[i], max_weighted_right[i] = (
max_weighted_right[i],
max_weighted_left[i],
)

ordered_max_weighted_edges = list(zip(max_weighted_left, max_weighted_right))

return ordered_max_weighted_edges

def _to_weighted_bipartite_graph(self, links):
    """Convert a Series with MultiIndex and weights to a bipartite graph with weighted edges.

    Parameters
    ----------
    links : pandas.Series
        Weights indexed by a two-level MultiIndex of record pairs. The
        passed object is not modified.

    Returns
    -------
    networkx.Graph
        One node per labelled record, tagged ``bipartite=0`` (first
        level) or ``bipartite=1`` (second level), and one weighted edge
        per row of ``links``.
    """
    # label both levels so equal ids in the two datasets become distinct nodes
    labelled_index = self._add_node_labels_to_multiindex(links.index)

    # Per-row values are required here: ``index.levels`` only holds the
    # unique values of each level and would pair up unrelated records.
    left = labelled_index.get_level_values(0)
    right = labelled_index.get_level_values(1)

    graph = nx.Graph()
    graph.add_nodes_from(left, bipartite=0)
    graph.add_nodes_from(right, bipartite=1)
    graph.add_weighted_edges_from(zip(left, right, links.values))

    return graph

def _to_max_weighted_series(self, graph):
    """Convert a (max weighted) bipartite graph to a Series.

    Parameters
    ----------
    graph : networkx.Graph
        Graph whose edges carry a "weight" attribute and whose node names
        carry the left/right labels.

    Returns
    -------
    pandas.Series
        A Series named "weight", indexed by an unnamed two-level
        MultiIndex of record pairs with the node labels removed.
    """
    if graph.number_of_edges() == 0:
        # an edge list without edges has no "weight" column to select
        empty_index = pd.MultiIndex.from_arrays([[], []])
        return pd.Series([], index=empty_index, name="weight", dtype="float64")

    edge_frame = nx.to_pandas_edgelist(graph)

    # Select the column explicitly: squeezing a one-row frame would yield
    # a scalar instead of a Series.
    max_weighted_series = edge_frame.set_index(["source", "target"])["weight"]

    # ensure output format is the same as the format of the initial candidate links
    unnamed_index = max_weighted_series.index.set_names([None, None])
    max_weighted_series.index = self._remove_node_labels_from_multiindex(
        unnamed_index
    )

    return max_weighted_series

def _add_node_labels_to_multiindex(self, multiindex, labels=["left_", "right_"]):
"""Adds labels to a MultiIndex. This is done in order to distinguish the left and right dataset during the max weighted matching algorithm."""

for i, (level, dataset) in enumerate(zip(multiindex.levels, labels)):
stringified_level = [dataset + str(value) for value in level]
multiindex = multiindex.set_levels(stringified_level, i)

return multiindex

def _remove_node_labels_from_multiindex(
self, multiindex, labels=["left_", "right_"]
):

for i, (level, label) in enumerate(zip(multiindex.levels, labels)):
destringified_level = [int(value.replace(label, "")) for value in level]
multiindex = multiindex.set_levels(destringified_level, i)

return multiindex

def compute(self, links):
"""Compute the one-to-one linking.

Parameters
----------
links : pandas.MultiIndex
The pairs to apply linking to.
links : pandas.MultiIndex or pandas.Series
The pairs to apply linking to. Should be a pandas.MultiIndex for the 'greedy' method and a pandas.Series (a MultiIndex with weights as values) for the 'max_weighted' method.

Returns
-------
pandas.MultiIndex
A one-to-one matched MultiIndex of record pairs.
pandas.MultiIndex or pandas.Series
A one-to-one matched MultiIndex of record pairs for the 'greedy' method, and a pandas.Series with one-to-one matched record pairs and their matching weight for the 'max_weighted' method.

"""

Expand Down Expand Up @@ -122,7 +320,7 @@ class OneToManyLinking(OneToOneLinking):

"""

def __init__(self, level=0, method="greedy"):
    """Store the level and the linking method.

    Parameters
    ----------
    level : int
        Stored as ``self.level``.
    method : str
        Name of the linking strategy, passed on to the parent class.
    """
    super(OneToManyLinking, self).__init__(method=method)

    self.level = level
Expand Down Expand Up @@ -181,16 +379,13 @@ def compute(self, links):

"""

try:
import networkx as nx
except ImportError():
raise Exception("'networkx' module is needed for this operation")

G = nx.Graph()
G.add_edges_from(links.values)
connected_components = nx.connected_component_subgraphs(G)

links_result = [pd.MultiIndex.from_tuples(subgraph.edges())
for subgraph in connected_components]
links_result = [
pd.MultiIndex.from_tuples(subgraph.edges())
for subgraph in connected_components
]

return links_result
1 change: 1 addition & 0 deletions setup.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def read(fname):
install_requires=[
"six>=1.10.0",
"jellyfish>=0.5.4",
"networkx>=2.0",
"numpy>=1.13.0",
"pandas>=0.18.0",
"scipy>=0.17.1",
Expand Down
18 changes: 17 additions & 1 deletion tests/test_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
ConnectedComponents)


def test_one_to_one_linking():
def test_one_to_one_linking_greedy():

sample = pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3), (3, 4), (3, 5),
(4, 4), (5, 5), (6, 5), (7, 7), (7, 7),
Expand All @@ -23,6 +23,22 @@ def test_one_to_one_linking():
ptm.assert_index_equal(sample_one_to_many, expected)


def test_one_to_one_linking_max_weighted():
    """The max weighted linking keeps the pairs with the largest total weight."""

    sample_index = pd.MultiIndex.from_tuples(
        [(1, 1), (2, 1), (2, 2), (2, 3), (3, 3)]
    )
    sample_data = {
        "c1": [0, 1, 0, 1, 0],
        "c2": [1, 1, 1, 1, 1],
        "c3": [1, 1, 1, 1, 1],
    }
    sample = pd.DataFrame(data=sample_data, index=sample_index)

    one_to_one = OneToOneLinking(method="max_weighted")

    # compute() only accepts the links, so the weights are attached first
    sample_weights = one_to_one.add_weights(
        sample_index, features=sample, method="weights"
    )
    sample_one_to_one = one_to_one.compute(sample_weights)

    # Row sums are 2, 3, 2, 3, 2. The three pairs below total 6, which
    # beats any matching that uses a pair with weight 3 (total 5).
    expected_index = pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)])
    expected = pd.Series([2, 2, 2], index=expected_index, name="weight")

    # the order of the matched pairs is not part of the contract
    ptm.assert_series_equal(sample_one_to_one.sort_index(), expected)


def test_one_to_many_linking():

sample = pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3), (3, 4), (3, 5),
Expand Down
1 change: 0 additions & 1 deletion tox.ini
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ deps=
pandaslatest: pandas
commands=
pip install -e .
pip install networkx
pytest --cov-config .coveragerc --cov=recordlinkage --cov-append --cov-report=xml

[travis:env]
Expand Down