% Conference paper by McConville, Liu and Hong (2017) in "Complex Networks and Their Applications VI".
% The opaque citation key is kept unchanged so that existing citations keep resolving.
% publisher/address are split into publisher name and publisher location (the original had "Springer, Cham" / "Switzerland").
% NOTE(review): the series volume number is not recorded in this entry -- TODO add a volume field once confirmed.
@inproceedings{4f48a7306c7044f6877f49fc354c5484,
  author    = {McConville, Ryan and Liu, Weiru and Hong, Jun},
  title     = {Vertex deduplication based on string similarity and community membership},
  booktitle = {Complex Networks and Their Applications VI},
  series    = {Studies in Computational Intelligence},
  pages     = {178--189},
  publisher = {Springer},
  address   = {Cham, Switzerland},
  year      = {2017},
  month     = nov,
  day       = {27},
  doi       = {10.1007/978-3-319-72150-7_15},
  isbn      = {9783319721491},
  language  = {English},
  note      = {6th International Conference on Complex Networks and Their Applications, Complex Networks 2017 ; Conference date: 29-11-2017 Through 01-12-2017},
  abstract  = {Entity resolution is a challenging problem with unresolved and duplicated entities common in many large real world datasets. New methods are required for addressing this problem as the use of graphs to model data continues to proliferate. In this paper we propose a general framework for the fast resolution of duplicate vertices in graphs. Our framework utilises locality sensitive hashing for the quick identification of potential duplicates based on string similarity. However it is clear that in many tasks string similarity alone is not enough to determine duplication. This motivates the second aspect of our method which discovers the community structure in the graph using an ensemble of community detection algorithms. These communities are then used to augment the string similarity in the deduplication process. We evaluate our approach on a real world graph consisting of 620885 vertices and 1129986 edges and report a high accuracy score on a commercial real world graph.},
}