% Conference paper (ICPADS 2011) introducing the MR-DBSCAN clustering algorithm.
% Acronyms and camel-case names in title/booktitle are brace-protected so that
% sentence-casing bibliography styles do not lowercase them; author names use
% the unambiguous "Last, First" form. Abstract and note are kept verbatim.
@inproceedings{a8d415a9f0b3453ca7b5251646fae0ad,
  author    = {He, Yaobin and Tan, Haoyu and Luo, Wuman and Mao, Huajian and Ma, Di and Feng, Shengzhong and Fan, Jianping},
  title     = {{MR-DBSCAN}: An efficient parallel density-based clustering algorithm using {MapReduce}},
  booktitle = {Proceedings - 2011 17th {IEEE} International Conference on Parallel and Distributed Systems, {ICPADS} 2011},
  series    = {Proceedings of the International Conference on Parallel and Distributed Systems - ICPADS},
  pages     = {473--480},
  year      = {2011},
  doi       = {10.1109/ICPADS.2011.83},
  isbn      = {9780769545769},
  language  = {English},
  keywords  = {DBSCAN, Data mining, Mapreduce, Parallel system},
  abstract  = {Data clustering is an important data mining technology that plays a crucial role in numerous scientific applications. However, it is challenging due to the size of datasets has been growing rapidly to extra-large scale in the real world. Meanwhile, MapReduce is a desirable parallel programming platform that is widely applied in kinds of data process fields. In this paper, we propose an efficient parallel density-based clustering algorithm and implement it by a 4-stages MapReduce paradigm. Furthermore, we adopt a quick partitioning strategy for large scale non-indexed data. We study the metric of merge among bordering partitions and make optimizations on it. At last, we evaluate our work on real large scale datasets using Hadoop platform. Results reveal that the speedup and scaleup of our work are very efficient.},
  note      = {2011 17th IEEE International Conference on Parallel and Distributed Systems, ICPADS 2011 ; Conference date: 07-12-2011 Through 09-12-2011},
}