DBSCANClusterer.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.math3.stat.clustering;

  18. import java.util.ArrayList;
  19. import java.util.Collection;
  20. import java.util.HashMap;
  21. import java.util.HashSet;
  22. import java.util.List;
  23. import java.util.Map;
  24. import java.util.Set;

  25. import org.apache.commons.math3.exception.NotPositiveException;
  26. import org.apache.commons.math3.exception.NullArgumentException;
  27. import org.apache.commons.math3.util.MathUtils;

  28. /**
  29.  * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
  30.  * <p>
  31.  * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
  32.  * a point p is density connected to another point q, if there exists a chain of
  33.  * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
  34.  * such that each pair &lt;p<sub>i</sub>, p<sub>i+1</sub>&gt; is directly density-reachable.
  35.  * A point q is directly density-reachable from point p if it is in the &epsilon;-neighborhood
  36.  * of this point.
  37.  * <p>
  38.  * Any point that is not density-reachable from a formed cluster is treated as noise, and
  39.  * will thus not be present in the result.
  40.  * <p>
  41.  * The algorithm requires two parameters:
  42.  * <ul>
  43.  *   <li>eps: the distance that defines the &epsilon;-neighborhood of a point
  44.  *   <li>minPoints: the minimum number of density-connected points required to form a cluster
  45.  * </ul>
  46.  * <p>
  47.  * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
  48.  * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
  49.  * return {@code null}.
  50.  *
  51.  * @param <T> type of the points to cluster
  52.  * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
  53.  * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
  54.  * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
  55.  * @since 3.1
  56.  * @deprecated As of 3.2 (to be removed in 4.0),
  57.  * use {@link org.apache.commons.math3.ml.clustering.DBSCANClusterer} instead
  58.  */
  59. @Deprecated
  60. public class DBSCANClusterer<T extends Clusterable<T>> {

  61.     /** Maximum radius of the neighborhood to be considered. */
  62.     private final double              eps;

  63.     /** Minimum number of points needed for a cluster. */
  64.     private final int                 minPts;

  65.     /** Status of a point during the clustering process. */
  66.     private enum PointStatus {
  67.         /** The point has is considered to be noise. */
  68.         NOISE,
  69.         /** The point is already part of a cluster. */
  70.         PART_OF_CLUSTER
  71.     }

  72.     /**
  73.      * Creates a new instance of a DBSCANClusterer.
  74.      *
  75.      * @param eps maximum radius of the neighborhood to be considered
  76.      * @param minPts minimum number of points needed for a cluster
  77.      * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
  78.      */
  79.     public DBSCANClusterer(final double eps, final int minPts)
  80.         throws NotPositiveException {
  81.         if (eps < 0.0d) {
  82.             throw new NotPositiveException(eps);
  83.         }
  84.         if (minPts < 0) {
  85.             throw new NotPositiveException(minPts);
  86.         }
  87.         this.eps = eps;
  88.         this.minPts = minPts;
  89.     }

  90.     /**
  91.      * Returns the maximum radius of the neighborhood to be considered.
  92.      *
  93.      * @return maximum radius of the neighborhood
  94.      */
  95.     public double getEps() {
  96.         return eps;
  97.     }

  98.     /**
  99.      * Returns the minimum number of points needed for a cluster.
  100.      *
  101.      * @return minimum number of points needed for a cluster
  102.      */
  103.     public int getMinPts() {
  104.         return minPts;
  105.     }

  106.     /**
  107.      * Performs DBSCAN cluster analysis.
  108.      * <p>
  109.      * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
  110.      * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
  111.      * return {@code null}.
  112.      *
  113.      * @param points the points to cluster
  114.      * @return the list of clusters
  115.      * @throws NullArgumentException if the data points are null
  116.      */
  117.     public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {

  118.         // sanity checks
  119.         MathUtils.checkNotNull(points);

  120.         final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>();
  121.         final Map<Clusterable<T>, PointStatus> visited = new HashMap<Clusterable<T>, PointStatus>();

  122.         for (final T point : points) {
  123.             if (visited.get(point) != null) {
  124.                 continue;
  125.             }
  126.             final List<T> neighbors = getNeighbors(point, points);
  127.             if (neighbors.size() >= minPts) {
  128.                 // DBSCAN does not care about center points
  129.                 final Cluster<T> cluster = new Cluster<T>(null);
  130.                 clusters.add(expandCluster(cluster, point, neighbors, points, visited));
  131.             } else {
  132.                 visited.put(point, PointStatus.NOISE);
  133.             }
  134.         }

  135.         return clusters;
  136.     }

  137.     /**
  138.      * Expands the cluster to include density-reachable items.
  139.      *
  140.      * @param cluster Cluster to expand
  141.      * @param point Point to add to cluster
  142.      * @param neighbors List of neighbors
  143.      * @param points the data set
  144.      * @param visited the set of already visited points
  145.      * @return the expanded cluster
  146.      */
  147.     private Cluster<T> expandCluster(final Cluster<T> cluster,
  148.                                      final T point,
  149.                                      final List<T> neighbors,
  150.                                      final Collection<T> points,
  151.                                      final Map<Clusterable<T>, PointStatus> visited) {
  152.         cluster.addPoint(point);
  153.         visited.put(point, PointStatus.PART_OF_CLUSTER);

  154.         List<T> seeds = new ArrayList<T>(neighbors);
  155.         int index = 0;
  156.         while (index < seeds.size()) {
  157.             final T current = seeds.get(index);
  158.             PointStatus pStatus = visited.get(current);
  159.             // only check non-visited points
  160.             if (pStatus == null) {
  161.                 final List<T> currentNeighbors = getNeighbors(current, points);
  162.                 if (currentNeighbors.size() >= minPts) {
  163.                     seeds = merge(seeds, currentNeighbors);
  164.                 }
  165.             }

  166.             if (pStatus != PointStatus.PART_OF_CLUSTER) {
  167.                 visited.put(current, PointStatus.PART_OF_CLUSTER);
  168.                 cluster.addPoint(current);
  169.             }

  170.             index++;
  171.         }
  172.         return cluster;
  173.     }

  174.     /**
  175.      * Returns a list of density-reachable neighbors of a {@code point}.
  176.      *
  177.      * @param point the point to look for
  178.      * @param points possible neighbors
  179.      * @return the List of neighbors
  180.      */
  181.     private List<T> getNeighbors(final T point, final Collection<T> points) {
  182.         final List<T> neighbors = new ArrayList<T>();
  183.         for (final T neighbor : points) {
  184.             if (point != neighbor && neighbor.distanceFrom(point) <= eps) {
  185.                 neighbors.add(neighbor);
  186.             }
  187.         }
  188.         return neighbors;
  189.     }

  190.     /**
  191.      * Merges two lists together.
  192.      *
  193.      * @param one first list
  194.      * @param two second list
  195.      * @return merged lists
  196.      */
  197.     private List<T> merge(final List<T> one, final List<T> two) {
  198.         final Set<T> oneSet = new HashSet<T>(one);
  199.         for (T item : two) {
  200.             if (!oneSet.contains(item)) {
  201.                 one.add(item);
  202.             }
  203.         }
  204.         return one;
  205.     }
  206. }