35 #ifndef OPENMS_COMPARISON_CLUSTERING_GRIDBASEDCLUSTERING_H 36 #define OPENMS_COMPARISON_CLUSTERING_GRIDBASEDCLUSTERING_H 53 #include <boost/unordered_map.hpp> 68 MinimumDistance(
const int& cluster_index,
const int& nearest_neighbour_index,
const double& distance);
73 int getClusterIndex()
const;
78 int getNearestNeighbourIndex()
const;
125 template <
typename Metric>
137 typedef boost::unordered::unordered_multimap<int, MultisetIterator>::const_iterator
NNIterator;
150 const std::vector<double>& data_y,
const std::vector<int>& properties_A,
151 const std::vector<int>& properties_B, std::vector<double> grid_spacing_x,
152 std::vector<double> grid_spacing_y) :
154 grid_(grid_spacing_x, grid_spacing_y)
156 init_(data_x, data_y, properties_A, properties_B);
168 const std::vector<double>& data_y, std::vector<double> grid_spacing_x,
169 std::vector<double> grid_spacing_y) :
171 grid_(grid_spacing_x, grid_spacing_y)
174 std::vector<int> properties_A(data_x.size(), -1);
175 std::vector<int> properties_B(data_x.size(), -1);
176 init_(data_x, data_y, properties_A, properties_B);
188 Size clusters_start = clusters_.size();
194 while (clusters_.size() > 0)
196 setProgress(clusters_start - clusters_.size());
198 MultisetIterator smallest_distance_it = distances_.lower_bound(zero_distance);
200 int cluster_index1 = smallest_distance_it->getClusterIndex();
201 int cluster_index2 = smallest_distance_it->getNearestNeighbourIndex();
203 eraseMinDistance_(smallest_distance_it);
206 std::map<int, GridBasedCluster>::iterator cluster1_it = clusters_.find(cluster_index1);
207 std::map<int, GridBasedCluster>::iterator cluster2_it = clusters_.find(cluster_index2);
210 const std::vector<int>& points1 = cluster1.
getPoints();
211 const std::vector<int>& points2 = cluster2.
getPoints();
212 std::vector<int> new_points;
213 new_points.reserve(points1.size() + points2.size());
214 new_points.insert(new_points.end(), points1.begin(), points1.end());
215 new_points.insert(new_points.end(), points2.begin(), points2.end());
217 double new_x = (cluster1.
getCentre().
getX() * points1.size() + cluster2.
getCentre().
getX() * points2.size()) / (points1.size() + points2.size());
218 double new_y = (cluster1.
getCentre().
getY() * points1.size() + cluster2.
getCentre().
getY() * points2.size()) / (points1.size() + points2.size());
221 CellIndex cell_for_cluster1 = grid_.getIndex(cluster1.
getCentre());
222 CellIndex cell_for_cluster2 = grid_.getIndex(cluster2.
getCentre());
223 CellIndex cell_for_new_cluster = grid_.getIndex(
DPosition<2>(new_x, new_y));
224 grid_.removeCluster(cell_for_cluster1, cluster_index1);
225 grid_.removeCluster(cell_for_cluster2, cluster_index2);
226 grid_.addCluster(cell_for_new_cluster, cluster_index1);
231 Rectangle new_box(box1);
239 throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION,
"Property A of both clusters not the same. ",
"A");
245 std::vector<int> new_B;
246 new_B.reserve(B1.size() + B2.size());
247 new_B.insert(new_B.end(), B1.begin(), B1.end());
248 new_B.insert(new_B.end(), B2.begin(), B2.end());
252 clusters_.erase(cluster1_it);
253 clusters_.erase(cluster2_it);
254 clusters_.insert(std::make_pair(cluster_index1, new_cluster));
256 std::set<int> clusters_to_be_updated;
257 clusters_to_be_updated.insert(cluster_index1);
261 eraseMinDistance_(distance_it_for_cluster_idx_[cluster_index2]);
264 std::pair<NNIterator, NNIterator> nn_range = reverse_nns_.equal_range(cluster_index1);
265 for (NNIterator nn_it = nn_range.first; nn_it != nn_range.second;)
267 clusters_to_be_updated.insert(nn_it->second->getClusterIndex());
268 eraseMinDistance_((nn_it++)->second);
270 nn_range = reverse_nns_.equal_range(cluster_index2);
271 for (NNIterator nn_it = nn_range.first; nn_it != nn_range.second;)
273 clusters_to_be_updated.insert(nn_it->second->getClusterIndex());
274 eraseMinDistance_((nn_it++)->second);
278 for (std::set<int>::const_iterator cluster_index = clusters_to_be_updated.begin(); cluster_index != clusters_to_be_updated.end(); ++cluster_index)
280 std::map<int, GridBasedCluster>::iterator c_it = clusters_.find(*cluster_index);
282 if (findNearestNeighbour_(c, *cluster_index))
284 grid_.removeCluster(grid_.getIndex(c.
getCentre()), *cluster_index);
285 clusters_.erase(c_it);
301 std::vector<double> grid_spacing_x = grid_.getGridSpacingX();
302 std::vector<double> grid_spacing_y = grid_.getGridSpacingY();
303 std::vector<double> grid_spacing_y_new;
304 grid_spacing_y_new.push_back(grid_spacing_y.front());
305 grid_spacing_y_new.push_back(grid_spacing_y.back());
309 for (std::map<int, GridBasedCluster>::const_iterator it = clusters_final_.begin(); it != clusters_final_.end(); ++it)
311 int cluster_index = it->first;
318 for (
unsigned cell = 0; cell < grid_spacing_x.size(); ++cell)
320 CellIndex grid_index(cell, 1);
323 std::list<int> cluster_indices = grid_x_only.
getClusters(grid_index);
324 if (cluster_indices.size() > 1)
327 std::list<GridBasedCluster> cluster_list;
328 std::map<GridBasedCluster, int> index_list;
329 for (std::list<int>::const_iterator it = cluster_indices.begin(); it != cluster_indices.end(); ++it)
331 cluster_list.push_back(clusters_final_.find(*it)->second);
332 index_list.insert(std::make_pair(clusters_final_.find(*it)->second, *it));
337 std::list<GridBasedCluster>::const_iterator c1 = cluster_list.begin();
338 std::list<GridBasedCluster>::const_iterator c2 = cluster_list.begin();
340 while (c1 != cluster_list.end())
342 double centre1x = (*c1).getCentre().getX();
343 double centre1y = (*c1).getCentre().getY();
344 double centre2x = (*c2).getCentre().getX();
345 double centre2y = (*c2).getCentre().getY();
347 double box1x_min = (*c1).getBoundingBox().minX();
348 double box1x_max = (*c1).getBoundingBox().maxX();
349 double box1y_min = (*c1).getBoundingBox().minY();
350 double box1y_max = (*c1).getBoundingBox().maxY();
351 double box2x_min = (*c2).getBoundingBox().minX();
352 double box2x_max = (*c2).getBoundingBox().maxX();
353 double box2y_min = (*c2).getBoundingBox().minY();
354 double box2y_max = (*c2).getBoundingBox().maxY();
361 bool overlap = (box1x_min <= box2x_max && box1x_min >= box2x_min) || (box1x_max >= box2x_min && box1x_max <= box2x_max);
375 std::vector<int> points1 = (*c1).getPoints();
376 std::vector<int> points2 = (*c2).getPoints();
377 std::vector<int> new_points;
378 new_points.reserve(points1.size() + points2.size());
379 new_points.insert(new_points.end(), points1.begin(), points1.end());
380 new_points.insert(new_points.end(), points2.begin(), points2.end());
382 double new_x = (centre1x * points1.size() + centre2x * points2.size()) / (points1.size() + points2.size());
383 double new_y = (centre1y * points1.size() + centre2y * points2.size()) / (points1.size() + points2.size());
385 double min_x = std::min(box1x_min, box2x_min);
386 double min_y = std::min(box1y_min, box2y_min);
387 double max_x = std::max(box1x_max, box2x_max);
388 double max_y = std::max(box1y_max, box2y_max);
390 Point new_centre(new_x, new_y);
391 Point position_min(min_x, min_y);
392 Point position_max(max_x, max_y);
393 Rectangle new_bounding_box(position_min, position_max);
398 clusters_final_.erase(clusters_final_.find(index_list.find(*c1)->second));
399 clusters_final_.erase(clusters_final_.find(index_list.find(*c2)->second));
400 clusters_final_.insert(std::make_pair(index_list.find(*c1)->second, new_cluster));
403 CellIndex cell_for_cluster1 = grid_x_only.
getIndex((*c1).getCentre());
404 CellIndex cell_for_cluster2 = grid_x_only.
getIndex((*c2).getCentre());
405 CellIndex cell_for_new_cluster = grid_x_only.
getIndex(new_centre);
407 grid_x_only.
removeCluster(cell_for_cluster1, index_list.find(*c1)->second);
408 grid_x_only.
removeCluster(cell_for_cluster2, index_list.find(*c2)->second);
409 grid_x_only.
addCluster(cell_for_new_cluster, index_list.find(*c1)->second);
426 std::map<int, GridBasedCluster>::iterator it = clusters_final_.begin();
427 while (it != clusters_final_.end())
429 Rectangle box = it->second.getBoundingBox();
430 if (box.
maxY() - box.
minY() < threshold_y)
432 clusters_final_.erase(it++);
446 return clusters_final_;
484 boost::unordered::unordered_multimap<int, std::multiset<MinimumDistance>::const_iterator>
reverse_nns_;
500 void init_(
const std::vector<double>& data_x,
const std::vector<double>& data_y,
501 const std::vector<int>& properties_A,
const std::vector<int>& properties_B)
504 for (
unsigned i = 0; i < data_x.size(); ++i)
506 Point position(data_x[i], data_y[i]);
507 Rectangle box(position, position);
512 pb.push_back(properties_B[i]);
516 clusters_.insert(std::make_pair(i, cluster));
523 std::map<int, GridBasedCluster>::iterator iterator = clusters_.begin();
524 while (iterator != clusters_.end())
526 int cluster_index = iterator->first;
529 if (findNearestNeighbour_(cluster, cluster_index))
534 clusters_.erase(iterator++);
562 if (A1 == -1 || A2 == -1)
568 if (A1 != A2)
return true;
574 if (std::find(B1.begin(), B1.end(), -1) != B1.end() || std::find(B2.begin(), B2.end(), -1) != B2.end())
581 std::vector<int> B_intersection;
582 sort(B1.begin(), B1.end());
583 sort(B2.begin(), B2.end());
584 set_intersection(B1.begin(), B1.end(), B2.begin(), B2.end(), back_inserter(B_intersection));
586 return !B_intersection.empty();
604 const Point& centre = cluster.
getCentre();
605 const CellIndex& cell_index = grid_.
getIndex(centre);
607 int nearest_neighbour = -1;
610 for (
int i = -1; i <= 1; ++i)
612 for (
int j = -1; j <= 1; ++j)
614 CellIndex cell_index2(cell_index);
615 cell_index2.first += i;
616 cell_index2.second += j;
619 const std::list<int>& cluster_indices = grid_.
getClusters(cell_index2);
620 for (std::list<int>::const_iterator cluster_index2 = cluster_indices.begin(); cluster_index2 != cluster_indices.end(); ++cluster_index2)
622 if (*cluster_index2 != cluster_index)
625 const Point& centre2 = cluster2.
getCentre();
626 double distance = metric_(centre, centre2);
628 if (distance < min_dist || nearest_neighbour == -1)
630 bool veto = mergeVeto_(cluster, cluster2);
634 nearest_neighbour = *cluster_index2;
643 if (nearest_neighbour == -1)
646 clusters_final_.insert(std::make_pair(cluster_index, clusters_.find(cluster_index)->second));
651 std::multiset<MinimumDistance>::const_iterator it = distances_.insert(
MinimumDistance(cluster_index, nearest_neighbour, min_dist));
653 reverse_nns_.insert(std::make_pair(nearest_neighbour, it));
655 distance_it_for_cluster_idx_[cluster_index] = it;
672 std::pair<NNIterator, NNIterator> nn_range = reverse_nns_.equal_range(it->getNearestNeighbourIndex());
673 for (NNIterator nn_it = nn_range.first; nn_it != nn_range.second; ++nn_it)
675 if (nn_it->second == it)
677 reverse_nns_.erase(nn_it);
683 distance_it_for_cluster_idx_.erase(it->getClusterIndex());
686 distances_.erase(it);
void removeCluster(const CellIndex &cell_index, const int &cluster_index)
removes a cluster from this grid cell and removes the cell if no other cluster left ...
boost::unordered::unordered_multimap< int, MultisetIterator >::const_iterator NNIterator
Definition: GridBasedClustering.h:137
std::pair< int, int > CellIndex
Definition: ClusteringGrid.h:63
double distance_
distance between cluster and its nearest neighbour
Definition: GridBasedClustering.h:106
A more convenient string class.
Definition: String.h:57
std::map< int, GridBasedCluster > clusters_final_
list of final clusters i.e. clusters that are no longer merged
Definition: GridBasedClustering.h:472
void enlarge(const PositionType &p)
Enlarges the bounding box such that it contains a position.
Definition: DBoundingBox.h:122
boost::unordered::unordered_multimap< int, std::multiset< MinimumDistance >::const_iterator > reverse_nns_
reverse nearest neighbor lookup table for finding out which clusters need to be updated faster ...
Definition: GridBasedClustering.h:484
std::multiset< MinimumDistance > distances_
list of minimum distances stores the smallest of the distances in the head
Definition: GridBasedClustering.h:478
GridBasedCluster::Point Point
cluster centre, cluster bounding box, grid index
Definition: GridBasedClustering.h:133
void removeSmallClustersY(double threshold_y)
removes clusters with bounding box dimension in y-direction below certain threshold ...
Definition: GridBasedClustering.h:424
bool operator==(_Iterator< _Val, _Ref, _Ptr > const &, _Iterator< _Val, _Ref, _Ptr > const &)
Definition: KDTree.h:806
void cluster()
performs the hierarchical clustering (merges clusters until their dimension exceeds that of cell) ...
Definition: GridBasedClustering.h:183
CoordinateType getY() const
Name accessor for the second dimension. Only for DPosition<2>, for visualization. ...
Definition: DPosition.h:158
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
const std::vector< int > & getPropertiesB() const
returns properties B of all points
ClusteringGrid grid_
grid on which the position of the clusters are registered used in cluster method
Definition: GridBasedClustering.h:460
GridBasedCluster::Rectangle Rectangle
Definition: GridBasedClustering.h:134
bool operator<(const MultiplexDeltaMasses &dm1, const MultiplexDeltaMasses &dm2)
std::map< int, GridBasedCluster > clusters_
list of clusters maps cluster indices to clusters
Definition: GridBasedClustering.h:466
PositionType const & maxPosition() const
Accessor to maximum position.
Definition: DIntervalBase.h:128
void extendClustersY()
extends clusters in y-direction if possible (merges clusters further in y-direction, i.e. clusters can now span multiple cells)
Definition: GridBasedClustering.h:297
GridBasedClustering(Metric metric, const std::vector< double > &data_x, const std::vector< double > &data_y, std::vector< double > grid_spacing_x, std::vector< double > grid_spacing_y)
initialises all data structures
Definition: GridBasedClustering.h:167
2D hierarchical clustering implementation optimized for large data sets containing many small cluster...
Definition: GridBasedClustering.h:126
CoordinateType getX() const
Name accessor for the first dimension. Only for DPosition<2>, for visualization.
Definition: DPosition.h:151
bool mergeVeto_(const GridBasedCluster &c1, const GridBasedCluster &c2) const
checks if two clusters can be merged Each point in a cluster can (optionally) have two properties A a...
Definition: GridBasedClustering.h:556
bool findNearestNeighbour_(const GridBasedCluster &cluster, int cluster_index)
determines the nearest neighbour for each cluster
Definition: GridBasedClustering.h:602
void addCluster(const CellIndex &cell_index, const int &cluster_index)
adds a cluster to this grid cell
data structure to store 2D data to be clustered e.g. (m/z, retention time) coordinates from multiplex...
Definition: ClusteringGrid.h:57
int cluster_index_
index in the cluster list
Definition: GridBasedClustering.h:96
void init_(const std::vector< double > &data_x, const std::vector< double > &data_y, const std::vector< int > &properties_A, const std::vector< int > &properties_B)
initialises all data structures
Definition: GridBasedClustering.h:500
bool isNonEmptyCell(const CellIndex &cell_index) const
checks if there are clusters at this cell index
GridBasedClustering(Metric metric, const std::vector< double > &data_x, const std::vector< double > &data_y, const std::vector< int > &properties_A, const std::vector< int > &properties_B, std::vector< double > grid_spacing_x, std::vector< double > grid_spacing_y)
initialises all data structures
Definition: GridBasedClustering.h:149
Metric metric_
metric for measuring the distance between points in the 2D plane
Definition: GridBasedClustering.h:454
CoordinateType minY() const
Accessor for max_ coordinate minimum.
Definition: DIntervalBase.h:247
const Rectangle & getBoundingBox() const
returns bounding box
CoordinateType maxY() const
Accessor for max_ coordinate maximum.
Definition: DIntervalBase.h:259
Invalid value exception.
Definition: Exception.h:336
const Point & getCentre() const
returns cluster centre
std::list< int > getClusters(const CellIndex &cell_index) const
returns clusters in this grid cell
const std::vector< int > & getPoints() const
returns indices of points in cluster
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:128
CellIndex getIndex(const Point &position) const
returns grid cell index (i,j) for the positions (x,y)
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:55
void eraseMinDistance_(const std::multiset< MinimumDistance >::const_iterator it)
remove minimum distance object and its related data
Definition: GridBasedClustering.h:669
ClusteringGrid::CellIndex CellIndex
Definition: GridBasedClustering.h:135
boost::unordered::unordered_map< int, std::multiset< MinimumDistance >::const_iterator > distance_it_for_cluster_idx_
cluster index to distance iterator lookup table for finding out which clusters need to be updated fas...
Definition: GridBasedClustering.h:490
basic data structure for distances between clusters
Definition: GridBasedClustering.h:61
std::multiset< MinimumDistance >::const_iterator MultisetIterator
Definition: GridBasedClustering.h:136
basic data structure for clustering
Definition: GridBasedCluster.h:55
int nearest_neighbour_index_
index of the nearest neighbour of the above cluster
Definition: GridBasedClustering.h:101
std::map< int, GridBasedCluster > getResults() const
returns final results (mapping of cluster indices to clusters)
Definition: GridBasedClustering.h:444
int getPropertyA() const
returns property A
PositionType const & minPosition() const
Accessor to minimum position.
Definition: DIntervalBase.h:122