# kdtree.py
"""
An implementation of KDTree using Haversine Distance for GeoSpatial analysis.
Useful tool for quickly searching for nearest neighbours.
"""

from . import Record
from numpy import inf


class KDTree:
    """
    A Haversine distance implementation of a balanced KDTree.

    This implementation is a _balanced_ KDTree, each leaf node should have the
    same number of points (or differ by 1 depending on the number of points
    the KDTree is initialised with).

    The KDTree partitions in each of the lon and lat dimensions alternatively
    in sequence by splitting at the median of the dimension of the points
    assigned to the branch.

    Parameters
    ----------
    points : list[Record]
        A list of GeoSpatialTools.Record instances. The list is not modified;
        the tree works on its own sorted copies.
    depth : int
        The current depth of the KDTree, you should set this to 0, it is used
        internally.
    max_depth : int
        The maximum depth of the KDTree. The leaf nodes will have depth no
        larger than this value. A branch holding fewer than 2 points is not
        split any further and becomes a leaf.
    """

    def __init__(
        self, points: list[Record], depth: int = 0, max_depth: int = 20
    ) -> None:
        self.depth = depth
        n_points = len(points)

        if self.depth == max_depth or n_points < 2:
            # Leaf node. Keep a private copy so that later insert/delete
            # calls never mutate the list handed in by the caller.
            self.points = list(points)
            self.split = False
            return

        # Even depths split on longitude, odd depths on latitude.
        self.axis = depth % 2
        self.variable = "lon" if self.axis == 0 else "lat"

        # Sort a copy rather than reordering the caller's list in place.
        ordered = sorted(points, key=lambda p: getattr(p, self.variable))
        split_index = n_points // 2
        # The partition value is the largest value assigned to the left
        # child. Points equal to it may end up in either child when there
        # are duplicates, which is why insert, delete and _query all treat
        # equality with the partition value specially.
        self.partition_value = getattr(
            ordered[split_index - 1], self.variable
        )

        self.split = True

        # Left is points left of midpoint, right is points right of midpoint.
        # max_depth is forwarded so that the limit applies to the whole tree
        # and not only to the root node.
        self.child_left = KDTree(ordered[:split_index], depth + 1, max_depth)
        self.child_right = KDTree(ordered[split_index:], depth + 1, max_depth)

    def insert(self, point: Record) -> bool:
        """
        Insert a Record into the KDTree. May unbalance the KDTree.

        The point will not be inserted if it is already in the KDTree.

        Returns
        -------
        bool
            True if the point was added, False if an equal point was already
            present.
        """
        if not self.split:
            if point in self.points:
                return False
            self.points.append(point)
            return True

        value = getattr(point, self.variable)
        if value < self.partition_value:
            return self.child_left.insert(point)
        if value > self.partition_value:
            return self.child_right.insert(point)

        # The point lies exactly on the partition, so an equal point could be
        # stored in either child. Look it up with a nearest-neighbour query
        # over this subtree before inserting on the left.
        nearest, _ = self.query(point)
        if point in nearest:
            return False
        self.child_left._insert(point)
        return True

    def _insert(self, point: Record) -> None:
        """Insert a point even if it already exists in the KDTree"""
        if not self.split:
            self.points.append(point)
            return
        if getattr(point, self.variable) <= self.partition_value:
            self.child_left._insert(point)
        else:
            self.child_right._insert(point)

    def delete(self, point: Record) -> bool:
        """
        Delete a Record from the KDTree. May unbalance the KDTree.

        Only the first matching Record found is removed.

        Returns
        -------
        bool
            True if a matching point was removed, False otherwise.
        """
        if not self.split:
            try:
                self.points.remove(point)
            except ValueError:
                return False
            return True

        value = getattr(point, self.variable)
        # A point equal to the partition value may be stored on either side,
        # so both children are tried in that case (left first).
        if value <= self.partition_value and self.child_left.delete(point):
            return True
        if value >= self.partition_value and self.child_right.delete(point):
            return True
        return False

    def query(self, point: Record) -> tuple[list[Record], float]:
        """
        Find the nearest Record within the KDTree to a query Record.

        The tree is searched twice: once with the query point as given and
        once with its longitude shifted by 360 degrees (presumably so that
        neighbours on the other side of the longitude wrap are also reached
        by the traversal). The closer of the two results is returned; on a
        tie the result for the unshifted point is used.

        Returns
        -------
        tuple[list[Record], float]
            The Record(s) at the smallest distance found, and that distance.
        """
        shift = 360 if point.lon < 0 else -360
        shifted = Record(point.lon + shift, point.lat)

        r1, d1 = self._query(point)
        r2, d2 = self._query(shifted)
        if d1 <= d2:
            return r1, d1
        return r2, d2

    def _query(
        self,
        point: Record,
        current_best: list[Record] | None = None,
        best_distance: float = inf,
    ) -> tuple[list[Record], float]:
        """
        Recursive nearest-neighbour search below this node.

        Parameters
        ----------
        point : Record
            The query Record.
        current_best : list[Record] | None
            The Records found so far at ``best_distance``, or None to start a
            fresh search.
        best_distance : float
            The smallest distance found so far.

        Returns
        -------
        tuple[list[Record], float]
            The updated best Records and best distance.
        """
        if current_best is None:
            current_best = []

        if not self.split:
            for candidate in self.points:
                dist = point.distance(candidate)
                if dist < best_distance:
                    current_best = [candidate]
                    best_distance = dist
                elif dist == best_distance:
                    # Keep every Record tied for nearest.
                    current_best.append(candidate)
            return current_best, best_distance

        # Search the side of the partition containing the query point first.
        if getattr(point, self.variable) <= self.partition_value:
            near, far = self.child_left, self.child_right
        else:
            near, far = self.child_right, self.child_left

        current_best, best_distance = near._query(
            point, current_best, best_distance
        )
        # Only search the other side if the partition is no further away
        # than the best match found so far.
        # NOTE(review): for a lon split this measures the distance to the
        # partition meridian at the query's own latitude. If `distance` is a
        # great-circle distance, that can exceed the true minimum distance to
        # the meridian at high latitudes, which would make this prune too
        # aggressive - confirm against Record.distance.
        if point.distance(self._get_partition_record(point)) <= best_distance:
            current_best, best_distance = far._query(
                point, current_best, best_distance
            )

        return current_best, best_distance

    def _get_partition_record(self, point: Record) -> Record:
        """
        Build a Record on this node's partition that shares the query point's
        other coordinate.
        """
        if self.variable == "lon":
            return Record(self.partition_value, point.lat)
        return Record(point.lon, self.partition_value)