本文将介绍地理空间数据挖掘的核心方法,包括空间聚类分析、热点探测、空间关联规则挖掘和异常检测,并提供完整的Python实现代码。
一、空间聚类分析
1.1 基于密度的空间聚类(DBSCAN)
from sklearn.cluster import DBSCAN
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
def spatial_clustering(points_gdf, eps=0.01, min_samples=5):
"""执行空间密度聚类"""
# 提取坐标数组
coords = np.array([[point.x, point.y] for point in points_gdf.geometry])
# DBSCAN聚类
db = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean').fit(coords)
# 将聚类结果添加到GeoDataFrame
points_gdf['cluster'] = db.labels_
return points_gdf
# 使用示例
if __name__ == "__main__":
# 生成模拟点数据
np.random.seed(42)
n_points = 500
cluster_1 = np.random.normal(loc=(116.4, 39.9), scale=(0.01, 0.01), size=(n_points//2, 2))
cluster_2 = np.random.normal(loc=(116.45, 39.92), scale=(0.005, 0.005), size=(n_points//4, 2))
noise = np.random.uniform(low=(116.38, 39.88), high=(116.47, 39.94), size=(n_points//4, 2))
points = np.vstack([cluster_1, cluster_2, noise])
gdf = gpd.GeoDataFrame(geometry=[Point(xy) for xy in points])
# 执行聚类
clustered = spatial_clustering(gdf)
# 可视化结果
print(clustered['cluster'].value_counts())
clustered.plot(column='cluster', legend=True, cmap='viridis')
1.2 层次空间聚类(OPTICS)
from sklearn.cluster import OPTICS
import matplotlib.pyplot as plt
def hierarchical_clustering(points_gdf, min_samples=10, xi=0.05):
"""执行层次空间聚类"""
coords = np.array([[point.x, point.y] for point in points_gdf.geometry])
# OPTICS聚类
clustering = OPTICS(min_samples=min_samples, xi=xi).fit(coords)
# 添加聚类结果
points_gdf['cluster'] = clustering.labels_
points_gdf['reachability'] = clustering.reachability_[clustering.ordering_]
return points_gdf
# 使用示例
if __name__ == "__main__":
# 使用上例生成的数据
optics_result = hierarchical_clustering(gdf)
# 可视化可达性图
plt.figure(figsize=(10, 5))
plt.plot(optics_result['reachability'])
plt.title('Reachability Plot')
plt.show()
# 可视化聚类结果
optics_result.plot(column='cluster', legend=True, cmap='viridis')
二、空间热点分析
2.1 Getis-Ord Gi* 热点分析
import libpysal as lps
from esda.getisord import G_Local
import matplotlib.pyplot as plt
def hotspot_analysis(points_gdf, variable_name, k_neighbors=8):
"""执行Getis-Ord Gi*热点分析"""
# 创建空间权重矩阵
w = lps.weights.KNN.from_dataframe(points_gdf, k=k_neighbors)
w.transform = 'B' # 二值化权重
# 计算Gi*统计量
gi = G_Local(points_gdf[variable_name], w)
# 添加结果到GeoDataFrame
points_gdf['gi_score'] = gi.Zs
points_gdf['gi_pvalue'] = gi.p_norm
points_gdf['hotspot'] = np.where(gi.p_norm < 0.05,
np.where(gi.Zs > 0, 'Hot', 'Cold'),
'Not Significant')
return points_gdf
# 使用示例
if __name__ == "__main__":
# 为示例数据添加模拟变量
gdf['value'] = np.random.poisson(lam=5, size=len(gdf))
# 执行热点分析
hotspot_result = hotspot_analysis(gdf, 'value')
# 可视化热点
fig, ax = plt.subplots(figsize=(10, 8))
hotspot_result[hotspot_result['hotspot'] == 'Hot'].plot(ax=ax, color='red', markersize=50, label='Hotspot')
hotspot_result[hotspot_result['hotspot'] == 'Cold'].plot(ax=ax, color='blue', markersize=50, label='Coldspot')
hotspot_result[hotspot_result['hotspot'] == 'Not Significant'].plot(ax=ax, color='gray', markersize=5, alpha=0.5, label='Not Significant')
plt.legend()
plt.title('Hotspot Analysis Results')
plt.show()
2.2 核密度热点探测
from sklearn.neighbors import KernelDensity
from scipy.stats import gaussian_kde
def kernel_density_hotspots(points_gdf, bandwidth=0.01, gridsize=100):
"""基于核密度的热点探测"""
# 提取坐标
coords = np.array([[point.x, point.y] for point in points_gdf.geometry])
# 计算核密度估计
kde = gaussian_kde(coords.T, bw_method=bandwidth)
# 创建网格
xmin, ymin = coords.min(axis=0)
xmax, ymax = coords.max(axis=0)
xi = np.linspace(xmin, xmax, gridsize)
yi = np.linspace(ymin, ymax, gridsize)
xx, yy = np.meshgrid(xi, yi)
grid_coords = np.vstack([xx.ravel(), yy.ravel()])
# 计算每个网格点的密度
zz = kde(grid_coords).reshape(xx.shape)
return xx, yy, zz
# 使用示例
if __name__ == "__main__":
# 计算核密度
xx, yy, zz = kernel_density_hotspots(gdf)
# 可视化
plt.figure(figsize=(10, 8))
plt.contourf(xx, yy, zz, levels=20, cmap='Reds')
plt.colorbar(label='Density')
gdf.plot(ax=plt.gca(), color='k', markersize=2, alpha=0.5)
plt.title('Kernel Density Estimation')
plt.show()
三、空间关联规则挖掘
3.1 空间共现模式分析
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
def spatial_cooccurrence(features_gdf, distance_threshold=100):
"""挖掘空间共现模式"""
# 创建空间权重矩阵
w = lps.weights.DistanceBand.from_dataframe(
features_gdf,
threshold=distance_threshold,
binary=True
)
# 创建共现矩阵
n = len(features_gdf)
cooccurrence = np.zeros((n, n))
for i in range(n):
neighbors = w.neighbors[i]
for j in neighbors:
cooccurrence[i,j] = 1
# 转换为事务数据集
transactions = []
for i in range(n):
items = [f"feature_{j}" for j in np.where(cooccurrence[i,:] == 1)[0]]
transactions.append(items)
# 使用Apriori算法挖掘频繁项集
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
return rules
# 使用示例
if __name__ == "__main__":
# 生成模拟数据
np.random.seed(42)
n_features = 100
types = ['A', 'B', 'C', 'D']
features_gdf = gpd.GeoDataFrame({
'geometry': [Point(xy) for xy in np.random.uniform(low=(116.3, 39.8), high=(116.5, 40.0), size=(n_features, 2))],
'type': np.random.choice(types, size=n_features)
})
# 挖掘共现模式
cooccurrence_rules = spatial_cooccurrence(features_gdf)
print(cooccurrence_rules.head())
四、空间异常检测
4.1 局部离群因子(LOF)空间异常检测
from sklearn.neighbors import LocalOutlierFactor
def spatial_outlier_detection(points_gdf, n_neighbors=20):
"""基于LOF的空间异常检测"""
coords = np.array([[point.x, point.y] for point in points_gdf.geometry])
# 计算LOF得分
lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=0.1)
lof.fit_predict(coords)
# 添加结果
points_gdf['lof_score'] = -lof.negative_outlier_factor_
points_gdf['outlier'] = points_gdf['lof_score'] > np.percentile(points_gdf['lof_score'], 90)
return points_gdf
# 使用示例
if __name__ == "__main__":
# 添加一些离群点
outliers = np.array([[116.42, 39.85], [116.38, 39.92], [116.46, 39.88]])
outlier_points = [Point(xy) for xy in outliers]
gdf_outliers = gpd.GeoDataFrame(geometry=gdf.geometry.tolist() + outlier_points)
# 检测离群点
outlier_result = spatial_outlier_detection(gdf_outliers)
# 可视化
fig, ax = plt.subplots(figsize=(10, 8))
outlier_result[outlier_result['outlier']].plot(ax=ax, color='red', markersize=100, marker='x', label='Outliers')
outlier_result[~outlier_result['outlier']].plot(ax=ax, color='blue', markersize=5, alpha=0.5, label='Normal')
plt.legend()
plt.title('Spatial Outlier Detection')
plt.show()
五、总结
本文介绍了地理空间数据挖掘的核心技术:
- 空间聚类 - DBSCAN和OPTICS算法实现
- 热点分析 - Getis-Ord Gi*统计和核密度估计
- 关联规则 - 空间共现模式挖掘
- 异常检测 - 局部离群因子方法
实际应用中的关键考虑因素:
- 空间自相关性的处理
- 空间尺度的选择(全局/局部分析)
- 计算效率与精度的平衡
- 结果的可视化与解释
地理空间数据挖掘能够揭示传统分析方法难以发现的空间模式和关系,为城市规划、环境监测、公共卫生等领域提供有价值的洞察。随着空间大数据的发展,这些技术将变得更加重要。