地理空间数据挖掘与模式识别技术-CFANZ编程社区

本文将介绍地理空间数据挖掘的核心方法，包括空间聚类分析、热点探测、空间关联规则挖掘和异常检测，并提供完整的Python实现代码。

一、空间聚类分析

1.1 基于密度的空间聚类（DBSCAN）

from sklearn.cluster import DBSCAN
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

def spatial_clustering(points_gdf, eps=0.01, min_samples=5):
    """执行空间密度聚类"""
    # 提取坐标数组
    coords = np.array([[point.x, point.y] for point in points_gdf.geometry])
    
    # DBSCAN聚类
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean').fit(coords)
    
    # 将聚类结果添加到GeoDataFrame
    points_gdf['cluster'] = db.labels_
    
    return points_gdf

# 使用示例
if __name__ == "__main__":
    # 生成模拟点数据
    np.random.seed(42)
    n_points = 500
    cluster_1 = np.random.normal(loc=(116.4, 39.9), scale=(0.01, 0.01), size=(n_points//2, 2))
    cluster_2 = np.random.normal(loc=(116.45, 39.92), scale=(0.005, 0.005), size=(n_points//4, 2))
    noise = np.random.uniform(low=(116.38, 39.88), high=(116.47, 39.94), size=(n_points//4, 2))
    
    points = np.vstack([cluster_1, cluster_2, noise])
    gdf = gpd.GeoDataFrame(geometry=[Point(xy) for xy in points])
    
    # 执行聚类
    clustered = spatial_clustering(gdf)
    
    # 可视化结果
    print(clustered['cluster'].value_counts())
    clustered.plot(column='cluster', legend=True, cmap='viridis')

1.2 层次空间聚类（OPTICS）

from sklearn.cluster import OPTICS
import matplotlib.pyplot as plt

def hierarchical_clustering(points_gdf, min_samples=10, xi=0.05):
    """执行层次空间聚类"""
    coords = np.array([[point.x, point.y] for point in points_gdf.geometry])
    
    # OPTICS聚类
    clustering = OPTICS(min_samples=min_samples, xi=xi).fit(coords)
    
    # 添加聚类结果
    points_gdf['cluster'] = clustering.labels_
    points_gdf['reachability'] = clustering.reachability_[clustering.ordering_]
    
    return points_gdf

# 使用示例
if __name__ == "__main__":
    # 使用上例生成的数据
    optics_result = hierarchical_clustering(gdf)
    
    # 可视化可达性图
    plt.figure(figsize=(10, 5))
    plt.plot(optics_result['reachability'])
    plt.title('Reachability Plot')
    plt.show()
    
    # 可视化聚类结果
    optics_result.plot(column='cluster', legend=True, cmap='viridis')

二、空间热点分析

2.1 Getis-Ord Gi* 热点分析

import libpysal as lps
from esda.getisord import G_Local
import matplotlib.pyplot as plt

def hotspot_analysis(points_gdf, variable_name, k_neighbors=8):
    """执行Getis-Ord Gi*热点分析"""
    # 创建空间权重矩阵
    w = lps.weights.KNN.from_dataframe(points_gdf, k=k_neighbors)
    w.transform = 'B'  # 二值化权重
    
    # 计算Gi*统计量
    gi = G_Local(points_gdf[variable_name], w)
    
    # 添加结果到GeoDataFrame
    points_gdf['gi_score'] = gi.Zs
    points_gdf['gi_pvalue'] = gi.p_norm
    points_gdf['hotspot'] = np.where(gi.p_norm < 0.05, 
                                   np.where(gi.Zs > 0, 'Hot', 'Cold'), 
                                   'Not Significant')
    
    return points_gdf

# 使用示例
if __name__ == "__main__":
    # 为示例数据添加模拟变量
    gdf['value'] = np.random.poisson(lam=5, size=len(gdf))
    
    # 执行热点分析
    hotspot_result = hotspot_analysis(gdf, 'value')
    
    # 可视化热点
    fig, ax = plt.subplots(figsize=(10, 8))
    hotspot_result[hotspot_result['hotspot'] == 'Hot'].plot(ax=ax, color='red', markersize=50, label='Hotspot')
    hotspot_result[hotspot_result['hotspot'] == 'Cold'].plot(ax=ax, color='blue', markersize=50, label='Coldspot')
    hotspot_result[hotspot_result['hotspot'] == 'Not Significant'].plot(ax=ax, color='gray', markersize=5, alpha=0.5, label='Not Significant')
    plt.legend()
    plt.title('Hotspot Analysis Results')
    plt.show()

2.2 核密度热点探测

from sklearn.neighbors import KernelDensity
from scipy.stats import gaussian_kde

def kernel_density_hotspots(points_gdf, bandwidth=0.01, gridsize=100):
    """基于核密度的热点探测"""
    # 提取坐标
    coords = np.array([[point.x, point.y] for point in points_gdf.geometry])
    
    # 计算核密度估计
    kde = gaussian_kde(coords.T, bw_method=bandwidth)
    
    # 创建网格
    xmin, ymin = coords.min(axis=0)
    xmax, ymax = coords.max(axis=0)
    xi = np.linspace(xmin, xmax, gridsize)
    yi = np.linspace(ymin, ymax, gridsize)
    xx, yy = np.meshgrid(xi, yi)
    grid_coords = np.vstack([xx.ravel(), yy.ravel()])
    
    # 计算每个网格点的密度
    zz = kde(grid_coords).reshape(xx.shape)
    
    return xx, yy, zz

# 使用示例
if __name__ == "__main__":
    # 计算核密度
    xx, yy, zz = kernel_density_hotspots(gdf)
    
    # 可视化
    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, zz, levels=20, cmap='Reds')
    plt.colorbar(label='Density')
    gdf.plot(ax=plt.gca(), color='k', markersize=2, alpha=0.5)
    plt.title('Kernel Density Estimation')
    plt.show()

三、空间关联规则挖掘

3.1 空间共现模式分析

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

def spatial_cooccurrence(features_gdf, distance_threshold=100):
    """挖掘空间共现模式"""
    # 创建空间权重矩阵
    w = lps.weights.DistanceBand.from_dataframe(
        features_gdf, 
        threshold=distance_threshold,
        binary=True
    )
    
    # 创建共现矩阵
    n = len(features_gdf)
    cooccurrence = np.zeros((n, n))
    
    for i in range(n):
        neighbors = w.neighbors[i]
        for j in neighbors:
            cooccurrence[i,j] = 1
    
    # 转换为事务数据集
    transactions = []
    for i in range(n):
        items = [f"feature_{j}" for j in np.where(cooccurrence[i,:] == 1)[0]]
        transactions.append(items)
    
    # 使用Apriori算法挖掘频繁项集
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
    
    return rules

# 使用示例
if __name__ == "__main__":
    # 生成模拟数据
    np.random.seed(42)
    n_features = 100
    types = ['A', 'B', 'C', 'D']
    features_gdf = gpd.GeoDataFrame({
        'geometry': [Point(xy) for xy in np.random.uniform(low=(116.3, 39.8), high=(116.5, 40.0), size=(n_features, 2))],
        'type': np.random.choice(types, size=n_features)
    })
    
    # 挖掘共现模式
    cooccurrence_rules = spatial_cooccurrence(features_gdf)
    print(cooccurrence_rules.head())

四、空间异常检测

4.1 局部离群因子（LOF）空间异常检测

from sklearn.neighbors import LocalOutlierFactor

def spatial_outlier_detection(points_gdf, n_neighbors=20):
    """基于LOF的空间异常检测"""
    coords = np.array([[point.x, point.y] for point in points_gdf.geometry])
    
    # 计算LOF得分
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=0.1)
    lof.fit_predict(coords)
    
    # 添加结果
    points_gdf['lof_score'] = -lof.negative_outlier_factor_
    points_gdf['outlier'] = points_gdf['lof_score'] > np.percentile(points_gdf['lof_score'], 90)
    
    return points_gdf

# 使用示例
if __name__ == "__main__":
    # 添加一些离群点
    outliers = np.array([[116.42, 39.85], [116.38, 39.92], [116.46, 39.88]])
    outlier_points = [Point(xy) for xy in outliers]
    gdf_outliers = gpd.GeoDataFrame(geometry=gdf.geometry.tolist() + outlier_points)
    
    # 检测离群点
    outlier_result = spatial_outlier_detection(gdf_outliers)
    
    # 可视化
    fig, ax = plt.subplots(figsize=(10, 8))
    outlier_result[outlier_result['outlier']].plot(ax=ax, color='red', markersize=100, marker='x', label='Outliers')
    outlier_result[~outlier_result['outlier']].plot(ax=ax, color='blue', markersize=5, alpha=0.5, label='Normal')
    plt.legend()
    plt.title('Spatial Outlier Detection')
    plt.show()