代码仓库
所有东西已经打包放到GitHhub,需要的小伙伴可以直接拉取。顺手可以帮忙点个Star!
https://github.com/turbo-duck/postgre-gis
背景介绍
我们有一批坐标点和一批区域点。
比如: 100万个点,和10万个区域块。
我们遇到了这样的需求:这100万个点,分别属于哪个区域块?
初始解决
通过空间的计算方法,我们用 O(n2)的方式:
# 伪代码 for poi in poi_list: for aoi in aoi_list: # 做一些空间计算 result = compute(aoi, poi) if result: # 在这个空间 save() else: # 不在这个空间 other()
但是这样的计算时间实在是太久,于是我们用空间数据库进行优化,于是有了下文:
从 O(n2) -> O(n),之前需要跑三天三夜的结果,现在只需要短短的几秒就可以跑完。
Docker
FROM postgis/postgis:latest ENV POSTGRES_DB=space ENV POSTGRES_USER=postgres ENV POSTGRES_PASSWORD=123123 EXPOSE 5432 COPY init-db.sh /docker-entrypoint-initdb.d/
init-db.sh
#!/bin/bash set -e psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL CREATE EXTENSION IF NOT EXISTS postgis; EOSQL psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL CREATE TABLE "public"."aoi" ( "name" varchar(500) COLLATE "pg_catalog"."default" NOT NULL, "geom" geometry(GEOMETRY) NOT NULL ); ALTER TABLE "public"."aoi" OWNER TO "postgres"; CREATE INDEX "geom" ON "public"."aoi" USING btree ( "geom" "public"."btree_geometry_ops" ASC NULLS LAST ); CREATE INDEX "name" ON "public"."aoi" USING btree ( "name" COLLATE "pg_catalog"."default" "pg_catalog"."text_ops" ASC NULLS LAST ); CREATE INDEX "name_geom" ON "public"."aoi" USING btree ( "name" COLLATE "pg_catalog"."default" "pg_catalog"."text_ops" ASC NULLS LAST, "geom" "public"."btree_geometry_ops" ASC NULLS LAST ); EOSQL psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL CREATE TABLE "public"."poi" ( "name" varchar(500) COLLATE "pg_catalog"."default" NOT NULL, "geom" geometry(GEOMETRY) NOT NULL ); ALTER TABLE "public"."poi" OWNER TO "postgres"; CREATE INDEX "geom_copy1" ON "public"."poi" USING btree ( "geom" "public"."btree_geometry_ops" ASC NULLS LAST ); CREATE INDEX "name_copy1" ON "public"."poi" USING btree ( "name" COLLATE "pg_catalog"."default" "pg_catalog"."text_ops" ASC NULLS LAST ); CREATE INDEX "name_geom_copy1" ON "public"."poi" USING btree ( "name" COLLATE "pg_catalog"."default" "pg_catalog"."text_ops" ASC NULLS LAST, "geom" "public"."btree_geometry_ops" ASC NULLS LAST ); EOSQL
写入 AOI
import pandas as pd import psycopg2 from openpyxl import Workbook import re wb = Workbook() ac = wb.active line = ['name', 'data', 'origin'] ac.append(line) # 是否出现错误 error_flag = 0 conn = psycopg2.connect(database="space", user="postgres", password="123123", host="localhost") cur = conn.cursor() aoi_file = pd.read_excel("./aoi.xlsx", engine='openpyxl') line_num = aoi_file.shape[0] # 清空表 clean_sql = "DELETE FROM aoi WHERE 1=1" cur.execute(clean_sql) conn.commit() for i in range(0, line_num): get_name = aoi_file.iat[i, 0] ''' {"type": "Polygon", "coordinates": [[[ 121.342079,31.417835],[121.342105,31.417812],[121.34214,31.417802], [121.342177,31.4178],[121.343274,31.417851],[121.343306,31.41786],[121.343333,31.417879], [121.343354,31.417911],[121.343364,31.417945],[121.343424,31.419663],[121.343423,31.419707], [121.343414,31.419746],[121.343384,31.419832],[121.343351,31.419903],[121.343318,31.419927], [121.343276,31.419931],[121.341537,31.419355],[121.341508,31.419332],[121.341486,31.419301], [121.341486,31.419268],[121.341496,31.41923],[121.342056,31.417871],[121.342079,31.417835 ]]]} ''' get_geom = aoi_file.iat[i, 1] origin_geom = get_geom get_geom = re.sub(' ', '', str(get_geom)) get_geom = re.sub('MultiPolygon', 'Polygon', str(get_geom)) get_geom = re.sub("\[\[\[\[", '[[[', str(get_geom)) get_geom = re.sub("\]\]\]\]", ']]]', str(get_geom)) get_geom = re.sub('\{', ' ', str(get_geom)) get_geom = re.sub('\}', ' ', str(get_geom)) get_geom = re.sub('"type":"Polygon"', "", str(get_geom)) get_geom = re.sub('"coordinates":\[\[\[', '', str(get_geom)) get_geom = re.sub('\]\]\]', '', str(get_geom)) get_geom = re.sub(' ', '', str(get_geom)) get_geom = re.sub(',', ' ', str(get_geom)) get_geom = re.sub('\] \[', ',', str(get_geom)) print(get_name) # INSERT INTO geometries VALUES( # 'B0FFG1YVX3', # 'MULTIPOLYGON(((120.394543 36.113915,120.39454 36.113898,120.394539 36.113869,120.394548 # 36.113844,120.394586 36.113812,120.396095 36.11299,120.396118 36.112986,120.396133 36.112988 # ,120.39615 36.112998,120.39712 36.114184,120.39713 36.114219,120.397129 36.114262,120.397118 36.114297 # ,120.397101 36.114325,120.397075 36.114359,120.397043 36.114389,120.396994 36.114425,120.39584 36.115065, # 120.395818 36.11507,120.395793 36.115071,120.395762 36.115068,120.395719 36.115058,120.395686 36.115043, # 120.395661 36.115029,120.394543 36.113915)))'); sql = "INSERT INTO aoi VALUES('" + str(get_name) + "'," + \ "'MULTIPOLYGON(((" + str(get_geom) + ")))');" try: cur.execute(sql) conn.commit() except Exception as e: error_flag = 1 conn.rollback() line = [str(get_name), str(get_geom), str(origin_geom)] ac.append(line) print(e) print(sql) conn.close() # 0 no error | 1 have error if error_flag == 1: try: wb.save("./error_data.xlsx") except PermissionError: wb.save("./new_error_data.xlsx")
写入 POI
import pandas as pd import psycopg2 import re conn = psycopg2.connect(database="space", user="postgres", password="123123", host="localhost") cur = conn.cursor() poi_file = pd.read_excel("./poi.xlsx", engine='openpyxl') line_num = poi_file.shape[0] # 清空表 clean_sql = "DELETE FROM poi WHERE 1=1" cur.execute(clean_sql) conn.commit() for i in range(0, line_num): get_name = poi_file.iat[i, 0] ''' 121.56042,31.20459 ''' get_poi = poi_file.iat[i, 1] origin_poi = get_poi get_poi = re.sub(" ", "", str(get_poi)) get_poi = re.sub(",", " ", str(get_poi)) get_poi = re.sub(",", " ", str(get_poi)) print(get_name) if str(get_name) == 'nan': continue # INSERT INTO geometries VALUES( # 'B0FFG1YVX3', # 'MULTIPOLYGON(((120.394543 36.113915,120.39454 36.113898,120.394539 36.113869,120.394548 # 36.113844,120.394586 36.113812,120.396095 36.11299,120.396118 36.112986,120.396133 36.112988 # ,120.39615 36.112998,120.39712 36.114184,120.39713 36.114219,120.397129 36.114262,120.397118 36.114297 # ,120.397101 36.114325,120.397075 36.114359,120.397043 36.114389,120.396994 36.114425,120.39584 36.115065, # 120.395818 36.11507,120.395793 36.115071,120.395762 36.115068,120.395719 36.115058,120.395686 36.115043, # 120.395661 36.115029,120.394543 36.113915)))'); sql = "INSERT INTO poi VALUES('" + str(get_name) + "'," + \ "'POINT(" + str(get_poi) + ")');" try: cur.execute(sql) except Exception as e: print(e) print(f"sql => {sql}") print(f"get_name => {get_name} | get_poi => {get_poi}") exit(0) conn.commit() conn.close()
判断 POI IN AOI
import psycopg2 from openpyxl import Workbook wb = Workbook() ac = wb.active line = ['poi_name', 'aoi_name'] ac.append(line) conn = psycopg2.connect(database="nyc", user="postgres", password="123123", host="localhost") cur = conn.cursor() # 建立索引 # sql = "CREATE INDEX aoi_idx ON aoi USING GIST (geom);" # cur.execute(sql) # conn.commit() # 查询结果 sql = "SELECT aoi.name, poi.name FROM aoi, poi where ST_Contains(aoi.geom, poi.geom);" print("正在判断···") cur.execute(sql) rows = cur.fetchall() for row in rows: result_aoi = row[0] result_poi = row[1] line = [str(result_poi), str(result_aoi)] ac.append(line) print(result_poi) conn.close() print("save xlsx") wb.save("./poiInAoiResult.xlsx") print("done")