# Load the bronze greenspace_site table, keeping only the feature types of
# interest (play spaces, playing fields, public parks/gardens), and expose
# the result to Spark SQL as a temp view for the queries below.
df_greenspaces_bronze = (
    spark
    .table("geospatial.greenspaces.greenspace_site")
    .filter("function IN ('Play Space', 'Playing Field', 'Public Park Or Garden')")
)
df_greenspaces_bronze.createOrReplaceTempView("greenspace_site_bronze_vw")
# Finding the small playgrounds inside larger parks.
# Self-join of the bronze view: g1 is the contained (inner) feature, g2 the
# containing (outer) one. ST_CoveredBy(g1, g2) keeps pairs where g1 lies
# entirely within g2; the id inequality drops the trivial self-match (every
# geometry covers itself).
# The geohash (precision 5) is computed after reprojecting from British
# National Grid (EPSG:27700) to WGS84 (EPSG:4326), since geohashes are
# defined on lon/lat; repartitionByRange on it co-locates spatially-near
# rows in the same partitions for the downstream joins/aggregations.
greenspace_site_covered = spark.sql("""
SELECT
g1.id AS g1_id,
g2.id AS g2_id,
g1.function AS g1_function,
g2.function AS g2_function,
g2.distinctive_name_1 AS g2_name,
g2.geometry AS geometry,
ST_Geohash(ST_Transform(g2.geometry ,'epsg:27700','epsg:4326'), 5) AS geohash
FROM greenspace_site_bronze_vw g1
INNER JOIN greenspace_site_bronze_vw g2
ON ST_CoveredBy(g1.geometry, g2.geometry)
AND g1.id != g2.id
""").repartitionByRange(10, "geohash")
greenspace_site_covered.createOrReplaceTempView("greenspace_site_covered_vw")
# Aggregating the small playgrounds in the larger parks: one row per
# containing park (g2), listing the park's own function plus the distinct
# functions of everything it covers.
# NOTE(review): num_functions is count(*) + 1 (number of covered rows plus
# the park itself), while `functions` uses collect_set, which de-duplicates —
# the two can disagree when several covered features share a function.
# Confirm this is intended.
greenspace_site_aggregated = spark.sql("""
SELECT
g2_id AS id,
concat_ws(', ', any_value(g2_function), collect_set(g1_function)) AS functions,
count(*) + 1 AS num_functions,
g2_name AS name,
ST_Area(geometry) AS area,
geometry,
ST_Geohash(ST_Transform(geometry,'epsg:27700','epsg:4326'), 5) AS geohash
FROM greenspace_site_covered_vw
GROUP BY g2_id, g2_name, geometry
""").repartitionByRange(10, "geohash")
greenspace_site_aggregated.createOrReplaceTempView("greenspace_site_aggregated_vw")
# Find the parks that appear on neither side of the covered-by self-join
# above, i.e. greenspaces with no smaller playground inside them and not
# themselves inside a larger park.
# NOT EXISTS is used instead of the original NOT IN: NOT IN returns zero rows
# if the subquery ever yields a NULL, and NOT EXISTS lets Spark plan a proper
# anti-join instead of two full subquery scans.
greenspace_site_non_covered = spark.sql("""
SELECT id, function, 1 AS num_functions, distinctive_name_1 AS name, ST_Area(geometry) AS area, geometry,
ST_Geohash(ST_Transform(geometry,'epsg:27700','epsg:4326'), 5) AS geohash
FROM greenspace_site_bronze_vw g
WHERE NOT EXISTS (SELECT 1 FROM greenspace_site_covered_vw c WHERE c.g1_id = g.id)
AND NOT EXISTS (SELECT 1 FROM greenspace_site_covered_vw c WHERE c.g2_id = g.id)
""").repartitionByRange(10, "geohash")
greenspace_site_non_covered.createOrReplaceTempView("greenspace_site_non_covered_vw")
# Union the above two dataframes: parks with aggregated child features plus
# parks with none, giving one row per park.
# NOTE(review): UNION de-duplicates (a full DISTINCT over every column,
# including geometry). The two inputs should already be disjoint by id, so
# UNION ALL would presumably give the same result more cheaply — verify.
greenspace_site_all = spark.sql("""
SELECT * FROM greenspace_site_aggregated_vw
UNION
SELECT * FROM greenspace_site_non_covered_vw""").repartitionByRange(10, "geohash")
# Estimate the quintile breakpoints of park area (approxQuantile with a
# relative error of 0.1%), then bucket every park into one of five area
# categories labelled 20, 40, 60, 80 and 100.
quantiles = greenspace_site_all.approxQuantile(
    "area", [0.0, 0.2, 0.4, 0.6, 0.8, 1.0], 0.001
)
print("Quintile breakpoints:", quantiles)
_, q20, q40, q60, q80, _ = quantiles  # min/max breakpoints are not used below
# Build the CASE expression from the inside out: start with the top bucket
# and wrap each lower threshold around it, so the lowest matching bound wins.
area_bucket = F.lit(100)
for bound, label in ((q80, 80), (q60, 60), (q40, 40), (q20, 20)):
    area_bucket = F.when(F.col("area") <= bound, label).otherwise(area_bucket)
greenspace_site_all = greenspace_site_all.withColumn("area_category", area_bucket)
# Quick sanity check of the bucket distribution (roughly equal counts expected).
display(greenspace_site_all.groupBy("area_category").count().orderBy("area_category"))
greenspace_site_all.createOrReplaceTempView("greenspace_site_all_vw")
# In the final step, we aim to identify the county each park falls within.
# Each park is joined to every administrative boundary it intersects and the
# candidates are ranked by overlap area; keeping rank 1 selects the boundary
# that contains the largest share of the park.
# NOTE(review): RANK() keeps *all* ties — two boundaries with exactly equal
# intersection area would both survive, duplicating the park in the silver
# table. ROW_NUMBER() with a deterministic tie-breaker (e.g. b.fid) would
# guarantee one row per park; confirm whether ties can occur here.
greenspace_site_silver = spark.sql("""
WITH tmp AS (
SELECT a.id, a.functions, a.num_functions, a.name, a.area, a.area_category, a.geometry, a.geohash,
RANK() OVER(PARTITION BY a.id ORDER BY ST_Area(ST_Intersection(a.geometry, b.geometry)) DESC) AS administrative_rank,
b.fid as administrative_fid
FROM greenspace_site_all_vw a
INNER JOIN administrative_boundaries_vw b
ON ST_Intersects(a.geometry, b.geometry)
)
SELECT tmp.id, tmp.functions, tmp.num_functions, tmp.name, tmp.area, tmp.area_category, tmp.administrative_fid, tmp.geometry, tmp.geohash
FROM tmp
WHERE administrative_rank = 1
""").repartitionByRange(10, "geohash")
greenspace_site_silver.createOrReplaceTempView("greenspace_site_silver_vw")
# Write the dataframe into the corresponding silver Delta table, replacing
# its contents; mergeSchema allows the overwrite to evolve the table schema
# if columns were added or changed upstream.
# (The table name is a plain constant — the former f-string had no placeholders.)
greenspace_site_silver.write.mode("overwrite").option("mergeSchema", "true").saveAsTable("geospatial.greenspaces.greenspace_site_silver")