Task

List and load spectral bands

# list files from a directory
library("terra")

Warning: package 'terra' was built under R version 4.3.2

terra 1.7.55

files = list.files("clustering_data/data/task", pattern = "\\.TIF$", full.names = TRUE)
files

[1] "clustering_data/data/task/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B1.TIF"
[2] "clustering_data/data/task/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B2.TIF"
[3] "clustering_data/data/task/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B3.TIF"
[4] "clustering_data/data/task/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B4.TIF"
[5] "clustering_data/data/task/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B5.TIF"
[6] "clustering_data/data/task/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B6.TIF"
[7] "clustering_data/data/task/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B7.TIF"

#Display metadata
# load raster data
landsat = rast(files)
landsat # calling the object displays the metadata

class       : SpatRaster 
dimensions  : 1946, 2052, 7  (nrow, ncol, nlyr)
resolution  : 30, 30  (x, y)
extent      : 562455, 624015, 5797635, 5856015  (xmin, xmax, ymin, ymax)
coord. ref. : WGS 84 / UTM zone 33N (EPSG:32633) 
sources     : LC08_L2SP_191023_20230605_20230613_02_T1_SR_B1.TIF  
              LC08_L2SP_191023_20230605_20230613_02_T1_SR_B2.TIF  
              LC08_L2SP_191023_20230605_20230613_02_T1_SR_B3.TIF  
              ... and 4 more source(s)
names       : LC08_~SR_B1, LC08_~SR_B2, LC08_~SR_B3, LC08_~SR_B4, LC08_~SR_B5, LC08_~SR_B6, ... 
min values  :           6,          21,        3531,        3123,        6811,        6857, ... 
max values  :       36332,       34628,       36371,       37924,       42025,       60674, ...

shorten and rename spectral bands

names(landsat) # original names

[1] "LC08_L2SP_191023_20230605_20230613_02_T1_SR_B1"
[2] "LC08_L2SP_191023_20230605_20230613_02_T1_SR_B2"
[3] "LC08_L2SP_191023_20230605_20230613_02_T1_SR_B3"
[4] "LC08_L2SP_191023_20230605_20230613_02_T1_SR_B4"
[5] "LC08_L2SP_191023_20230605_20230613_02_T1_SR_B5"
[6] "LC08_L2SP_191023_20230605_20230613_02_T1_SR_B6"
[7] "LC08_L2SP_191023_20230605_20230613_02_T1_SR_B7"

names(landsat) = paste0("B", 1:7) # shorten the names
names(landsat) # new names

[1] "B1" "B2" "B3" "B4" "B5" "B6" "B7"

Load vector data and transform CRS

# load vector data
poly = vect("clustering_data/data/task/Szamotuly.gpkg")
poly

 class       : SpatVector 
 geometry    : polygons 
 dimensions  : 1, 0  (geometries, attributes)
 extent      : 16.01377, 16.74077, 52.37375, 52.79406  (xmin, xmax, ymin, ymax)
 source      : Szamotuly.gpkg
 coord. ref. : lon/lat ETRS89 (EPSG:4258)

poly = project(poly, "EPSG:32633")
crs(poly, describe = TRUE)$code # check EPSG after transformation

[1] "32633"

Crop raster data to the area.

colors = gray.colors(n = 20) # define the color scheme
plot(landsat[[5]], main = "Szamotuly county", col = colors) # plot NIR band
plot(poly, add = TRUE) # add polygon

landsat = crop(landsat, poly, mask = TRUE)
plot(landsat[[5]], main = "Szamotuly county", col = colors)
plot(poly, add = TRUE)

Scale the values and remove outliers

summary(landsat)

Warning: [summary] used a sample

       B1              B2              B3              B4       
 Min.   : 5944   Min.   : 2420   Min.   : 7609   Min.   : 6939  
 1st Qu.: 7806   1st Qu.: 8046   1st Qu.: 8907   1st Qu.: 8441  
 Median : 8009   Median : 8252   Median : 9401   Median : 8835  
 Mean   : 8316   Mean   : 8613   Mean   : 9712   Mean   : 9452  
 3rd Qu.: 8601   3rd Qu.: 8902   3rd Qu.:10108   3rd Qu.: 9958  
 Max.   :19577   Max.   :20198   Max.   :21355   Max.   :22268  
 NA's   :51153   NA's   :51152   NA's   :51151   NA's   :51151  
       B5              B6              B7       
 Min.   : 7692   Min.   : 7322   Min.   : 7326  
 1st Qu.:15928   1st Qu.:11605   1st Qu.: 9348  
 Median :19210   Median :12471   Median : 9948  
 Mean   :19194   Mean   :13496   Mean   :11210  
 3rd Qu.:22071   3rd Qu.:14844   3rd Qu.:12193  
 Max.   :29478   Max.   :25647   Max.   :23481  
 NA's   :51151   NA's   :51151   NA's   :51151

landsat = landsat * 2.75e-05 - 0.2
summary(landsat)

Warning: [summary] used a sample

       B1              B2              B3              B4       
 Min.   :-0.04   Min.   :-0.13   Min.   :0.01    Min.   :-0.01  
 1st Qu.: 0.01   1st Qu.: 0.02   1st Qu.:0.04    1st Qu.: 0.03  
 Median : 0.02   Median : 0.03   Median :0.06    Median : 0.04  
 Mean   : 0.03   Mean   : 0.04   Mean   :0.07    Mean   : 0.06  
 3rd Qu.: 0.04   3rd Qu.: 0.04   3rd Qu.:0.08    3rd Qu.: 0.07  
 Max.   : 0.34   Max.   : 0.36   Max.   :0.39    Max.   : 0.41  
 NA's   :51153   NA's   :51152   NA's   :51151   NA's   :51151  
       B5              B6              B7       
 Min.   :0.01    Min.   :0.00    Min.   :0.00   
 1st Qu.:0.24    1st Qu.:0.12    1st Qu.:0.06   
 Median :0.33    Median :0.14    Median :0.07   
 Mean   :0.33    Mean   :0.17    Mean   :0.11   
 3rd Qu.:0.41    3rd Qu.:0.21    3rd Qu.:0.14   
 Max.   :0.61    Max.   :0.51    Max.   :0.45   
 NA's   :51151   NA's   :51151   NA's   :51151

Prepare the matrix for classification (remove empty values and sample).

landsat[landsat < 0] = NA
landsat[landsat > 1] = NA

plot1 = plotRGB(landsat, r = 4, g = 3, b = 2, scale = 1, stretch = "lin")

library("cluster")

mat = values(landsat)
nrow(mat) # print number of rows/pixels

[1] 2553992

mat_omit = na.omit(mat)
nrow(mat_omit)

[1] 1246565

set.seed(1)
mdl = kmeans(mat_omit, centers = 5)

mdl$centers

          B1         B2         B3         B4        B5        B6         B7
1 0.02166975 0.02913068 0.05963210 0.04659851 0.3635857 0.1548141 0.08215531
2 0.04527543 0.05493472 0.08671713 0.09044609 0.2969842 0.2398647 0.17131855
3 0.07720274 0.09108228 0.13243481 0.15542923 0.3087454 0.3347357 0.29259717
4 0.01932286 0.02474700 0.04490836 0.03620496 0.2066564 0.1240278 0.06635903
5 0.01300182 0.02149046 0.05689401 0.03608233 0.4637850 0.1186973 0.05500941

head(mdl$cluster) # display the first 6 elements

[1] 4 4 4 4 4 4

Use the k-means algorithm for clustering and validate the results with the silhouette index. Check how the results change depending on the number of clusters. Optionally, you can test another clustering method.

set.seed(1)
# draw sample indexes
idx = sample(1:nrow(mat_omit), size = 10000)
head(idx)

[1] 452737 124413 856018  25173 640775 538191

# calculate silhouette index
sil = silhouette(mdl$cluster[idx], dist(mat_omit[idx, ]))
summary(sil)

Silhouette of 10000 units in 5 clusters from silhouette.default(x = mdl$cluster[idx], dist = dist(mat_omit[idx, ])) :
 Cluster sizes and average silhouette widths:
     2405      1682      1041      2637      2235 
0.3322348 0.2783603 0.4524438 0.5989181 0.4629095 
Individual silhouette widths:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-0.1087  0.2861  0.4638  0.4352  0.5979  0.7465

colors = rainbow(n = 5)
plot(sil, border = NA, col = colors, main = "Silhouette Index")

Try to interpret what the clusters represent using a boxplot and RGB composition.

library("tidyr") # data transformation

Warning: package 'tidyr' was built under R version 4.3.2


Attaching package: 'tidyr'

The following object is masked from 'package:terra':

    extract

library("ggplot2") # data visualization

Warning: package 'ggplot2' was built under R version 4.3.2

stats = cbind(mat_omit[idx, ], cluster = mdl$cluster[idx])
stats = as.data.frame(stats)
head(stats)

         B1        B2        B3        B4        B5        B6        B7 cluster
1 0.0163700 0.0229425 0.0522025 0.0380675 0.3392750 0.1368750 0.0792075       1
2 0.0336675 0.0361700 0.0493150 0.0464825 0.1752375 0.1334375 0.0773100       4
3 0.0161500 0.0264900 0.0623225 0.0539625 0.3488725 0.1362975 0.0748900       1
4 0.0163975 0.0211275 0.0416150 0.0328150 0.1968800 0.1226025 0.0631475       4
5 0.0165075 0.0220900 0.0490950 0.0288825 0.4048075 0.1533475 0.0627350       1
6 0.0133175 0.0182950 0.0423850 0.0248950 0.4129750 0.1081650 0.0506075       5

stats = pivot_longer(stats, cols = 1:7, names_to = "band", values_to = "value")
# change the data type to factor
stats$cluster = as.factor(stats$cluster)
stats$band = as.factor(stats$band)
head(stats)

# A tibble: 6 × 3
  cluster band   value
  <fct>   <fct>  <dbl>
1 1       B1    0.0164
2 1       B2    0.0229
3 1       B3    0.0522
4 1       B4    0.0381
5 1       B5    0.339 
6 1       B6    0.137

ggplot(stats, aes(x = band, y = value, fill = cluster)) +
  geom_boxplot()

ggplot(stats, aes(x = band, y = value, fill = cluster)) +
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(values = colors) +
  facet_wrap(vars(cluster)) +
  xlab("Spectral band") +
  ylab("Reflectance") +
  theme_light()

vec = rep(NA, ncell(landsat)) # prepare an empty vector
vec[complete.cases(mat)] = mdl$cluster # assign clusters to vector if not NA
clustering = rast(landsat, nlyrs = 1, vals = vec) # create raster

Present the results on a map and choose the appropriate color scheme.

colors = c("#91632b", "#086209", "#fdd327", "#d9d9d9", "#d20000")
category = c("barren", "forest", "cropland", "bare soil", "urban")
plot2 = plot(clustering, col = colors, type = "classes", levels = category)