Constructing the model in the 2D space
1. Compute hexagonal grid configurations
Number of bins along the x-axis
To begin the algorithm, you need to determine the number of bins along the x-axis for creating regular hexagons in the hexagonal grid.
num_bins_x <- calculate_effective_x_bins(.data = UMAP_data, x = UMAP1,
cell_area = 1)
num_bins_x
#> [1] 6
Shape parameter
Then, you need to determine the shape parameter, which control the shape and size of the hexagons in the hexagonal grid.
shape_value <- calculate_effective_shape_value(.data = UMAP_data,
x = UMAP1, y = UMAP2)
shape_value
#> [1] 2.019414
2. Obtain hexagonal bin centroids
hexbin_data_object <- extract_hexbin_centroids(nldr_df = UMAP_data,
num_bins = num_bins_x, shape_val = shape_value)
df_bin_centroids <- hexbin_data_object$hexdf_data
df_bin_centroids
#> # A tibble: 27 × 5
#> x y hexID counts std_counts
#> <dbl> <dbl> <int> <int> <dbl>
#> 1 -2.32 -5.74 2 4 0.667
#> 2 -2.79 -4.92 8 3 0.5
#> 3 -1.84 -4.92 9 5 0.833
#> 4 -3.27 -4.09 15 5 0.833
#> 5 -2.32 -4.09 16 1 0.167
#> 6 -1.36 -4.09 17 2 0.333
#> 7 -2.79 -3.26 22 2 0.333
#> 8 -1.84 -3.26 23 1 0.167
#> 9 -0.885 -3.26 24 2 0.333
#> 10 -0.407 -2.44 32 1 0.167
#> # ℹ 17 more rows
hexbin_object <- hexbin_data_object$hb_data
str(hexbin_object)
#> Formal class 'hexbin' [package "hexbin"] with 16 slots
#> ..@ cell : int [1:27] 2 8 9 15 16 17 22 23 24 32 ...
#> ..@ count : int [1:27] 4 3 5 5 1 2 2 1 2 1 ...
#> ..@ xcm : num [1:27] -2.37 -2.8 -1.8 -3.01 -2.76 ...
#> ..@ ycm : num [1:27] -5.61 -5.08 -5.03 -3.97 -4.06 ...
#> ..@ xbins : num 6
#> ..@ shape : num 2.02
#> ..@ xbnds : num [1:2] -3.27 2.46
#> ..@ ybnds : num [1:2] -5.74 5.82
#> ..@ dimen : num [1:2] 16 7
#> ..@ n : int 75
#> ..@ ncells: int 27
#> ..@ call : language hexbin::hexbin(x = dplyr::pull(nldr_df, { { ...
#> ..@ xlab : chr [1:5] "dplyr::pull(nldr_df, {" " {" " x" " }" ...
#> ..@ ylab : chr [1:5] "dplyr::pull(nldr_df, {" " {" " y" " }" ...
#> ..@ cID : int [1:75] 15 54 62 2 23 104 54 39 54 97 ...
#> ..@ cAtt : int(0)
3. Remove low-density hexagons
## To identify low density hexagons
df_bin_centroids_low <- df_bin_centroids |>
dplyr::filter(std_counts <= 0.1666667)
## To identify low-density hexagons needed to remove by investigating neighbouring mean density
identify_rm_bins <- find_low_density_hexagons(df_bin_centroids_all = df_bin_centroids, num_bins_x = num_bins_x, df_bin_centroids_low = df_bin_centroids_low)
## To remove low-density hexagons
df_bin_centroids <- df_bin_centroids |>
dplyr::filter(!(hexID %in% identify_rm_bins))
4. Triangulate hexagonal bin centroids
Next, you need to perform triangulation on the bin centroids to construct a triangular mesh. Triangulation involves connecting the bin centroids with triangular edges to form a mesh that reveals the local structure of the data.
tr1_object <- triangulate_bin_centroids(.data = df_bin_centroids, x = x, y = y)
str(tr1_object)
#> List of 10
#> $ n : int 26
#> $ x : num [1:26] -2.32 -2.79 -1.84 -3.27 -2.32 ...
#> $ y : num [1:26] -5.74 -4.92 -4.92 -4.09 -4.09 ...
#> $ tlist: int [1:144] 3 2 1 3 2 -1 -4 4 5 1 ...
#> $ tlptr: int [1:144] 2 7 4 9 6 18 1 3 8 11 ...
#> $ tlend: int [1:26] 7 4 6 91 17 20 45 30 36 41 ...
#> $ tlnew: int 131
#> $ nc : num 0
#> $ lc : num 0
#> $ call : language tripack::tri.mesh(x = dplyr::pull(.data, { { ...
#> - attr(*, "class")= chr "tri"
tr_from_to_df <- generate_edge_info(triangular_object = tr1_object)
tr_from_to_df
#> # A tibble: 65 × 6
#> from to x_from y_from x_to y_to
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 3 -2.32 -5.74 -1.84 -4.92
#> 2 1 2 -2.32 -5.74 -2.79 -4.92
#> 3 2 5 -2.79 -4.92 -2.32 -4.09
#> 4 2 3 -2.79 -4.92 -1.84 -4.92
#> 5 3 6 -1.84 -4.92 -1.36 -4.09
#> 6 4 5 -3.27 -4.09 -2.32 -4.09
#> 7 4 7 -3.27 -4.09 -2.79 -3.26
#> 8 5 6 -2.32 -4.09 -1.36 -4.09
#> 9 6 14 -1.36 -4.09 1.02 0.0427
#> 10 6 9 -1.36 -4.09 -0.407 -2.44
#> # ℹ 55 more rows
## To draw the traingular mesh
trimesh <- ggplot(df_bin_centroids, aes(x = x, y = y)) +
geom_trimesh() +
coord_equal() +
xlab(expression(C[x]^{(2)})) + ylab(expression(C[y]^{(2)})) +
theme(axis.text = element_text(size = 5),
axis.title = element_text(size = 7))
trimesh
5. Remove long edges
## Compute 2D distances
distance <- cal_2d_dist(.data = tr_from_to_df)
## To find the benchmark value to remove long edges
benchmark <- find_benchmark_value(.data = distance, distance_col = distance)
benchmark
#> [1] 1.653
## To draw the traingular mesh after remove long edges in 2D
trimesh_removed <- remove_long_edges(.data = distance, benchmark_value = benchmark,
triangular_object = tr1_object, distance_col = distance) +
xlab(expression(C[x]^{(2)})) + ylab(expression(C[y]^{(2)})) +
theme(axis.text = element_text(size = 5),
axis.title = element_text(size = 7))
trimesh_removed
Lifting the model into high dimensions
To extend the model created in the 2D space to higher dimensions, you need to take the average of the high-dimensional coordinates within hexagonal bins.
## Add hexbin Ids for the 2D embeddings
UMAP_data_with_hb_id <- UMAP_data |>
dplyr::mutate(hb_id = hexbin_data_object$hb_data@cID)
UMAP_data_with_hb_id
#> # A tibble: 75 × 4
#> UMAP1 UMAP2 ID hb_id
#> <dbl> <dbl> <int> <int>
#> 1 -2.81 -3.91 1 15
#> 2 0.959 -0.00271 2 54
#> 3 1.54 0.462 3 62
#> 4 -2.31 -5.50 4 2
#> 5 -1.76 -3.46 6 23
#> 6 1.53 5.75 7 104
#> 7 0.930 -0.175 8 54
#> 8 0.319 -1.61 9 39
#> 9 1.37 0.0541 11 54
#> 10 1.90 4.94 12 97
#> # ℹ 65 more rows
## To generate a data set with high-dimensional training data and 2D embeddings with hexagonal IDs
df_all <- dplyr::bind_cols(training_data |> dplyr::select(-ID), UMAP_data_with_hb_id)
df_all
#> # A tibble: 75 × 11
#> x1 x2 x3 x4 x5 x6 x7 UMAP1 UMAP2
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.120 0.114 -1.99 -0.00246 -0.0178 -0.0181 -3.17e-3 -2.81 -3.91
#> 2 -0.0492 0.822 0.00121 0.0161 0.00968 -0.0834 2.30e-3 0.959 -0.00271
#> 3 -0.774 0.243 0.367 -0.0198 0.00408 -0.0349 -9.11e-3 1.54 0.462
#> 4 -0.606 1.96 -1.80 0.0132 -0.000479 -0.00478 -8.43e-3 -2.31 -5.50
#> 5 0.818 0.0388 -1.58 0.00253 0.00167 0.0781 -7.71e-3 -1.76 -3.46
#> 6 0.910 1.55 1.42 0.0124 0.0160 -0.00248 -8.32e-3 1.53 5.75
#> 7 -0.0691 0.978 0.00239 0.0115 0.00350 0.0898 3.59e-3 0.930 -0.175
#> 8 0.859 1.55 -0.488 -0.00753 -0.0123 0.0336 -6.65e-3 0.319 -1.61
#> 9 -0.0400 0.286 0.000801 0.0123 0.00613 -0.0121 -3.47e-4 1.37 0.0541
#> 10 0.765 0.898 1.64 -0.0178 0.0151 -0.0710 -6.24e-3 1.90 4.94
#> # ℹ 65 more rows
#> # ℹ 2 more variables: ID <int>, hb_id <int>
df_bin <- avg_highD_data(.data = df_all)
df_bin
#> # A tibble: 27 × 8
#> hb_id x1 x2 x3 x4 x5 x6 x7
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2 -0.637 1.74 -1.76 0.00953 -0.00143 -0.0117 -0.00152
#> 2 8 -0.727 1.07 -1.43 0.00721 0.0138 -0.0529 -0.000473
#> 3 9 0.171 1.56 -1.93 0.00613 -0.00726 -0.0212 -0.00198
#> 4 15 -0.424 0.196 -1.86 -0.00854 -0.00598 -0.0195 0.00165
#> 5 16 -0.186 0.532 -1.98 0.0189 0.00972 -0.0112 -0.00900
#> 6 17 0.603 1.00 -1.77 0.0158 -0.00995 0.00976 0.000755
#> 7 22 0.0548 0.0438 -1.96 0.00858 0.0115 -0.0317 -0.00197
#> 8 23 0.818 0.0388 -1.58 0.00253 0.00167 0.0781 -0.00771
#> 9 24 0.943 0.757 -1.32 -0.00720 -0.00319 0.0572 0.00230
#> 10 32 0.988 0.673 -0.848 0.0177 -0.00890 0.0114 0.00460
#> # ℹ 17 more rows
Visualize the model and high-dimensional data in the high-dimensional space
To visualize the model and the high-dimensional data, tour technique is used.
tour1 <- show_langevitour(df_all, df_bin, df_bin_centroids, benchmark_value = benchmark,
distance = distance, distance_col = 'distance')
tour1