#install.packages('igraph')
#install.packages('tidyverse')
#install.packages('kableExtra')
#install.packages('IsingFit')
#install.packages('NetworkToolbox')
#install.packages('bootnet')Network_Analysis_Week_8
Network Analysis
library(igraph)
library(tidyverse)
library(kableExtra)
library(IsingFit)
library(NetworkToolbox)
library(bootnet)
library(ggraph)
library(tidygraph)
library(psych)
library(patchwork)A network is made up of nodes and edges.
Nodes (vertices) often represent people, places, or things. They are the discrete entities being represented in the network.
Edges represent relationships or links between nodes.
Networks are also known as graphs and are represented by G(N, E)
#Creating a simple network
set.seed(3)
#from igraph package
g <- sample_gnp(6, .5) #Network with 6 nodes and a .5 probability of drawing an edge between two arbitrary nodes
#We can also indicate if our network is directed and if it has loops
l <- layout_with_fr(g) #Defines the layout using the Fruchterman-Reingold algorithm
plot(g,
vertex.label = c(1:6),
vertex.label.cex = 1, #size of text
vertex.color = "green2",
layout = l #We used the defined layout so that node positions stay the same every time we plot it
)Representing Networks
We can represent the relationship between nodes and edges in two distinct ways: as an edge list or an adjacency matrix
An edge list lists each relationship, having a row for every edge and listing the names of the nodes that share it in two columns. The edge list is not in any particular order from top to bottom
#Creating an edge list
edgel <- igraph::as_edgelist(g)
edgel [,1] [,2]
[1,] 1 4
[2,] 3 4
[3,] 1 5
[4,] 4 5
[5,] 1 6
[6,] 2 6
[7,] 3 6
[8,] 4 6
#kable(edgel,
#col.names = c("V1", "V2"),
#caption = "Edgelist")
#Transforming it back to network
backnet_el <- igraph::graph_from_edgelist(edgel, directed = FALSE)
plot(backnet_el,
vertex.label = c(1:6),
vertex.color = "green",
layout = l)Adjacency matrices are square matrices where both the rows and columns represent nodes in the network. Each cell in the matrix tells us whether there is an edge between two nodes. Edges are represented by nonzero values. In this case since we have an unweighted network the presence of an edge is represented by 1. Since our network is also undirected, the adjacency matrix is symmetric. The diagonal represents self-loops.
#Creating an adjacency matrix
am <- igraph::as_adjacency_matrix(g, sparse = FALSE) #Sparse is set to FALSE so that zero values (the majority) are stored.
#Defining the row and column names
rownames(am) <- 1:6
colnames(am) <- 1:6
am 1 2 3 4 5 6
1 0 0 0 1 1 1
2 0 0 0 0 0 1
3 0 0 0 1 0 1
4 1 0 1 0 1 1
5 1 0 0 1 0 0
6 1 1 1 1 0 0
#kable(am,
#caption = "Adjacency matrix",
#row.names = TRUE)
backnet_am <- igraph::graph_from_adjacency_matrix(am, mode = "undirected")
plot(backnet_am,
vertex.label = c(1:6),
vertex.color = "green",
layout = l)Nodes can also have self-loops, or an edge with itself.
#Loops
gloop <- igraph::add_edges(g, c(2, 2, 2, 3)) #Adding two edges between 2 and two (a self-loops) and between 2 and 3
plot(gloop,
vertex.label = c(1:6),
vertex.color = "green",
layout = l)amloop <- igraph::as_adjacency_matrix(gloop, sparse = FALSE)
rownames(amloop) <- 1:6
colnames(amloop) <- 1:6
amloop 1 2 3 4 5 6
1 0 0 0 1 1 1
2 0 1 1 0 0 1
3 0 1 0 1 0 1
4 1 0 1 0 1 1
5 1 0 0 1 0 0
6 1 1 1 1 0 0
#kable(amloop,
# caption = "Adjacency matrix loop g",
# row.names = TRUE)
#Removing loops
gloopless <- igraph::simplify(gloop, remove.loops = TRUE)
plot(gloopless,
vertex.label = c(1:6),
vertex.color = "green",
layout = l,
edge.loop.angle = .90)#Removing edges
original_g <- igraph::delete_edges(gloopless, igraph::get.edge.ids(gloopless, c(2, 3)))
plot(original_g,
vertex.label = c(1:6),
vertex.color = "green",
layout = l)Types of networks
There are different types of networks. The network we’ve been working with was unweighted and undirected (i.e., a simple network).
Weighted networks
In weighted networks, the edges can take values different from 0 or 1.
#Weighted networks
set.seed(14)
gw <- g
number_of_edges <- length(igraph::E(gw))
random_weights <- round(runif(number_of_edges, 1, 10))
igraph::E(gw)$weight <- random_weights #Assigning the weight numbers as the weight attribute of each edge
igraph::E(gw)$label <- igraph::E(gw)$weight #Copies the weight attributes to the edge labels
plot(gw,
layout = l,
edge.width = E(gw)$weight,
vertex.label = c(1:6),
vertex.color = "white")Directed networks
When the edges point from one point to another, the network is directed.
#Directed network
edge_list <- as_edgelist(g)
edge_list <- as.matrix(edge_list) #We tranform from a data frame into a matrix given that is what the graph_from_edgelist function (used below) requires
edge_list <- rbind(edge_list, c(4, 1)) #Adding an extra row to the edge list
edge_list [,1] [,2]
[1,] 1 4
[2,] 3 4
[3,] 1 5
[4,] 4 5
[5,] 1 6
[6,] 2 6
[7,] 3 6
[8,] 4 6
[9,] 4 1
directed_graph <- graph_from_edgelist(edge_list, directed = TRUE) #Indicates we want a directed graph
plot(directed_graph,
layout = l,
vertex.label = c(1:6),
vertex.color = "white")#Weighted directed graph
wdg <- directed_graph
E(wdg)$weight <- c(random_weights, 5) #Assigning the previous wegihts, plus 5 to the newly created edge
E(wdg)$label <- E(wdg)$weight
plot(wdg,
layout = l,
vertex.label = c(1:6),
vertex.color = "white",
edge.width = E(wdg)$weight * .6,
vertex.size = 20)#Using ggraph
plot_unweighted_undirected <- ggraph(as_tbl_graph(g), layout = "manual", x = l[,1], y = l[,2]) + #making it ggraph compatilbe and using the coordinates from l
geom_edge_link() + #straight edges between nodes
geom_node_point(size = 10, fill = "green", shape = 21, stroke = 1.5) + #Nodes size, color, shape, and border thickness
geom_node_text(aes(label = 1:6)) +
ggtitle("Unweighted, Undirected") +
theme_graph()
plot_weighted_undirected <- ggraph(as_tbl_graph(gw), layout = "manual", x = l[,1], y = l[,2]) +
geom_edge_link(aes(width = weight), alpha = 0.8) + #maps edge thickness to the weight attribute
scale_edge_width(range = c(0.5, 3)) + #Constraining edge thickness
geom_node_point(size = 10, fill = "green", shape = 21, stroke = 1.5) +
geom_node_text(aes(label = 1:6)) +
ggtitle("Weighted, Undirected") +
theme_graph()
plot_unweighted_directed <- ggraph(as_tbl_graph(directed_graph), layout = "manual", x = l[,1], y = l[,2]) +
geom_edge_link(arrow = arrow(length = unit(4, "mm")), end_cap = circle(5, "mm")) + #adding arrows and making them visually compatible with the nodes
geom_node_point(size = 10, fill = "green", shape = 21, stroke = 1.5) +
geom_node_text(aes(label = 1:6)) +
ggtitle("Unweighted, Directed") +
theme_graph()
plot_weighted_directed <- ggraph(as_tbl_graph(wdg), layout = "manual", x = l[,1], y = l[,2]) +
geom_edge_link(aes(width = weight), alpha = 0.8,
arrow = arrow(length = unit(4, "mm")), end_cap = circle(5, "mm")) +
scale_edge_width(range = c(0.5, 3)) +
geom_node_point(size = 10, fill = "green", shape = 21, stroke = 1.5) +
geom_node_text(aes(label = 1:6)) +
ggtitle("Weighted, Directed") +
theme_graph()
library(patchwork)
(plot_unweighted_undirected + plot_weighted_undirected) /
(plot_unweighted_directed + plot_weighted_directed)Let’s use some data
Hunter x Hunter is a manga series (1998-ongoing) and anime (1998 and 2011) that features the story of Gon, a 12-year old who wants to become a hunter and find his dad (who is a hunter) who abandoned him when he was little. The latest animated version (2011-2014) consists of 148 episodes and 7 arcs. The current dataset includes two files: an edge list representing which of the 67 characters featured in the 2011 anime have directly interacted with each other, and a data frame containing the names of the 67 characters with one column per arc (0 = a given character did not appear in the arc, 1 = a given character did appear in the arc).
#Hunter X Hunter Dataset
hxh_edgelist <- read_csv("https://www.dropbox.com/scl/fi/fiee4f7v4qeficzoxzyfq/HxH_edge.csv?rlkey=l8pb00ogcph3abn631th5qr81&st=c5076lzz&dl=1")
hxh_characters <- read_csv("https://www.dropbox.com/scl/fi/9dw3sv0jp4pkvngc0ai11/HxH_characters.csv?rlkey=1uvohop67u6tu8hoz9ukdrfps&st=6cke0i07&dl=1")hxh_matrix <- as.matrix(hxh_edgelist)
# Creating the network
hxh_g <- igraph::graph_from_edgelist(hxh_matrix, directed = FALSE)
# Layout
hxh_layout <- igraph::layout_with_fr(hxh_g)
#ggplot
hxh_ggplot <- ggraph(as_tbl_graph(hxh_g), layout = "manual", x = hxh_layout[,1], y = hxh_layout[,2]) +
geom_edge_link(alpha = 0.3) +
geom_node_point(size = 14, fill = "gray", shape = 21, stroke = 1) +
geom_node_text(aes(label = name), size = 2) +
ggtitle("Hunter x Hunter Network") +
theme_graph()
hxh_ggplotNetwork Metrics
Beyond just being cool visual representations of information, networks can be quantified. Network metrics allow us to measure the structural properties of a network at different scales:
Micro: Structural properties of individual nodes and edges
Meso: Structural properties of local communities or subgraphs
Macro: Structural properties of entire networks
Below is a list of the network metrics that will be reviewing today
#Measures
labels = c("$N$", "$E$", "$\\rho$", "$L$", "$D$", "$C$", "$k$", "$b$", "$c$", "$x$", "$Q$")
Definitions =c("Number of nodes",
"Number of edges",
"Density",
"Average shortest path length",
"Diameter",
"Clustering Coefficient",
"Degree",
"Betweenness centrality",
"Closeness centrality",
"Eigenvector centrality",
"Modularity"
)
# make a data frame from lists
dt = data.frame("Variable" =labels, "Definition" = Definitions)
# make table for data frame
kable(dt, booktabs = F, escape = FALSE, caption = "Basic network measures") | Variable | Definition |
|---|---|
| \(N\) | Number of nodes |
| \(E\) | Number of edges |
| \(\rho\) | Density |
| \(L\) | Average shortest path length |
| \(D\) | Diameter |
| \(C\) | Clustering Coefficient |
| \(k\) | Degree |
| \(b\) | Betweenness centrality |
| \(c\) | Closeness centrality |
| \(x\) | Eigenvector centrality |
| \(Q\) | Modularity |
Density
Density is the number of observed edges divided by the number of possible edges or the probability of selecting a random pair of nodes and finding an edge between them. For an undirected network this is:
\[\rho = \frac{2E}{N(N-1)}\]
where E is the number of edges and N is the number of nodes.
Density ranges from 0 to 1. A network with a density of 0 would have no edges connecting its nodes, whereas all the nodes would share an edge with each other if the density was 1 (this is known as a clique).
hxh_ggplot# Number of nodes
n_nodes <- vcount(hxh_g)
# Number of edges
n_edges <- ecount(hxh_g)
# Density
hxh_g_density <- edge_density(hxh_g)
cat("Number of nodes:", n_nodes, "\n")Number of nodes: 67
cat("Number of edges:", n_edges, "\n")Number of edges: 480
cat("Density:", round(hxh_g_density, 3), "\n")Density: 0.217
#Calculating density manually
manual_density <- round(2 * (n_edges) / (n_nodes*(n_nodes - 1)), 3)
manual_density[1] 0.217
Paths and path length
The path length between two nodes is the number of edges that need to be traversed to get from node i to node j
Geodesic distance (d): the minimum number of edges needed to travel between two nodes (micro level)
Average shortest path length (L) : is the average of all the geodesic distances between all pairs of nodes that have a finite path length. (meso and macro level)
Diameter (D): is the largest geodesic distance
There measures can be great to measure connectivity between a network
#Geodisic distance
geodesic_distance <- distances(hxh_g, v = "Gyro", to = "Zushi")
cat("Geodesic distance between Gyro and Zushi:", geodesic_distance, "\n")Geodesic distance between Gyro and Zushi: 4
# Average shortest path length
avg_path_length <- round(mean_distance(hxh_g, directed = FALSE), 3)
cat("Average shortest path length:", avg_path_length, "\n")Average shortest path length: 2.024
# Diameter
hxh_diameter <- diameter(hxh_g, directed = FALSE)
cat("Diameter:", hxh_diameter, "\n")Diameter: 4
Centrality
We care about centrality when we want to know more about the structural properties of nodes. Understanding how nodes differ along different centrality measures helps us understand the importance of specific nodes and the diversity within our network.
Degree
The most basic centrality measure is degree
Degree: number of edges a node has to other nodes
#Degree
# Degree of Gon, Meruem, and Zushi
gon_degree <- igraph::degree(hxh_g, v = "Gon Freecs")
meruem_degree <- igraph::degree(hxh_g, v = "Meruem")
zushi_degree <- igraph::degree(hxh_g, v = "Zushi")
cat("Gon degree:", gon_degree, "\n")Gon degree: 44
cat("Meruem degree:", meruem_degree, "\n")Meruem degree: 19
cat("Zushi degree:", zushi_degree, "\n")Zushi degree: 6
# Character with highest and lowest degree
all_degrees <- igraph::degree(hxh_g)
highest_degree_character <- names(which.max(all_degrees))
lowest_degree_character <- names(which.min(all_degrees))
cat("Highest degree character:", highest_degree_character, "with degree", max(all_degrees), "\n")Highest degree character: Killua Zoldyck with degree 49
cat("Lowest degree character:", lowest_degree_character, "with degree", min(all_degrees), "\n")Lowest degree character: Gyro with degree 2
# Top 3 highest degree characters
top_3 <- sort(all_degrees, decreasing = TRUE)[1:3]
cat("Top 3 highest degree characters:\n")Top 3 highest degree characters:
print(top_3)Killua Zoldyck Gon Freecs Hisoka
49 44 29
# Bottom 3 lowest degree characters
bottom_3 <- sort(all_degrees, decreasing = TRUE)[67:65]
cat("Bottom 3 lowest degree characters:\n")Bottom 3 lowest degree characters:
print(bottom_3) Gyro Genthru Melody
2 3 4
ggraph(as_tbl_graph(hxh_g), layout = "manual", x = hxh_layout[,1], y = hxh_layout[,2]) +
geom_edge_link(alpha = 0.3) +
geom_node_point(aes(size = all_degrees), fill = "gray", shape = 21, stroke = 1) +
geom_node_text(aes(label = name), size = 2) + #all_degrees
scale_size(range = c(3, 15)) +
ggtitle("Hunter x Hunter Network Degree") +
theme_graph() +
theme(legend.position = "none")Additionally, we can also calculate the indegree and the outdegree when we have directed networks
The indegree is the number of edges directed to a given node.
The outdegree is the numbers of edges coming from a given node.
#Another hxh edgelist, this time with the edge representing an altercation, between two characters.
hxh_fights <- read_csv("https://www.dropbox.com/scl/fi/grtbmtfq4zq7tpi44ic1v/HxH_fights.csv?rlkey=v91qkvt7w8t0m2z68e7z0icqq&st=13yocyp0&dl=1")
#head(hxh_fights)
#str(hxh_fights)
fights_dnet <- igraph::graph_from_data_frame(hxh_fights[, 1:2], directed = TRUE)
directed_layout <- igraph::layout_with_fr(fights_dnet)
#Plot
fights_dnetwork <- ggraph(as_tbl_graph(fights_dnet), layout = "manual", x = directed_layout[,1], y = directed_layout[,2]) +
geom_edge_link(arrow = arrow(length = unit(3, "mm")),
end_cap = circle(6, "mm"), alpha = 0.5) +
geom_node_point(size = 14, fill = "gray", shape = 21, stroke = 1) +
geom_node_text(aes(label = name), size = 2.5) +
ggtitle("HxH Fights — Directed Network") +
theme_graph()
fights_dnetwork# Indegree and outdegree for hxh fights graph
indegree_directed <- igraph::degree(fights_dnet, mode = "in")
outdegree_directed <- igraph::degree(fights_dnet, mode = "out")
cat("Directed Unweighted Network\n")Directed Unweighted Network
cat("Character with the most losses:", names(which.max(indegree_directed)),"with", max(indegree_directed), "\n")Character with the most losses: Chrollo with 3
cat("Character with the most wins:", names(which.max(outdegree_directed)), "with outdegree", max(outdegree_directed), "\n")Character with the most wins: Hisoka with outdegree 9
#Plotting indegrees
fights_dnetwork_outdegree <- ggraph(as_tbl_graph(fights_dnet), layout = "manual", x = directed_layout[,1], y = directed_layout[,2]) +
geom_edge_link(arrow = arrow(length = unit(3, "mm")),
end_cap = circle(6, "mm"), alpha = 0.5) +
geom_node_point(aes(size = outdegree_directed), fill = "gray", shape = 21, stroke = 1) +
geom_node_text(aes(label = name), size = 2) +
scale_size(range = c(5, 18)) +
ggtitle("HxH Fights Kills") +
theme_graph() +
theme(legend.position = "none")
fights_dnetwork_outdegreeClustering
Clustering coefficients measure the extent to which a node’s neighbors are themselves neighbors. We can measure it at the level of individual nodes or at the network level.
The local clustering coefficient measure the proportion of node’s neighbors that are neighbors of each other. It ranges from 0 to 1. If all it’s neighbors are connected it is equal to 1, if none of if neighbors are connected it is equal to 0.
\[C_i = \frac{2e_i}{k_i(k_i-1)}\]
Mathematically, the clustering coefficient of a given node i, is equal to the observed number of edges, e, between its k neighbors over the number of possible edges between its k neighbors.
We can get these coefficient for individual nodes or for the whole network.
Another clustering measure, apparently preferable at the graph level, is transitivity. Transitivity measures the proportion of all connected triplets in the network that form closed triangles.
\[T = \frac{3 \times \text{transitive triplets}}{\text{intransitive triplets }}\]
As we can see in the image, this measures can differ. The majority of individual nodes may have neighbors that are connected, even though the majority of triplets are non-transitive.
# Local clustering coefficient for specific characters
gon_lcc <- igraph::transitivity(hxh_g, type = "local", vids = which(V(hxh_g)$name == "Gon Freecs"))
#In the type argument we indicate the type of the transitivity to calculate
#In the vids aregument we indicate the ids of the vertices
killua_cc <- igraph::transitivity(hxh_g, type = "local", vids = which(V(hxh_g)$name == "Killua Zoldyck"))
zushi_cc <- igraph::transitivity(hxh_g, type = "local", vids = which(V(hxh_g)$name == "Zushi"))
cat("Local clustering coefficient: \n")Local clustering coefficient:
cat("Gon Freecs:", round(gon_lcc, 3), "\n")Gon Freecs: 0.235
cat("Killua Zoldyck:", round(killua_cc, 3), "\n")Killua Zoldyck: 0.23
cat("Zushi:", round(zushi_cc, 3), "\n")Zushi: 0.8
# Average local clustering coefficient for the whole network
avg_clustering <- igraph::transitivity(hxh_g, type = "average")
cat("\nAverage local clustering coefficient:", round(avg_clustering, 3), "\n\n")
Average local clustering coefficient: 0.788
# Transitivity
transitivity <- igraph::transitivity(hxh_g, type = "global")
cat("Transitivity:", round(transitivity, 3), "\n")Transitivity: 0.614
Betweenness
Betweenness centrality measures how often a node lies on the shortest path between two other nodes.
Mathematically, for a node i:
\[b_i = \sum_{i \neq j \neq k} \frac{\omega_{jk}(i)}{\omega_{jk}}\]
where w_jk(i) is the number of shortest paths between nodes j and k that pass through node i, and w_jk is the total number of shortest paths between j and k
gon_bw <- igraph::betweenness(hxh_g, v = "Gon Freecs", normalized = TRUE)
killua_bw <- igraph::betweenness(hxh_g, v = "Killua Zoldyck", normalized = TRUE)
zushi_bw <- igraph::betweenness(hxh_g, v = "Hisoka", normalized = TRUE)
cat("Betweenness centrality:\n")Betweenness centrality:
cat("Gon Freecs:", round(gon_bw, 3), "\n")Gon Freecs: 0.213
cat("Killua Zoldyck:", round(killua_bw, 3), "\n")Killua Zoldyck: 0.3
cat("Hisoka:", round(zushi_bw, 3), "\n")Hisoka: 0.037
#Network betweenness
all_bw <- igraph::betweenness(hxh_g, normalized = TRUE)
bw_df <- data.frame(
character = V(hxh_g)$name,
betweenness = round(all_bw, 3),
row.names = NULL) %>%
arrange(desc(betweenness))
bw_df character betweenness
1 Killua Zoldyck 0.300
2 Gon Freecs 0.213
3 Netero 0.060
4 Meleoron 0.040
5 Hisoka 0.037
6 Shaiapouf 0.032
7 Rammot 0.032
8 Ilumi Zoldyck 0.030
9 Meruem 0.028
10 Zeno Zoldyck 0.026
11 Neferpitou 0.023
12 Morel 0.023
13 Youpi 0.023
14 Kalluto Zoldyck 0.019
15 Leorio 0.018
16 Feitan 0.016
17 Ikalgo 0.015
18 Chimera Queen 0.015
19 Welfin 0.015
20 Zazan 0.011
21 Kurapika 0.010
22 Colt 0.010
23 Knov 0.009
24 Gotoh 0.006
25 Chrollo 0.006
26 Canary 0.005
27 Alluka Zoldyck 0.004
28 Kikyo Zoldyck 0.003
29 Knuckle 0.003
30 Ging Freecs 0.003
31 Silva Zoldyck 0.003
32 Cheetu 0.003
33 Komugi 0.002
34 Pakunoda 0.001
35 Uvogin 0.001
36 Bisky 0.001
37 Razor 0.001
38 Palm 0.001
39 Milluki Zoldyck 0.001
40 Pariston 0.001
41 Cheadle 0.001
42 Leol 0.001
43 Hanzo 0.000
44 Pokkle 0.000
45 Wing 0.000
46 Zushi 0.000
47 Nobunaga 0.000
48 Machi 0.000
49 Phinks 0.000
50 Shizuku 0.000
51 Franklin 0.000
52 Bolonelov 0.000
53 Shalnark 0.000
54 Kortopi 0.000
55 Kite 0.000
56 Genthru 0.000
57 Shoot 0.000
58 Melody 0.000
59 Tsubone 0.000
60 Amane 0.000
61 Bloster 0.000
62 Peggy 0.000
63 Koala 0.000
64 Gyro 0.000
65 Sadaso 0.000
66 Gido 0.000
67 Riehlvelt 0.000
cat("\nTop 5 highest betweenness:\n")
Top 5 highest betweenness:
print(head(bw_df, 5)) character betweenness
1 Killua Zoldyck 0.300
2 Gon Freecs 0.213
3 Netero 0.060
4 Meleoron 0.040
5 Hisoka 0.037
ggraph(as_tbl_graph(hxh_g), layout = "manual", x = hxh_layout[,1], y = hxh_layout[,2]) +
geom_edge_link(alpha = 0.3) +
geom_node_point(aes(size = all_bw), fill = "yellow4", shape = 21, stroke = 1) +
geom_node_text(aes(label = name), size = 1.8) +
scale_size(range = c(3, 15)) +
ggtitle("HxH Betweenness Centrality") +
theme_graph() +
theme(legend.position = "none")Closeness
Closeness centrality measures how close a node is to all other nodes in the network. It is 1 over the sum of the shortest path lengths to all other nodes. Also ranges from 0-1.
\[c_i = \frac{1}{\sum_{j \neq 1}d_{ij}}\]
A character with a large closeness centrality could reach other characters in the fewest steps on average.
#Individual closeness
gon_cl <- igraph::closeness(hxh_g, vids = "Gon Freecs", normalized = TRUE)
killua_cl <- igraph::closeness(hxh_g, vids = "Killua Zoldyck", normalized = TRUE)
hisoka_bw <- igraph::closeness(hxh_g, v = "Hisoka", normalized = TRUE)
cat("Closeness centrality:\n")Closeness centrality:
cat("Gon Freecs:", round(gon_cl, 3), "\n")Gon Freecs: 0.742
cat("Killua Zoldyck:", round(killua_cl, 3), "\n")Killua Zoldyck: 0.786
cat("Hisoka:", round(hisoka_bw, 3), "\n")Hisoka: 0.589
#Network closeness
all_cl <- igraph::closeness(hxh_g, normalized = TRUE)
cl_df <- data.frame(
character = V(hxh_g)$name,
closeness = round(all_cl, 3),
row.names = NULL
) %>% arrange(desc(closeness))
cat("\nTop 5 highest closeness:\n")
Top 5 highest closeness:
print(head(cl_df, 5)) character closeness
1 Killua Zoldyck 0.786
2 Gon Freecs 0.742
3 Netero 0.600
4 Hisoka 0.589
5 Meleoron 0.589
#Plot
ggraph(as_tbl_graph(hxh_g), layout = "manual", x = hxh_layout[,1], y = hxh_layout[,2]) +
geom_edge_link(alpha = 0.3) +
geom_node_point(aes(size = all_cl), fill = "cyan", shape = 21, stroke = 1) +
geom_node_text(aes(label = name), size = 1.8) +
scale_size(range = c(3, 15)) +
ggtitle("HxH Closeness Centrality") +
theme_graph() +
theme(legend.position = "none")Eigenvector centrality
Measures a node’s importance based on the importance of the nodes it is connected to. Having one connection to a highly central character counts for more than having many connections to peripheral characters.
\[Ax = \omega x\]
all_ev <- igraph::eigen_centrality(hxh_g, scale = TRUE)$vector
gon_ev <- all_ev["Gon Freecs"]
cat("Gon Freecs:", round(gon_ev, 3), "\n")Gon Freecs: 0.903
morel_ev <- all_ev["Morel"]
cat("Morel:", round(morel_ev, 3), "\n")Morel: 0.304
Gyro_ev <- all_ev["Gyro"]
cat("Gyro:", round(Gyro_ev, 3), "\n")Gyro: 0.029
ev_df <- data.frame(
character = names(all_ev),
eigenvector = round(all_ev, 3),
row.names = NULL
) %>% arrange(desc(eigenvector))
cat("\nTop 5 highest eigenvector centrality:\n")
Top 5 highest eigenvector centrality:
print(head(ev_df, 5)) character eigenvector
1 Killua Zoldyck 1.000
2 Gon Freecs 0.903
3 Hisoka 0.713
4 Ilumi Zoldyck 0.662
5 Kalluto Zoldyck 0.599
cat("\nBottom 5 lowest eigenvector centrality:\n")
Bottom 5 lowest eigenvector centrality:
print(tail(ev_df, 5)) character eigenvector
63 Genthru 0.115
64 Ging Freecs 0.110
65 Pariston 0.082
66 Cheadle 0.082
67 Gyro 0.029
#Plot
ggraph(as_tbl_graph(hxh_g), layout = "manual", x = hxh_layout[,1], y = hxh_layout[,2]) +
geom_edge_link(alpha = 0.3) +
geom_node_point(aes(size = all_ev), fill = "orange2", shape = 21, stroke = 1) +
geom_node_text(aes(label = name), size = 1.8) +
scale_size(range = c(3, 15)) +
ggtitle("HxH Eigenvector Centrality") +
theme_graph() +
theme(legend.position = "none")Community Detection
Networks are great for identifying communities in our data.
Community detection allow us to identify groups of nodes that are more densely connected to each other than to the rest of the network. This could reflect groups of friends, families, or assassins.
There are many algorithms to partition our data into communities. Here we would look at the Walktrap algorithm and the Louvain algorithm.
The Walktrap algorithm identifies communities by simulating short random walks through the network. The idea is that a random walker moving between nodes will tend to stay within communities, crossing between them every now and then.
The Louvain algorithm starts by assigning each node to its own community and then merges communities that increase modularity the most.
Regardless of the algorithm, in most occasions, we need calculate a modularity coefficient, Q, to figure out the partition that best assigns nodes to communities:
\[Q = \frac{1}{2E} \sum_{ij} \left( A_{ij} - \frac{k_i k_j}{2E} \right) \delta(c_i, c_j)\]
Here, Aij is the adjacency matrix, ki and kj are the degrees of nodes i and j, E is the total number of edges, and delta(ci, cj) is equal to 1 if nodes i and j are in the same community and 0 otherwise. A Q value closer to 1 indicates communitiness.
Modularity increases whenever two nodes in the same community are connected more than we would expect by chance and reduced whenever they are less than we would expect by chance.
# Walktrap community detection
walktrap_comm <- igraph::cluster_walktrap(hxh_g)
cat("Walktrap communities:", igraph::modularity(walktrap_comm), "\n")Walktrap communities: 0.4435091
cat("Number of communities:", igraph::sizes(walktrap_comm) %>% length(), "\n")Number of communities: 4
print(igraph::sizes(walktrap_comm)) #sizes() returns a named vector, with each element representing a community and it's value the number of nodesCommunity sizes
1 2 3 4
30 11 17 9
# Louvain community detection
louvain_comm <- igraph::cluster_louvain(hxh_g)
cat("\nLouvain modularity:", igraph::modularity(louvain_comm), "\n")
Louvain modularity: 0.4778082
cat("Number of communities:", igraph::sizes(louvain_comm) %>% length(), "\n")Number of communities: 4
print(igraph::sizes(louvain_comm))Community sizes
1 2 3 4
24 12 13 18
# Plot walktrap communities
set.seed(14)
ggraph(as_tbl_graph(hxh_g) %>%
mutate(community = as.factor(walktrap_comm$membership)),
layout = "manual", x = hxh_layout[,1], y = hxh_layout[,2]) +
geom_edge_link(alpha = 0.3) +
geom_node_point(aes(fill = community), size = 14, shape = 21, stroke = 1) +
geom_node_text(aes(label = name), size = 1.8) +
ggtitle("HxH Walktrap Communities") +
theme_graph() +
theme(legend.position = "none")ggraph(as_tbl_graph(hxh_g) %>%
mutate(community = as.factor(louvain_comm$membership)),
layout = "manual", x = hxh_layout[,1], y = hxh_layout[,2]) +
geom_edge_link(alpha = 0.3) +
geom_node_point(aes(fill = community), size = 14, shape = 21, stroke = 1) +
geom_node_text(aes(label = name), size = 1.8) +
ggtitle("HxH Louvain Communities") +
theme_graph() +
theme(legend.position = "none")A Psychometric Example
As you probably already knew, we can also use networks to model psychological constructs. Here the nodes could represent questionnaire items and edges the statistical relationship between them (often partial correlations).
We will look at the big5 dataset from the qgraph package that has data from 500 participants who responded to 48 items from each of the Big 5 personality traits: openness to experience, extraversion, agreeableness, conscientiousness, and neuroticism.
library(qgraph)
data(big5) #Loads this built-in dataset into our environment as a dataframe
data(big5groups) #as a list
dim(big5) # number of participants and items[1] 500 240
head(big5[, 1:5]) # first 5 items, first 6 rows N1 E2 O3 A4 C5
[1,] 2 4 4 2 4
[2,] 3 4 5 3 4
[3,] 4 1 3 2 5
[4,] 4 4 3 2 4
[5,] 5 4 4 2 4
[6,] 2 4 5 3 4
lengths(big5groups) Neuroticism Extraversion Openness Agreeableness
48 48 48 48
Conscientiousness
48
range(big5) #response scale[1] 1 5
sum(is.na(big5)) [1] 0
Then we create a correlation matrix of all the items and graph it
big5_cor <- cor(big5)
big5_net <- qgraph(big5_cor,
groups = big5groups,
threshold = .1, #removes values under the selected value
title = "Big Five Network (Groups Layout)")qgraph(big5_cor,
groups = big5groups,
threshold = .1,
layout = "spring", #specifies the type of layout
title = "Big Five Network (Spring Layout)")Just like with the HxH data, we can examine which nodes are most central in our Big FIve network.
big5_cent <- centrality_auto(big5_net) #Computes different centrality measures
big5_node_cent <- as_tibble(big5_cent$node.centrality, rownames = "item") #Extracting the centrality measures and creating tibble
describe(big5_node_cent) vars n mean sd median trimmed mad min max
item* 1 240 120.50 69.43 120.50 120.50 88.96 1.00 240.00
Betweenness 2 240 103.62 119.05 65.50 82.68 77.84 0.00 918.00
Closeness 3 240 0.00 0.00 0.00 0.00 0.00 0.00 0.00
Strength 4 240 15.13 6.03 14.93 14.94 6.14 2.33 30.69
ExpectedInfluence 5 240 5.16 4.65 5.12 5.16 4.86 -10.25 15.95
range skew kurtosis se
item* 239.00 0.00 -1.22 4.48
Betweenness 918.00 2.42 9.60 7.68
Closeness 0.00 -0.10 -0.33 0.00
Strength 28.36 0.26 -0.55 0.39
ExpectedInfluence 26.21 -0.03 -0.20 0.30
#Betweenness densty plot
big5_node_cent %>%
ggplot(aes(x = Betweenness)) +
geom_density(fill = "green4", alpha = 0.5) +
geom_vline(aes(xintercept = mean(Betweenness)),
linetype = "dashed", color = "black") +
labs(title = "Distribution of Betweenness",
subtitle = "Dashed line = mean",
x = "Betweenness",
y = "Density") +
theme_minimal()#
qgraph(big5_net,
vsize = (big5_node_cent$Betweenness / 100 + 1),
layout = "spring",
groups = big5groups,
title = "Big Five Network - Node Size = Betweenness")#Clustering
big5_clust <- clustZhang(big5_net) #local clustering coefficient
describe(big5_clust$signed_clustZhang) vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 240 0.11 0.03 0.11 0.11 0.03 0.05 0.19 0.13 0.43 -0.37 0
The small world index measures the degree to which a network is a small world network. In small world networks, there is high clustering but short path lengths, meaning you can get to every node with few steps.
#Small Worldness
smallworldIndex(big5_net)$transitivity
[1] 0.505082
$transitivity_random
[1] 0.3583681
$APL
[1] 1.640202
$APL_random
[1] 1.600794
$index
[1] 1.375531
The small world index is larger than 1, which suggests that our Big Five network has a small world structure, to an extent (stringent thresholds like 3 are regularly used as well). This in turn suggests that while the items do cluster in their respective domains/constructs, any items are reachable from anywhere. This could lead to interesting interpretations and motivate future research.
Minihacks
Minihack 1
Using the provided Game of thrones data_set, in which the edges represent the presence of an alliance at an arbitrary point in the story (heavily conditioned by Nathan’s memory), create a network and an adjacency matrix. Plot the network using ggraph.
got_edge <- read.csv("https://www.dropbox.com/scl/fi/sfj18h0x66jcotgaelprb/GOT_edge.csv?rlkey=dtoxs3rn9nc2lmwlg0we8wa95&st=lmstdxlm&dl=1")Minihack 2
Choose three measures of centrality and calculate them for three characters of your choice and for the network. Select one of the chosen centrality measures and plot the network such that the size of the nodes reflect it.
Minihack 3
Use one of the community detection algorithms to see how many communities exist in this GOT data set. Plot it using ggraph. If you have watched the show, or read the books, do the communities make sense?
Minihack 4
For this mini-hack, you are going to do a network community detection of faculty members based on who applies to work in their labs.
The data set loaded here is from the last 3 years of graduate recruitment in our department. Each column is a faculty member and each row is a applicant (with names and other information removed). The numbers are the rankings of who each applicant prefers to work with (1 = most, 5 = least). We are going to correlate each of these columns to estimate the similarity between each faculty member and create a network from that information.
Your task is to do community detection on these data using the following:
Create an adjacency matrix of each facutly memeber using a Spearman rank correlation
cor(df, method = 'spearman')Make an weighted, undirected graph from the adjacency matrix into an object.
Use the following seed value for consistency:
set.seed(1985)Use the following community detection algorithm which can handle negative edge weights:
cluster_spinglass(mygraph, implementation = "neg")Plot the network and color each node colored by its community membership with
qgraphusing the code provided below as an outline.
After you have done that, in writing below, interpret what you see. Do the clusters make sense based on what you know about the interests and areas of the faculty?
# load data
gac_df <- read_csv("https://www.dropbox.com/scl/fi/ov5fec0ddx65yh9ys5slq/GAC_data_corrected.csv?rlkey=ybggu60vh50d9kj9ufuaeufes&st=vch5iwsk&dl=1")
# make adjacency matrix using spearman rank correlation
# make your adjacency matrix into a network graph
# set seed
set.seed(1985)
# community detection clustering using cluster_spinglass()
# plot: Uncomment the code below and change the variables to fit your data
# qgraph(your_adj_matrix, groups = as.factor(your_clusters$membership),
# palette = "ggplot2",
# layout = "spring",
# theme="colorblind", labels = colnames(your_adj_matrix))