First, I’ll import the data I downloaded through Spotify’s API, which consists of an egocentric network of The Grateful Dead (the Ego) and their related artists out to 10 steps removed. I’ll also load all of the libraries I’m using.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(rio)
library(tidyverse)
library(tidygraph)
library(visNetwork)
library(rtweet)
library(igraph)
library(ggpubr)
library(ggimage)
library(magick)
library(ggrepel)
# Import the edge list of Grateful Dead related artists (related artists out
# to a maximum of 10 steps). The first line of the CSV is skipped, so the
# columns arrive with the default names V1, V2 and V3: V1 is dropped, V2
# becomes `artist` and V3 becomes `relative`.
dead_relatives <- import("dead_relative_10steps.csv", skip = 1) %>%
  select(-V1) %>%
  rename(artist = V2, relative = V3)

# Path to the image that is drawn behind the plots at the end of the script
img <- "support/bears_t.png"
Right now we have an edge list, where each row corresponds to an edge in the Dead’s recommendation network between an artist and a related/recommended artist. I want to turn that into a graph object.
I’ll be using the tidygraph library, which is basically a tidy wrapper around igraph, for all of the network manipulation. I’ll be using the visNetwork library to visualize the networks, since it makes nice, animated, interactive graphs.
# Interactive plot of the Spotify related-artist network
dead_relatives %>%
  # build a graph object from the edge list
  as_tbl_graph() %>%
  # work on the node table so node-level variables can be added
  activate(nodes) %>%
  # number of steps between each node and node 1
  # (presumably the Grateful Dead, i.e. the ego -- confirm against the data)
  mutate(distance = node_distance_to(nodes = 1, mode = "all")) %>%
  # mirror distance into `group`, which the legend below is labelled for
  mutate(group = distance) %>%
  # sort by distance, then name, so the distance legend comes out in order
  arrange(distance, name) %>%
  # hand the graph to visNetwork
  visIgraph(physics = TRUE) %>%
  visOptions(
    # drop-down to pick a node by name
    nodesIdSelection = TRUE,
    # drop-down to pick nodes by distance
    selectedBy = "distance",
    # highlight a node's direct neighbours on hover
    highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE)
  ) %>%
  # fixed seed for the layout
  visLayout(randomSeed = 23) %>%
  # avoidOverlap worked well here to make a clean graph
  visPhysics(forceAtlas2Based = list(avoidOverlap = 1)) %>%
  # title the group (distance) legend and disable zooming on it
  visLegend(main = "Distance", zoom = FALSE)
Next, I’ll look up the artists on Twitter and get their list of followers.
Next, we need to authenticate the session so that we can access the Twitter API. You need your keys for this part; mine are hidden from the markdown (html) file.
Now set up the authorization (again, those variables were defined in a hidden chunk above).
# Create the rtweet API token. The four key/secret objects are not defined in
# this part of the script; the text says they come from a hidden chunk.
create_token(
  app = "",
  consumer_key = c_key,
  consumer_secret = c_secret,
  access_token = a_token,
  access_secret = a_secret
)
Here I search for each of the artists’ Twitter accounts by searching for their name (as it appears on Spotify), match those search results with the artists’ names, and save that as a data frame.
# One row per related artist, so each artist is searched on Twitter only once
distinct_artists <- dead_relatives %>%
  distinct(relative) %>%
  as.data.frame()

# Search Twitter for each artist name, keep the top hit, and tag the hit with
# the artist name it was searched under. The per-artist results are collected
# in a list and bound once at the end instead of growing a data frame with
# rbind() on every iteration.
user_names <- as.character(distinct_artists$relative) %>%
  map(function(artist) {
    mutate(search_users(artist, n = 1), relative = artist)
  }) %>%
  bind_rows()
Here I flag search results whose screen name falls below a similarity threshold with the search string, in order to catch mismatches (e.g., searching for ‘moe’ and getting an account other than the band). I played around with the threshold and landed on this one as it seemed to catch most of the actual misses.
# List the search hits whose screen name looks too different from the artist
# name that was searched for
left_join(distinct_artists, user_names) %>%
  # keep the two strings and add their Levenshtein similarity,
  # which quantifies how alike two strings are (range 0 to 1)
  transmute(
    relative,
    screen_name,
    str_sim = RecordLinkage::levenshteinSim(relative, screen_name)
  ) %>%
  # show only the pairs below .4 similarity
  filter(str_sim < 0.4)
Then, I set them to corrected account names if I could find them easily and otherwise set them to missing.
# Artists whose search hit was wrong and for whom no account was found;
# their screen name is set to missing.
# NOTE(review): "Flat & Scrugs", "Strve Young" and "firefall" look like typos
# that would never match an artist name, so the likely intended spellings are
# listed alongside them. The original spellings are kept so nothing that
# matched before stops matching -- confirm against the data and prune.
no_twitter_account <- c(
  "New Riders of the Purple Sage",
  "The New Deal",
  "Doc & Merle Watson",
  "Flatt & Scruggs", "Flat & Scrugs",
  "Doc Watson",
  "Hot Rize",
  "Lowell George",
  "Leon Russell",
  "Stephen Stills",
  "The Derek Trucks Band",
  "Derek & the Dominos",
  "Tony Rice",
  "The Band",
  "Traffic",
  "Ry Cooder",
  "David Bromberg",
  "Pure Prairie League",
  "Al Kooper",
  "Quicksilver Messenger Service",
  "Savoy Brown",
  "The Electric Flag",
  "Peter Rowan",
  "Poco",
  "Firefall", "firefall",
  "Dan Fogelberg",
  "Seals and Crofts",
  "Orleans",
  "Crosby & Nash",
  "Rick Danko",
  "The Youngbloods",
  "John Phillips",
  "Steve Young", "Strve Young",
  "Jim Ford",
  "Cowboy",
  "Henry Paul Band",
  "Kelly Joe Phelps",
  "John Hammond",
  "Herb Pedersen",
  "Gene Parsons",
  "Joe Ely",
  "Kung Fu",
  "Future Rock",
  "Exmag",
  "R/D",
  "Bird of Prey",
  "Raq",
  "U-Self",
  "Life Force"
)

# Artists whose search hit was wrong but whose real handle was found by hand
# (artist name = corrected screen name)
handle_fixes <- c(
  "moe." = "moeperiod",
  "Bill Monroe" = "BILLMONROE1911",
  "Dave Mason" = "davemasonband",
  "New Monsoon" = "newmonsoon",
  "Brothers Past" = "BrothersPast",
  "The Werks" = "TheWerksMusic",
  "Space Bacon" = "SpaceBaconMusic",
  "SuperVision" = "ThatSuperVision",
  "Love & Light" = "LoveNLightMusic",
  "Nanda" = "Nanda_Musica"
)

correct_handles <- distinct_artists %>%
  left_join(user_names, by = "relative") %>%
  select(relative, screen_name) %>%
  # replace screen names with the corrected ones, or with missing if the
  # artist is not on Twitter; every other artist keeps the searched handle
  mutate(
    screen_name = case_when(
      relative %in% no_twitter_account ~ NA_character_,
      relative %in% names(handle_fixes) ~
        unname(handle_fixes[as.character(relative)]),
      TRUE ~ screen_name
    )
  )
Then, I looked up users based on the corrected twitter handles so that I would actually get those artists’ data.
# Look up the account data for every corrected handle and attach it to the
# artist it belongs to. Missing handles are dropped before the lookup so that
# NA values are not sent to the API; those artists are still kept by the left
# join, with missing account data.
dead_relatives_tw_handles <- correct_handles %>%
  left_join(
    lookup_users(
      correct_handles$screen_name[!is.na(correct_handles$screen_name)]
    ),
    by = "screen_name"
  )
Finally, I got the followers for each of the artists. Because of the rate limits, this step took a fair amount of computing time. I just wrote the files out into a subdirectory to be read back in.
# Download the follower list of every artist that has a Twitter handle and
# write it to ./twitter_data/<artist>_followers.csv.
# seq_len() keeps the loop from running when the table has zero rows
# (1:nrow() would iterate over c(1, 0)).
for (i in seq_len(nrow(dead_relatives_tw_handles))) {
  handle <- dead_relatives_tw_handles$screen_name[i]
  n_followers <- dead_relatives_tw_handles$followers_count[i]

  # artists without a Twitter account have nothing to download
  if (is.na(handle)) {
    next
  }

  # a missing follower count means the lookup returned nothing for this
  # handle; skip it rather than pass NA as the number of followers to fetch
  if (is.na(n_followers)) {
    message("Skipping ", handle, ": no follower count available")
    next
  }

  # request as many followers as the account has, waiting out rate limits
  followers <- get_followers(handle, n = n_followers, retryonratelimit = TRUE)

  # write the follower list to the subdirectory, labelled with the artist name
  export(
    followers,
    paste0(
      "./twitter_data/",
      dead_relatives_tw_handles$relative[i],
      "_followers.csv"
    )
  )
}
Next, I ran the comparisons of centrality between spotify and twitter.
Here I read in and clean the twitter follower data.
# List the follower files written by the download step. Only files ending in
# "_followers.csv" are picked up, so stray files in the directory are ignored.
follower_files <- list.files(
  "./twitter_data",
  pattern = "_followers\\.csv$",
  full.names = TRUE
)

# Recover the artist name from each file name. The suffix is matched as a
# fixed string, so the "." is not treated as a regex wildcard.
artists <- follower_files %>%
  basename() %>%
  str_remove(fixed("_followers.csv"))

# Import every file and pair each follower with the artist they follow
artist_follower <- follower_files %>%
  map(import) %>%
  # store ids as character in every file
  # (presumably so the column type is consistent across files when combined)
  map(function(followers) {
    mutate(followers, user_id = as.character(user_id))
  }) %>%
  set_names(artists) %>%
  # one row per artist, with that artist's follower table in a list column
  enframe() %>%
  rename(relative = name) %>%
  # expand the list column into one row per artist-follower pair
  unnest(value)
Here I transform the Twitter data (which is also an edge list) into a network graph, again using tidygraph.
# Build the artist-follower graph from the Twitter edge list.
# "Grateful Dead" is moved to the front of the artist levels and the rows are
# sorted by artist, so the Dead's rows come first in the edge list.
# mode = "in" is the author's way of marking every edge as an incoming tie,
# since these are follower relations.
dead_relatives_followers <- as_tbl_graph(
  arrange(
    mutate(artist_follower, relative = fct_relevel(relative, "Grateful Dead")),
    relative
  ),
  mode = "in"
)

# The same graph with its node table activated
dead_twitter_relatives <- activate(dead_relatives_followers, nodes)
Here I transform the graph from a bipartite artist-follower network into an artist-artist network, using the bipartite_mapping() and bipartite_projection() functions.
# Project the bipartite artist-follower graph onto the artists.
# Steps, from the inside out:
#   1. bipartite_mapping() assigns each node a type, which basically
#      recognizes whether the node is an artist or a follower
#   2. the tidygraph object is converted to a plain igraph object
#   3. bipartite_projection() keeps one of the two node types;
#      which = "FALSE" is intended to give the artist co-follower network,
#      whereas "TRUE" would give the follower co-followed-artist network
#      (NOTE(review): this relies on artists being typed FALSE -- confirm)
#   4. the projection is converted back to a tidygraph object
dead_relatives_followers_bip <- as_tbl_graph(
  bipartite_projection(
    as.igraph(
      mutate(
        dead_relatives_followers,
        type = bipartite_mapping(dead_relatives_followers)$type
      )
    ),
    which = "FALSE"
  )
)
Here I visualize it as a circle. The network seems to be so dense and interconnected that this was the only way to get a remotely clean looking graph.
# Interactive plot of the Twitter co-follower network
dead_relatives_followers_bip %>%
  # sort the nodes by artist name
  arrange(name) %>%
  visIgraph(physics = TRUE) %>%
  # lay the nodes out on a circle
  visIgraphLayout(layout = "layout_in_circle") %>%
  # highlight a node's direct neighbours on hover and allow selection by name;
  # TRUE is spelled out because T is an ordinary, reassignable variable
  visOptions(
    highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE),
    nodesIdSelection = TRUE
  ) %>%
  # fixed seed for the layout
  visLayout(randomSeed = 200)
Here I calculate eigenvector centrality for each of the networks and turn that into a tibble with node IDs (artist names) and their centrality scores.
# Eigen centrality of every artist in the Spotify related-artist network.
# The node table has to be active because centrality is a node property.
dead_rel_spot_imp <- as_tibble(
  mutate(
    activate(as_tbl_graph(dead_relatives), nodes),
    spotify_centrality = centrality_eigen()
  )
)

# Eigen centrality of every artist in the Twitter co-follower network
dead_rel_tw_imp <- as_tibble(
  mutate(dead_relatives_followers_bip, twitter_centrality = centrality_eigen())
)
Here I join the data and inspect the distributions, since centrality tends to be highly skewed.
# Density plot of the centrality scores in each network
p_dis <- dead_rel_spot_imp %>%
  # join the two centrality measures on artist name
  left_join(dead_rel_tw_imp, by = "name") %>%
  # reshape to one row per artist and measure
  gather(network_measure, score, -name) %>%
  # split e.g. "spotify_centrality" into network and measure columns
  separate(network_measure, c("network", "measure")) %>%
  ggplot(aes(x = score, fill = network)) +
  geom_density(alpha = .5) +
  ggthemes::theme_clean() +
  # the title has to be passed by name; an unnamed string does not set it
  labs(title = "Distribution of Importance Scores in Each Network")

# adds background image; couldn't help myself :)
ggbackground(p_dis, img)
Noting the negative skew, I applied a log (base 10) transformation to each.
# Density plot of the centrality scores in each network on a log10 x axis
p_dis_log <- dead_rel_spot_imp %>%
  # join the two centrality measures on artist name
  left_join(dead_rel_tw_imp, by = "name") %>%
  # reshape to one row per artist and measure
  gather(network_measure, score, -name) %>%
  # split e.g. "spotify_centrality" into network and measure columns
  separate(network_measure, c("network", "measure")) %>%
  ggplot(aes(x = score, fill = network)) +
  geom_density(alpha = .5) +
  # the log scale is the only difference from the unlogged density plot
  scale_x_log10() +
  ggthemes::theme_clean() +
  # the title has to be passed by name; an unnamed string does not set it
  labs(title = "Distribution of Logged Importance in Each Network")

# draw the plot over the background image
ggbackground(p_dis_log, img)
As you can see, the distributions aren’t perfectly normal but look much better.
Finally, I got a scatter plot and a smoothed linear regression line.
# Scatter plot of Spotify against Twitter centrality with a linear fit
p_reg <- ggplot(
  left_join(dead_rel_spot_imp, dead_rel_tw_imp),
  aes(x = twitter_centrality, y = spotify_centrality)
) +
  geom_point() +
  geom_smooth(method = "lm", alpha = .1) +
  # prints the correlation on the graph
  stat_cor(size = 5) +
  # both axes are log transformed because of the skew in the scores
  scale_x_log10() +
  scale_y_log10() +
  ggthemes::theme_clean() +
  labs(
    title = "Eigen Centrality in Spotify and Twitter:",
    subtitle = "Artists Related to the Grateful Dead",
    x = "Centrality in Twitter Co-Follower Network",
    y = "Centrality in Spotify Related Artist Network"
  )

# draw the plot over the background image
ggbackground(p_reg, img)
To get a better sense of what’s going on, I wanted to see who is highly central in both networks, so I labelled the points that are above .6 centrality in both networks.
# Same scatter plot, labelling the artists that are highly central in both
# networks
p_reg <- ggplot(
  left_join(dead_rel_spot_imp, dead_rel_tw_imp),
  aes(x = twitter_centrality, y = spotify_centrality)
) +
  geom_point() +
  geom_smooth(method = "lm", alpha = .1) +
  scale_x_log10() +
  scale_y_log10() +
  ggthemes::theme_clean() +
  labs(
    title = "Eigen Centrality in Spotify and Twitter:",
    subtitle = "Most Central Artists in the Dead's Networks",
    x = "Centrality in Twitter Co-Follower Network",
    y = "Centrality in Spotify Related Artist Network"
  ) +
  # label a point with the artist name only when its centrality is above .6
  # in both networks; every other point gets an empty label
  geom_label_repel(
    aes(
      label = ifelse(
        spotify_centrality > .6 & twitter_centrality > .6,
        name,
        ""
      )
    ),
    box.padding = 0.15,
    point.padding = 0.6,
    force = 20,
    segment.color = "grey50"
  )

# draw the plot over the background image
ggbackground(p_reg, img)
This calls out the least central nodes
# Same scatter plot, labelling the artists with the lowest centrality in both
# networks
p_reg <- ggplot(
  left_join(dead_rel_spot_imp, dead_rel_tw_imp),
  aes(x = twitter_centrality, y = spotify_centrality)
) +
  geom_point() +
  geom_smooth(method = "lm", alpha = .1) +
  scale_x_log10() +
  scale_y_log10() +
  ggthemes::theme_clean() +
  labs(
    title = "Eigen Centrality in Spotify and Twitter:",
    subtitle = "Least Central Artists in the Dead's Networks",
    x = "Centrality in Twitter Co-Follower Network",
    y = "Centrality in Spotify Related Artist Network"
  ) +
  # label a point only when it is below .001 on Spotify and below .00001 on
  # Twitter; every other point gets an empty label
  geom_label_repel(
    aes(
      label = ifelse(
        spotify_centrality < .001 & twitter_centrality < .00001,
        name,
        ""
      )
    ),
    box.padding = 0.15,
    point.padding = 0.6,
    force = 20,
    segment.color = "grey50"
  )

# draw the plot over the background image
ggbackground(p_reg, img)
The last thing we might want to look at are the points of divergence; which artists are more central in one network or the other?
# Same scatter plot, labelling the artists whose centrality differs most
# between the two networks
p_reg <- ggplot(
  mutate(
    left_join(dead_rel_spot_imp, dead_rel_tw_imp),
    # squared difference between the two centrality scores
    centrality_miss = (twitter_centrality - spotify_centrality)^2
  ),
  aes(x = twitter_centrality, y = spotify_centrality)
) +
  geom_point() +
  geom_smooth(method = "lm", alpha = .1) +
  scale_x_log10() +
  scale_y_log10() +
  ggthemes::theme_clean() +
  labs(
    title = "Eigen Centrality in Spotify and Twitter:",
    subtitle = "Largest Discrepancies in Centrality",
    x = "Centrality in Twitter Co-Follower Network",
    y = "Centrality in Spotify Related Artist Network"
  ) +
  # label a point only when its squared difference is above .45
  geom_label_repel(
    aes(label = ifelse(centrality_miss > .45, name, "")),
    box.padding = 0.35,
    point.padding = 0.2,
    force = 58,
    segment.color = "grey50"
  )

# draw the plot over the background image
ggbackground(p_reg, img)