Network Analysis Tutorial: With Applications in R

Network Analysis Tutorial
With Applications in R  

Bruce A. Desmarais

University of Massachusetts Amherst

1  ^Introduction

1.1  ^This Tutorial

2  ^Introduction to R

2.1  ^Programming in R: First Steps

# A function call in R has the form '<function.name>(<input>)', where
# <input> is a comma-separated list of arguments.
# print() is the exception: at the console you can simply type the name
# of an object and hit enter to print it.
# Try it -- x is the only argument supplied here
print(x = "Hello World")


2.2  ^Objects: Vectors and Matrices

# Every element of a vector has the same type
# Build a character vector and print it
char_vec <- c("a", "b", "c")
char_vec

# Build a numeric vector of length 5 and print it
num_vec <- numeric(5)
num_vec

# Overwrite elements by position
num_vec[1] <- 4
num_vec[2:4] <- c(3, 2, 1)
num_vec

# Assigning a character value converts the WHOLE vector to character,
# because a vector can only hold one type
num_vec[5] <- "5"
num_vec

# A negative index selects everything except that position (here: all but 3)
num_vec[-3]

# Indexing did not modify the vector itself
num_vec

# A matrix is a vector with two dimensions
# Fill a 5 x 5 matrix with 1..25 and print it
MyMat <- matrix(1:25, nrow = 5, ncol = 5)
MyMat

# Read (or overwrite) a single cell: [row, column]
MyMat[1, 3]
MyMat[2, 4] <- 200
MyMat[2, 4]

# Leave an index empty to take a whole row or a whole column
MyMat[1, ]
MyMat[, 3] <- rep(1, 5)
MyMat[, 3]
MyMat

# Ranges and negation combine: rows 1-3, all columns except 1-3
MyMat[1:3, -(1:3)]

# Both indices empty returns the full matrix
# (this shortcut also works on network objects)
MyMat[, ]

2.3  ^Objects: Data Frames

# A data frame is the conventional object type for a dataset: its columns
# may have different types
## Create a data frame containing the matrix columns, num_vec and a
## character vector
## First construct a letter vector
let_vec <- c("a", "b", "c", "d", "e")

## Combine the objects column-wise into one data frame
## (FALSE is spelled out: the shorthand F is an ordinary variable that
## can be reassigned)
dat <- data.frame(MyMat, num_vec, let_vec, stringsAsFactors = FALSE)

## Create/override the variable (column) names
names(dat) <- c("mm1", "mm2", "mm3", "mm4", "mm5", "nv", "lv")

# Variables can be accessed by name with '$'
dat$lv

# ... or with matrix-type column indexing (lv is the 7th column)
dat[, 7]

2.4  ^R Packages

# Use install.packages() to install a package,
# and library() or require() to use it.
# Install only when a package is missing, so that re-running this script
# does not download everything again.
#   statnet - suite of great network analysis packages
#   igraph  - other great network analysis package
for (pkg in c("statnet", "igraph")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(statnet)

# R is OPEN SOURCE, SO LOOK AT THE CODE !!!
# Typing a function name without parentheses prints its definition
network.size

# And give credit to the authors
citation("statnet")

# BibTeX users
toBibtex(citation("statnet"))

2.5  ^Interactions with the Hard Drive

# Set the working directory: relative file names below are resolved
# against it. NOTE: this path is machine-specific -- change it to the
# folder that holds your copy of the tutorial data.
setwd("~/Dropbox/professional/Teaching/Consulting/UNCDataMatters/DataMattersMaterials2015/KrackhardtManagerData/")


# Save dat as a .csv file (without a column of row names)
write.csv(dat, "dat.csv", row.names = FALSE)

# Load it back in as a data frame
dat2 <- read.csv("dat.csv", stringsAsFactors = FALSE)

# Save selected objects to an RData file, then load them
save(list = c("dat2", "dat"), file = "dat_and_dat2.RData")
load("dat_and_dat2.RData")
# Be aware: load() restores objects under their saved names and
# overwrites any objects in the workspace that have the same names

# Save and load the entire workspace
save.image("ALL.RData")
load("ALL.RData")
# Beware: save.image() stores every object in the workspace, so memory
# use accumulates!!

2.6  ^R can help

# If you know the exact name of the function, open its help page
help(topic = "evcent")
# the question mark is a shorthand for the same thing
?evcent

# If you only know a keyword, search the help files for it
help.search(pattern = "eigenvector")


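R help files contain a description of the function, its usage and arguments, details, the value it returns, references, and runnable examples.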

2.7  ^Use R for graphics

# Initialize the plot
# first column of MyMat on the x-axis,
# second column on the y-axis;
# one plotting symbol (pch) and one colour per point
plot(MyMat[, 1], MyMat[, 2], pch = 1:5, cex = 3, col = 1:5)

# Draw a dashed grey line through the points
lines(MyMat[, 1], MyMat[, 2], lty = 2, col = "grey45", lwd = 3)

# Re-draw the points so they sit on top of the line
points(MyMat[, 1], MyMat[, 2], pch = 1:5, cex = 3, col = 1:5)

# Add a legend
legend("topleft", legend = c("Circle", "Triangle", "Plus", "Times", "Diamond"), col = 1:5, pch = 1:5)

# Check out all the other options
?par

# Save it this time (as a PDF): open the PDF device, repeat the same
# drawing commands, then close the device with dev.off() to write the file
pdf("myfirstplot.pdf")
plot(MyMat[, 1], MyMat[, 2], pch = 1:5, cex = 3, col = 1:5)
# (a stray empty argument after lty was removed from this call)
lines(MyMat[, 1], MyMat[, 2], lty = 2, col = "grey45", lwd = 3)
points(MyMat[, 1], MyMat[, 2], pch = 1:5, cex = 3, col = 1:5)
legend("topleft", legend = c("Circle", "Triangle", "Plus", "Times", "Diamond"), col = 1:5, pch = 1:5)
dev.off()

3  ^Introduction to Networks

3.1  ^Network Terminology and the Basics

3.2  ^Network and Network Data Types

3.3  ^Network Data

# Read in the adjacency matrices
## read.csv() creates a data frame object from a CSV file
## header = FALSE tells it the CSV has no header row
## (FALSE is spelled out: the shorthand F can be reassigned)
advice <- read.csv("Advice.csv", header = FALSE)

reportsto <- read.csv("ReportsTo.csv", header = FALSE)

# Read in the vertex attribute data (this file does have a header row)
attributes <- read.csv("KrackhardtVLD.csv")


3.4  ^ Creating Network Objects: Managers in a "Hi-Tech" Firm

# Read in the library for network analysis
library(network)

# Use the advice network dataset to create a network object.
# advice was read with read.csv() and is therefore a data frame; convert
# it to a matrix and state that it is an adjacency matrix, because
# recent versions of the network package interpret a data frame as an
# edge list rather than as an adjacency matrix.
adviceNet <- network(as.matrix(advice), matrix.type = "adjacency")

# Add the vertex attributes into the network
# (one vertex attribute per column of the attributes data frame)
set.vertex.attribute(adviceNet, names(attributes), attributes)

# Add the organizational chart as a network-level variable
set.network.attribute(adviceNet, "reportsto", reportsto)

# Simple plot
## Set random number seed so the plot is replicable
set.seed(5)
## Plot the network, labelling each vertex with its "Level" attribute
plot(
  adviceNet,
  displaylabels = TRUE,
  label = get.vertex.attribute(adviceNet, "Level"),
  vertex.cex = 2,
  label.cex = 1,
  edge.col = rgb(150, 150, 150, 100, maxColorValue = 255),
  label.pos = 5,
  vertex.col = "lightblue"
)
# check out all the options with ?plot.network

4  ^The Individual Level: Actor Position Analysis

4.1  ^Connectedness: Degree Centrality

# library() stops with an error if sna is missing; require() would only
# warn and let the script fail later
library(sna)
# (in-) Degree Centrality is the number of in-connections by node
dc <- degree(adviceNet, cmode = "indegree")

# Store in vertex level data frame
attributes$dc <- dc

# Plot degree centrality against age
## Make a simple scatter plot
plot(attributes$Age, attributes$dc)
## Add a trend (i.e., regression) line
abline(lm(dc ~ Age, data = attributes))

# Plot network with node size proportional to Degree Centrality
## First normalize degree so the largest value is 1
ndc <- dc / max(dc)
## Set random number seed so the plot is replicable
set.seed(5)
## Now plot
plot(
  adviceNet,
  displaylabels = TRUE,
  label = get.vertex.attribute(adviceNet, "Level"),
  vertex.cex = 3 * ndc,
  label.cex = 1,
  edge.col = rgb(150, 150, 150, 100, maxColorValue = 255),
  label.pos = 5,
  vertex.col = "lightblue"
)

4.2  ^Connectedness: Eigenvector Centrality

$x = \lambda^{-1} A x$
# Eigenvector Centrality recursively considers neighbors' centrality
ec <- evcent(adviceNet)

# Store in vertex level data frame
attributes$ec <- ec

# Plot eigenvector centrality against age
## Make a simple scatter plot
plot(attributes$Age, attributes$ec)
## Add a trend (i.e., regression) line
abline(lm(ec ~ Age, data = attributes))

# Plot network with node size proportional to eigenvector centrality
## First normalize so the largest value is 1
nec <- ec / max(ec)
## Set random number seed so the plot is replicable
set.seed(5)
## Now plot
plot(
  adviceNet,
  displaylabels = TRUE,
  label = get.vertex.attribute(adviceNet, "Level"),
  vertex.cex = 3 * nec,
  label.cex = 1,
  edge.col = rgb(150, 150, 150, 100, maxColorValue = 255),
  label.pos = 5,
  vertex.col = "lightblue"
)

4.3  ^Connectedness: Betweenness Centrality

# Betweenness Centrality considers unlikely connections
# Proportion of shortest paths that pass through a vertex
bc <- betweenness(adviceNet, rescale = TRUE)

# Store in vertex level data frame
attributes$bc <- bc

# Plot betweenness centrality against age
## Make a simple scatter plot
plot(attributes$Age, attributes$bc)
## Add a trend (i.e., regression) line
abline(lm(bc ~ Age, data = attributes))

# Plot network with node size proportional to betweenness centrality
## First normalize so the largest value is 1
nbc <- bc / max(bc)
## Set random number seed so the plot is replicable
set.seed(5)
## Now plot
plot(
  adviceNet,
  displaylabels = TRUE,
  label = get.vertex.attribute(adviceNet, "Level"),
  vertex.cex = 3 * nbc,
  label.cex = 1,
  edge.col = rgb(150, 150, 150, 100, maxColorValue = 255),
  label.pos = 5,
  vertex.col = "lightblue"
)

4.4  ^Comparing Centrality Measures

# Pairwise scatter plots of the three centrality scores
# degree (x) against eigenvector (y)
plot(x = dc, y = ec)

# degree (x) against betweenness (y)
plot(x = dc, y = bc)

# betweenness (x) against eigenvector (y)
plot(x = bc, y = ec)

# Correlation matrix of all three measures
cor(cbind(ec = ec, bc = bc, dc = dc))

4.5  ^Embeddedness: Clustering Coefficient

The clustering coefficient is the proportion of potential ties among a node's neighbors that actually exist.
# Read in library for clustering coefficient
# (library() errors immediately if igraph is not installed)
library(igraph)
# Compute local transitivity, i.e., the clustering coefficient
## adviceNet[,] extracts the adjacency matrix, which igraph turns into a graph
anet <- graph.adjacency(adviceNet[, ])
cc <- transitivity(anet, type = "local")

# Store in data frame
attributes$cc <- cc
attributes

# Remove igraph before using statnet functions
detach("package:igraph")

# Plot network with node size proportional to clustering coefficient
## First normalize; na.rm = TRUE because local transitivity is NaN for a
## vertex with fewer than two neighbors, and one NaN would otherwise turn
## every normalized value into NaN
ncc <- cc / max(cc, na.rm = TRUE)
## Set random number seed so the plot is replicable
set.seed(5)
## Now plot
plot(
  adviceNet,
  displaylabels = TRUE,
  label = get.vertex.attribute(adviceNet, "Level"),
  vertex.cex = 3 * ncc,
  label.cex = 1,
  edge.col = rgb(150, 150, 150, 100, maxColorValue = 255),
  label.pos = 5,
  vertex.col = "lightblue"
)

# Correlations among all of them
## complete.obs drops vertices with an undefined clustering coefficient
## instead of returning NA correlations
cor(cbind(ec, bc, dc, cc), use = "complete.obs")

5  ^Group-Level Analysis: Communities and Clusters

5.1  ^Clustering by Structural Equivalence

# Blockmodeling is the Classical SNA Approach
# Goal is to group nodes based on structural equivalence

## Create clusters based on structural equivalence
eclusts <- equiv.clust(adviceNet)

## First check out a dendrogram to eyeball the number of clusters
plot(eclusts)

# Run a block model identifying six groups
adviceBlockM <- blockmodel(adviceNet, eclusts, k = 6)

# Create block membership vector and colors
## Extract block memberships in the ORIGINAL vertex order.
## block.membership is stored in the permuted order given by order.vector,
## so it has to be indexed by the inverse permutation,
## order(order.vector); indexing by order.vector itself would permute a
## second time and attach memberships to the wrong vertices.
bmems <- adviceBlockM$block.membership[order(adviceBlockM$order.vector)]
## Create group colors (one per block)
colVec <- c("black", "white", "red", "blue", "yellow", "gray60")
## Assign colors to individual nodes based on block membership
bcols <- colVec[bmems]

## Set random number seed so the plot is replicable
set.seed(5)
## Now plot
plot(
  adviceNet,
  displaylabels = TRUE,
  label = get.vertex.attribute(adviceNet, "Level"),
  vertex.cex = 2,
  label.cex = 1,
  edge.col = rgb(150, 150, 150, 100, maxColorValue = 255),
  label.pos = 5,
  vertex.col = bcols
)

5.2  ^Clustering Based on Modularity: Community Detection

$\text{Modularity} = \frac{1}{2m} \sum_{ij} \left[ A_{ij} - \frac{k_i k_j}{2m} \right] \mathbf{1}(c_i = c_j)$
# Modularity-based community detection popular in physics
# Modularity = Dense within communities, sparse across
library(igraph)

# Convert into a graph
anet <- graph.adjacency(adviceNet[, ])

## Use semi-greedy splitting and merging
## (the algorithm is stochastic, so fix the seed to make it replicable)
set.seed(5)
mem <- spinglass.community(anet)$membership

# Check number of communities
max(mem)

# Get memberships and plot
detach("package:igraph")
## Colour every vertex by its community: index the palette with mem so
## colours follow membership rather than being recycled in vertex order.
## The two original colours are used when they suffice; otherwise a
## palette with one colour per community is generated.
nComm <- max(mem)
commCols <- if (nComm <= 2) c("lightblue", "yellow") else rainbow(nComm)
bcols <- commCols[mem]
## Set random number seed so the plot is replicable
set.seed(5)
## Now plot
plot(
  adviceNet,
  displaylabels = TRUE,
  label = get.vertex.attribute(adviceNet, "Department"),
  vertex.cex = 2,
  label.cex = 1,
  edge.col = rgb(150, 150, 150, 100, maxColorValue = 255),
  label.pos = 5,
  vertex.col = bcols
)

6  ^Network-Level: Testing Structural Hypotheses

6.1  ^Introduction

6.3  ^μ=0, the z-test

Suppose x is a large univariate sample of size n, and
$$T(x) = \frac{1}{\sqrt{n}\,\hat{\sigma}} \sum_{i=1}^{n} x_i$$
Our null is that X has finite positive variance and a mean of zero.
-->

6.3  ^H0 for a network...

Maximum Entropy Null Distribution

Uniform/Equal Probability of Every Network

6.5  ^Conditional Uniform Graph Testing: Comparing Observed to Null

# Conditional Uniform Graph Tests
# "CUG" tests allow you to control for features of the observed network in the null
# We should test for transitivity (is there even a reason for community detection?)
# gtrans function in sna package measures graph transitivity
## The test simulates 500 random graphs; set the seed so the result is
## replicable, as is done for the plots
set.seed(5)
ctDens <- cug.test(adviceNet, gtrans, cmode = "edges", reps = 500)

# Check results
ctDens

# Now let's look at something else
# Do more experienced managers have higher in-degree centrality?
# Function to estimate the correlation between in-degree and a node attribute.
#
# Arguments:
#   net  - the graph whose in-degrees are computed; passed to sna::degree()
#   attr - vector of node attribute values, passed to cor() alongside the
#          in-degrees (assumed to be one value per node, in node order)
# Returns:
#   the result of cor() between in-degree and attr
indegCor <- function(net, attr) {
  # Call sna's degree() through its namespace instead of require(sna):
  # igraph also provides a degree() that masks it while igraph is
  # attached, and require() only returns FALSE when the package is missing
  cor(sna::degree(net, cmode = "indegree"), attr)
}


## Test the in-degree/tenure correlation against random graphs with the
## same number of edges; FUN.args passes Tenure on to indegCor as attr.
## Set the seed so the simulated result is replicable.
set.seed(5)
ctDens2 <- cug.test(
  adviceNet,
  indegCor,
  cmode = "edges",
  reps = 500,
  FUN.args = list(attr = attributes$Tenure)
)

# Check results
ctDens2

## Can we incorporate both?
## Let's talk about it tomorrow!