# Simulate a fictional survey of One Direction fans. Every variable below is a
# length-n vector. The random draws happen in the order written, so reordering
# or adding draws changes the simulated values.
set.seed(03252015) # makes the random numbers reproducible; this is the date Mr. Zayn Malik left the group FYI. (R reads the literal as the number 3252015.)
n <- 100 # number of participants
# Participant IDs
id <- 1:n
# Age: normal draw (mean 26, sd 4) rounded to whole years, roughly 16-40
age <- round(rnorm(n, mean = 26, sd = 4))
age <- pmax(pmin(age, 40), 16) # restrict to [16, 40]
# Favorite member
favorite_member <- sample(
c("Harry", "Niall", "Liam", "Louis", "Zayn"),
size = n, replace = TRUE,
# NOTE(review): these weights sum to 1.10, not 1. sample() treats prob as
# weights and rescales them, so the effective probabilities are each value
# divided by 1.10.
prob = c(0.35, 0.20, 0.20, 0.15, 0.20) # Harry most popular (sorry)
)
# Years as a fan: uniform draw on [1, 13] rounded to whole years
years_fan <- round(runif(n, min = 1, max = 13))
# Stan level: ordinal in meaning (casual < moderate < hardcore), but stored
# here as a plain character vector — influences spending and concerts
stan_level <- sample(
c("casual", "moderate", "hardcore"),
size = n, replace = TRUE,
prob = c(0.35, 0.45, 0.20)
)
# Concerts attended: hardcore fans go to more
# Each sample() call draws a full length-n vector; ifelse() then picks, for
# each participant, the element from the branch matching their stan level.
concerts_attended <- ifelse(stan_level == "hardcore", sample(3:12, n, replace = TRUE),
ifelse(stan_level == "moderate", sample(1:5, n, replace = TRUE),
sample(0:2, n, replace = TRUE)))
# Merch spending per year (USD): correlated with stan level
# Same pattern as above: one length-n normal draw per stan level, selected
# elementwise by ifelse().
merch_spending <- ifelse(stan_level == "hardcore", round(rnorm(n, mean = 300, sd = 80)),
ifelse(stan_level == "moderate", round(rnorm(n, mean = 120, sd = 40)),
round(rnorm(n, mean = 30, sd = 15))))
merch_spending <- pmax(merch_spending, 0) # no negative spending
# Still follows solo careers?
# Probability of TRUE rises with stan level: 0.35 casual, 0.65 moderate,
# 0.90 hardcore.
still_follows_solo <- ifelse(stan_level == "hardcore",
sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.90, 0.10)),
ifelse(stan_level == "moderate",
sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.65, 0.35)),
sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.35, 0.65))))
# Put it all together
onedirection_fan_data <- data.frame(id, age, favorite_member, years_fan, stan_level,
concerts_attended, merch_spending, still_follows_solo)
General Programming in R
Background
You’ve seen R before! We’ll cover the core building blocks — data structures, indexing, and functions.
Our Dataset: Simulated Survey
Throughout this tutorial, we’ll work with a simulated dataset from a fictional survey of One Direction fans. Participants reported their age, favorite member, how long they’ve been a fan, how many concerts they’ve attended, how much they spend on merch per year, their self-identified stan level, and whether they still follow the members’ solo careers.
Why simulate data? You can use it to test your analysis code before collecting real data, and it helps you understand what your variables actually are.
Let’s take a quick look:
We can preview the first few rows of the data (six by default) using head(). And we can look at the structure of the variables using str().
head(onedirection_fan_data) id age favorite_member years_fan stan_level concerts_attended merch_spending
1 1 25 Louis 4 hardcore 7 407
2 2 27 Harry 7 moderate 5 70
3 3 17 Harry 9 hardcore 10 336
4 4 22 Niall 7 hardcore 9 256
5 5 25 Louis 5 hardcore 4 271
6 6 29 Niall 2 moderate 5 175
still_follows_solo
1 TRUE
2 FALSE
3 TRUE
4 TRUE
5 FALSE
6 FALSE
str(onedirection_fan_data)'data.frame': 100 obs. of 8 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ age : num 25 27 17 22 25 29 23 29 22 31 ...
$ favorite_member : chr "Louis" "Harry" "Harry" "Niall" ...
$ years_fan : num 4 7 9 7 5 2 1 6 7 6 ...
$ stan_level : chr "hardcore" "moderate" "hardcore" "hardcore" ...
$ concerts_attended : int 7 5 10 9 4 5 1 3 4 0 ...
$ merch_spending : num 407 70 336 256 271 175 142 157 226 36 ...
$ still_follows_solo: logi TRUE FALSE TRUE TRUE FALSE FALSE ...
Data Structures
R works with several types of objects. Knowing which type you’re dealing with is really really important!!
This table from Advanced R summarizes the basic data structures:
| | Homogeneous data | Heterogeneous data |
|---|---|---|
| 1-Dimensional | Atomic Vector | List |
| 2-Dimensional | Matrix | Data frame |
| N-Dimensional | Array | |
Atomic Vectors
A vector is the simplest data structure in R: a sequence of values of the same type.
We can create simple vectors in this format, vector_name <- c(x, y, z).
logical_vec <- c(TRUE, FALSE, TRUE)
integer_vec <- c(1L, 2L, 3L)
double_vec <- c(1.1, 2.5, 3.9)
character_vec <- c("Harry", "Niall", "Liam", "Louis", "Zayn")
Use typeof() to see the underlying type:
# Get type using typeof()
typeof(logical_vec)[1] "logical"
typeof(onedirection_fan_data$still_follows_solo)[1] "logical"
# integers are whole numbers, numeric vectors can hold decimals
typeof(integer_vec)[1] "integer"
typeof(onedirection_fan_data$concerts_attended)[1] "integer"
# doubles are R's default type for numbers (anything typed without the L suffix)
typeof(double_vec)[1] "double"
typeof(character_vec)[1] "character"
typeof(onedirection_fan_data$favorite_member)[1] "character"
Use length() to count elements in our vectors:
length(character_vec)[1] 5
length(merch_spending)[1] 100
Relevance: Your condition labels, participant IDs, and response options are all vectors. Almost everything in R is built on top of them.
Factors
A factor is a special vector for categorical data. It stores data as integer codes with associated labels. Perfect for conditions, groups, or any nominal/ordinal variable.
# Convert favorite_member to a factor
typeof(onedirection_fan_data$favorite_member)[1] "character"
onedirection_fan_data$favorite_member <- factor(onedirection_fan_data$favorite_member)
# Inspect it
levels(onedirection_fan_data$favorite_member)[1] "Harry" "Liam" "Louis" "Niall" "Zayn"
nlevels(onedirection_fan_data$favorite_member)[1] 5
table(onedirection_fan_data$favorite_member)
Harry Liam Louis Niall Zayn
33 23 14 14 16
You can also create ordered factors for ordinal data (e.g., Likert scales):
# Convert stan_level to an ordered factor
onedirection_fan_data$stan_level <- factor(onedirection_fan_data$stan_level,
levels = c("casual", "moderate", "hardcore"),
ordered = TRUE)
head(onedirection_fan_data$stan_level)[1] hardcore moderate hardcore hardcore hardcore moderate
Levels: casual < moderate < hardcore
levels(onedirection_fan_data$stan_level)[1] "casual" "moderate" "hardcore"
# Ordered factors support comparisons
onedirection_fan_data$stan_level[1] > onedirection_fan_data$stan_level[2][1] TRUE
Coercion: Converting Between Types
Sometimes R reads your data in as the wrong type — for example, numbers stored as characters (common in Qualtrics exports). You can force a conversion using as.numeric(), as.factor(), as.character(), etc.
# Character to numeric
x <- c("1", "2", "3")
class(x)[1] "character"
x_num <- as.numeric(x)
class(x_num)[1] "numeric"
# Character to factor
# Create a quick character vector and convert it
member_char <- c("Harry", "Niall", "Liam", "Louis", "Zayn")
class(member_char)[1] "character"
member_factor <- as.factor(member_char)
class(member_factor)[1] "factor"
levels(member_factor)[1] "Harry" "Liam" "Louis" "Niall" "Zayn"
# What happens when coercion fails?
as.numeric(c("1", "two", "3")) # "two" becomes NA with a warningWarning: NAs introduced by coercion
[1] 1 NA 3
Relevance: R needs to know a variable is a factor to treat it as categorical in statistical models. If you run a regression and your grouping column is stored as a character, R may not handle it the way you expect.
Lists
A list can hold elements of different types, even other lists. Unlike a vector where everything has to be the same type, a list can mix characters, numbers, logicals, vectors, and even other lists all in one object.
This makes lists useful for storing information that belongs together but doesn’t fit neatly into a data frame, like metadata about your study.
survey_info <- list(
name = "1D Fan Survey", # character
n_subjects = n, # numeric
members = c("Harry", "Niall", "Liam", "Louis", "Zayn"), # vector
pilot = FALSE # logical
)
You can access elements two ways — with $ by name, or with [[]] by name or position:
survey_info$name # using $[1] "1D Fan Survey"
survey_info[["name"]] # using [[]] by name — same result[1] "1D Fan Survey"
survey_info[[1]] # using [[]] by position[1] "1D Fan Survey"
# You can also check the structure of the whole list
str(survey_info)List of 4
$ name : chr "1D Fan Survey"
$ n_subjects: num 100
$ members : chr [1:5] "Harry" "Niall" "Liam" "Louis" ...
$ pilot : logi FALSE
Note: A single [] on a list returns another list. [[]] returns the actual element inside.
Data Frames
A data frame is the most common structure for research data. It’s like a spreadsheet: each column is a vector (all values within a column share one type, but different columns can have different types), and each row is an observation.
nrow(onedirection_fan_data) # number of rows (participants)[1] 100
ncol(onedirection_fan_data) # number of columns (variables)[1] 8
dim(onedirection_fan_data) # both at once[1] 100 8
names(onedirection_fan_data) # column names[1] "id" "age" "favorite_member"
[4] "years_fan" "stan_level" "concerts_attended"
[7] "merch_spending" "still_follows_solo"
summary(onedirection_fan_data) # quick descriptive stats for each column id age favorite_member years_fan stan_level
Min. : 1.00 Min. :17.00 Harry:33 Min. : 1.00 casual :38
1st Qu.: 25.75 1st Qu.:23.00 Liam :23 1st Qu.: 3.00 moderate:39
Median : 50.50 Median :25.00 Louis:14 Median : 7.00 hardcore:23
Mean : 50.50 Mean :25.40 Niall:14 Mean : 6.51
3rd Qu.: 75.25 3rd Qu.:27.25 Zayn :16 3rd Qu.: 9.00
Max. :100.00 Max. :32.00 Max. :13.00
concerts_attended merch_spending still_follows_solo
Min. : 0.00 Min. : 0.0 Mode :logical
1st Qu.: 1.00 1st Qu.: 32.0 FALSE:44
Median : 2.00 Median : 88.0 TRUE :56
Mean : 3.12 Mean :127.3
3rd Qu.: 5.00 3rd Qu.:184.0
Max. :12.00 Max. :421.0
Indexing & Subsetting Data
Sometimes we’re only working with part of the whole dataset. We can quickly look through our data by indexing them.
Indexing Vectors
merch_spending[1] # first element[1] 407
merch_spending[1:5] # elements 1 through 5[1] 407 70 336 256 271
merch_spending[c(1, 3, 5)] # elements 1, 3, and 5[1] 407 336 271
merch_spending[merch_spending > 300] # elements greater than 300 [1] 407 336 316 370 339 421 385 304 311 350 409
Data Frames
Data frames are indexed with [rows, columns]:
onedirection_fan_data[1, ] # first row, all columns id age favorite_member years_fan stan_level concerts_attended merch_spending
1 1 25 Louis 4 hardcore 7 407
still_follows_solo
1 TRUE
onedirection_fan_data[, "age"] # age column, all rows (returns a vector) [1] 25 27 17 22 25 29 23 29 22 31 22 22 30 24 26 24 23 27 30 28 31 26 28 31 23
[26] 22 25 27 26 24 27 26 25 22 26 21 28 31 26 27 21 26 32 22 30 26 30 23 22 21
[51] 25 22 26 27 31 26 22 25 18 24 27 21 26 23 30 21 27 26 22 24 30 18 24 27 24
[76] 23 22 25 32 29 30 26 25 26 24 21 28 29 18 21 32 25 29 23 32 25 23 25 23 27
onedirection_fan_data$merch_spending # same as above using $ notation [1] 407 70 336 256 271 175 142 157 226 36 298 256 222 106 39 316 370 196
[19] 134 40 9 32 30 19 22 142 88 135 31 29 44 19 62 154 95 339
[37] 85 14 32 0 180 421 123 289 35 31 19 272 67 58 4 138 32 279
[55] 385 4 23 304 311 24 350 42 54 107 178 22 141 17 33 14 86 27
[73] 57 285 76 108 53 88 107 409 33 213 50 4 79 274 146 23 56 139
[91] 298 135 16 143 90 49 162 116 18 0
onedirection_fan_data[1:5, c("id", "favorite_member", "stan_level", "merch_spending")] id favorite_member stan_level merch_spending
1 1 Louis hardcore 407
2 2 Harry moderate 70
3 3 Harry hardcore 336
4 4 Niall hardcore 256
5 5 Louis hardcore 271
Tip: Leaving the row or column slot empty means “give me everything.”
Logical Subsetting
This is how you filter data, selecting rows that meet a condition:
# Only hardcore fans
hardcore <- onedirection_fan_data[onedirection_fan_data$stan_level == "hardcore", ]
nrow(hardcore)[1] 23
# Fans who spend more than $200 on merch
big_spenders <- onedirection_fan_data[onedirection_fan_data$merch_spending > 200, ]
head(big_spenders) id age favorite_member years_fan stan_level concerts_attended merch_spending
1 1 25 Louis 4 hardcore 7 407
3 3 17 Harry 9 hardcore 10 336
4 4 22 Niall 7 hardcore 9 256
5 5 25 Louis 5 hardcore 4 271
9 9 22 Zayn 7 hardcore 4 226
11 11 22 Zayn 11 hardcore 5 298
still_follows_solo
1 TRUE
3 TRUE
4 TRUE
5 FALSE
9 TRUE
11 TRUE
# Harry fans who still follow solo careers
# still_follows_solo is already logical, so it can be used directly as a
# condition — comparing it with == TRUE is redundant.
harry_solo <- onedirection_fan_data[
  onedirection_fan_data$favorite_member == "Harry" &
    onedirection_fan_data$still_follows_solo,
]
nrow(harry_solo)[1] 16
Working with Data
Reading in Data
Sometimes you’ll load in data from a CSV (e.g., exported from Qualtrics or a spreadsheet):
# Read a CSV directly from a URL (read.csv() also accepts a local file path)
my_data <- read.csv("https://www.dropbox.com/scl/fi/ol2shg9pbof0x2595in38/sim_onedirection_fan_data.csv?rlkey=9uuz318hhyfw253omdv696vao&dl=1")
# Save a local copy as example_data.csv in the current working directory
write.csv(my_data, "example_data.csv", row.names = FALSE)
# Check where R is looking
getwd()[1] "/Users/rchavez/Library/CloudStorage/GoogleDrive-robert.s.chavez@gmail.com/My Drive/Teaching/PSY_607_data_science/2026s/class_tutorials"
# Set a new working directory if needed
# setwd("/path/to/your/folder")
# Session > Set Working Directory > Choose Directory
Inspecting and Cleaning
We can’t have dirty data!
# Check for missing values
sum(is.na(my_data))[1] 5
colSums(is.na(my_data)) id age favorite_member years_fan
0 2 0 1
stan_level concerts_attended merch_spending still_follows_solo
0 0 1 1
sum(is.na(onedirection_fan_data))[1] 0
colSums(is.na(onedirection_fan_data)) id age favorite_member years_fan
0 0 0 0
stan_level concerts_attended merch_spending still_follows_solo
0 0 0 0
# Rename a column (base R approach)
names(onedirection_fan_data)[names(onedirection_fan_data) == "merch_spending"] <- "merch_usd"
names(onedirection_fan_data)[1] "id" "age" "favorite_member"
[4] "years_fan" "stan_level" "concerts_attended"
[7] "merch_usd" "still_follows_solo"
# Rename back
names(onedirection_fan_data)[names(onedirection_fan_data) == "merch_usd"] <- "merch_spending"
Descriptive Statistics
Base R has all the tools you need for quick summaries.
mean(onedirection_fan_data$merch_spending)[1] 127.31
median(onedirection_fan_data$merch_spending)[1] 88
sd(onedirection_fan_data$merch_spending)[1] 115.7802
var(onedirection_fan_data$merch_spending)[1] 13405.04
range(onedirection_fan_data$merch_spending)[1] 0 421
min(onedirection_fan_data$merch_spending)[1] 0
max(onedirection_fan_data$merch_spending)[1] 421
summary() gives you everything at once:
summary(onedirection_fan_data$merch_spending) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 32.0 88.0 127.3 184.0 421.0
summary(onedirection_fan_data$concerts_attended) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 1.00 2.00 3.12 5.00 12.00
Frequency tables with table():
table(onedirection_fan_data$favorite_member)
Harry Liam Louis Niall Zayn
33 23 14 14 16
table(onedirection_fan_data$favorite_member, onedirection_fan_data$stan_level) # cross-tabulation
casual moderate hardcore
Harry 14 13 6
Liam 9 8 6
Louis 6 5 3
Niall 4 7 3
Zayn 5 6 5
Relevance: You’ll run these before any inferential statistics. They help you catch data entry errors, check distributions, and understand your sample.
The apply Family
The apply functions let you apply a function across rows, columns, or groups without writing a loop.
apply() — over rows or columns of a matrix or data frame
# Create a numeric-only subset
numeric_onedirection_fan_data <- onedirection_fan_data[, c("age", "years_fan", "concerts_attended", "merch_spending")]
# Column means
apply(numeric_onedirection_fan_data, MARGIN = 2, FUN = mean) age years_fan concerts_attended merch_spending
25.40 6.51 3.12 127.31
tapply() — apply a function by group
Summarize a variable by group.
# Mean merch spending by stan level
tapply(onedirection_fan_data$merch_spending, onedirection_fan_data$stan_level, mean) casual moderate hardcore
28.28947 114.92308 311.91304
# Mean concerts attended by favorite member
tapply(onedirection_fan_data$concerts_attended, onedirection_fan_data$favorite_member, mean) Harry Liam Louis Niall Zayn
3.333333 2.391304 2.571429 4.214286 3.250000
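sapply() — apply over a list or vector, simplify the result when possible
# Get the class of each column (returned as a list here, because stan_level has two classes and the result can't be simplified to a vector)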
sapply(onedirection_fan_data, class)$id
[1] "integer"
$age
[1] "numeric"
$favorite_member
[1] "factor"
$years_fan
[1] "numeric"
$stan_level
[1] "ordered" "factor"
$concerts_attended
[1] "integer"
$merch_spending
[1] "numeric"
$still_follows_solo
[1] "logical"
# Get descriptive stats for numeric columns
sapply(numeric_onedirection_fan_data, function(x) c(mean = mean(x), sd = sd(x))) age years_fan concerts_attended merch_spending
mean 25.400000 6.510000 3.120000 127.3100
sd 3.452418 3.534805 2.944881 115.7802
Conditional Logic & Loops
General form:
if (condition){ Do something } else { Do something different }
if / else
avg_spending <- mean(onedirection_fan_data$merch_spending, na.rm = TRUE)
avg_spending[1] 127.31
if (avg_spending >= 200) {
print("This is a high-spending fanbase")
} else if (avg_spending >= 75) {
print("This is a moderate-spending fanbase")
} else {
print("This is a low-spending fanbase")
}[1] "This is a moderate-spending fanbase"
# if/else only works on one value at a time
# To apply logic across a whole column, use ifelse()
onedirection_fan_data$spender_type <- ifelse(
onedirection_fan_data$merch_spending > 150, "big spender", "small spender"
)
table(onedirection_fan_data$spender_type)
big spender small spender
31 69
for Loops
# Let's loop through each One Direction member
# and print how many fans in our survey picked them as their favorite
for (member in levels(onedirection_fan_data$favorite_member)) {
# subset to just fans of this member
fan_subset <- onedirection_fan_data[onedirection_fan_data$favorite_member == member, ]
# count them
n_fans <- nrow(fan_subset)
cat(member, "has", n_fans, "fans in our sample\n")
}Harry has 33 fans in our sample
Liam has 23 fans in our sample
Louis has 14 fans in our sample
Niall has 14 fans in our sample
Zayn has 16 fans in our sample
# Now let's add a little more — mean merch spending per member
for (member in levels(onedirection_fan_data$favorite_member)) {
fan_subset <- onedirection_fan_data[onedirection_fan_data$favorite_member == member, ]
avg_spend <- round(mean(fan_subset$merch_spending, na.rm = TRUE), 2)
cat(member, "fans spend an average of $", avg_spend, "on merch\n")
}Harry fans spend an average of $ 122.73 on merch
Liam fans spend an average of $ 132.3 on merch
Louis fans spend an average of $ 122.43 on merch
Niall fans spend an average of $ 140.14 on merch
Zayn fans spend an average of $ 122.62 on merch
Tip: In R, apply functions can sometimes be faster for working with data frames. But there are a million ways you can do the same thing in R. How cool!
Mini-hacks
Mini-hack 1:
Create a new column in onedirection_fan_data called concert_goer that evaluates if a participant has gone to 3 or more concerts.
# Logical flag: TRUE when the participant has attended 3 or more concerts.
# The comparison already yields TRUE/FALSE, so wrapping it in ifelse() to
# produce the character strings "TRUE"/"FALSE" is unnecessary and would make
# the column unusable as a logical (e.g. in mean() or as a row filter).
onedirection_fan_data$concert_goer <-
  onedirection_fan_data$concerts_attended >= 3
table(onedirection_fan_data$concert_goer)
FALSE TRUE
54 46
Mini-hack 2:
Use tapply() to find the mean number of concerts attended for each favorite member.
tapply(onedirection_fan_data$concerts_attended, onedirection_fan_data$favorite_member, mean) Harry Liam Louis Niall Zayn
3.333333 2.391304 2.571429 4.214286 3.250000
Mini-hack 3:
Subset the data frame to include only fans aged 18–22. How many participants is that?
gen_z <- onedirection_fan_data[onedirection_fan_data$age >= 18 & onedirection_fan_data$age <= 22, ]
head(gen_z) id age favorite_member years_fan stan_level concerts_attended merch_spending
4 4 22 Niall 7 hardcore 9 256
9 9 22 Zayn 7 hardcore 4 226
11 11 22 Zayn 11 hardcore 5 298
12 12 22 Zayn 5 hardcore 6 256
26 26 22 Zayn 7 moderate 3 142
34 34 22 Liam 8 moderate 5 154
still_follows_solo spender_type concert_goer
4 TRUE big spender TRUE
9 TRUE big spender TRUE
11 TRUE big spender TRUE
12 TRUE big spender TRUE
26 FALSE small spender TRUE
34 TRUE big spender TRUE
nrow(gen_z)[1] 22
Mini-hack 4:
Use a for loop to print the proportion of fans who still follow solo careers for each stan level.
for (level in levels(onedirection_fan_data$stan_level)) {
level_subset <- onedirection_fan_data[onedirection_fan_data$stan_level == level, ]
prop <- round(mean(level_subset$still_follows_solo, na.rm = TRUE), 2)
cat(level, "fans:", prop * 100, "% still follow solo careers\n")
}casual fans: 24 % still follow solo careers
moderate fans: 64 % still follow solo careers
hardcore fans: 96 % still follow solo careers