The R code provided below is the brief introduction I give to students and other scholars unfamiliar with R and RStudio. I thought it would best serve the broader community to make it publicly available (Code: R-Code; Data: .csv, .dta).
# Welcome to the basics of R, written by Ian A. Silver
# If you don't have R, please download it now at: www.r-project.org.
# If you don't have RStudio, please download it now at: www.rstudio.com.
# The datasets for this seminar should have been made available to you.
#___________________#
# Basics of R syntax:
#___________________#
# Saving stuff to the environment:
# "<-" allows you to save information to the environment (the pane in the upper right of RStudio).
# An "=" often works as well, but that gets confusing with more complex code
## Let's test it out:
x<-12
y<-43
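## As a quick aside, "=" also assigns at the top level, though "<-" is the
## convention in R (w is just an arbitrary example name):
w = 7
w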
## Basic Math:
xy<-x*y
## typing the name of an item in the environment will print its value in the console below:
xy
# We can create vectors using the c() function as well
z<- c(1,2,3,4,5,6,7,8,9,10)
z
## or we can simply state
z <- 1:10 # the colon operator builds the sequence on its own; wrapping it in c() is redundant
z
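## As an aside, seq() generalizes the colon operator when a step size is needed:
z2 <- seq(0, 20, by = 2) # even numbers from 0 to 20
z2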
# We can create a matrix using the matrix function as well
a<-matrix(
c(1:9), # the data elements
nrow=3, # number of rows
ncol=3, # number of columns
byrow = TRUE)
a
## Creating a second matrix
b<-matrix(
c(9:1), # the data elements
nrow=3, # number of rows
ncol=3, # number of columns
byrow = TRUE)
b
## Matrix Arithmetics
ab1<- a+b # Element-wise addition (Basic addition)
ab2<- a-b # Element-wise subtraction (Basic subtraction)
ab3<- a*b # Element-wise multiplication (Basic multiplication)
aa1<- a %*% a # Matrix multiplication (a times a)
ab4<- a %*% b # Matrix multiplication (a times b)
ab1
ab2
ab3
aa1
ab4
# Other matrix code
a # a
t(a) # transpose of a, otherwise known as a'
crossprod(a) # a'*a, equivalent to t(a) %*% a
crossprod(a,b) # a'*b, equivalent to t(a) %*% b
crossprod(b,a) # b'*a, equivalent to t(b) %*% a
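## A quick sanity check that crossprod() matches the explicit transpose-multiply:
all.equal(crossprod(a), t(a) %*% a) # should return TRUE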
## solve() returns the inverse of a square matrix; note that this particular
## a is singular (row 3 = 2*row 2 - row 1), so solve(a) throws an error
solve(matrix(c(2, 1, 1, 3), nrow = 2)) # inverse of an invertible 2x2 matrix
a # a
rowMeans(a) #row means of a
rowSums(a) #row sums of a
colMeans(a) #column means of a
colSums(a) #column sums of a
diag(a) #provides the primary diagonal values
# Printing Results
sink("") # MUST ADD FILE PATH; sink() diverts printed output to the specified file
print(rowMeans(a), digits = 2)
sink() # returns printed output to the console
# Structuring syntax:
# Four # symbols (####) at the end of a line allow you to open or close (fold) a portion of your syntax in RStudio.
# Like This ####
## Open
## Every line beneath this break folds away until the next break
## Underneath a heading, I like to start comments with ## instead of a single #
# Getting Started ####
## I always start with a getting started break that has two sets of code.
## First, clearing the environment:
rm(list = ls())
## Second, installing or loading the necessary packages.
### Installing packages (Third Layer of Code)
install.packages("psych")
install.packages("pastecs")
setwd("") # MUST ADD DIRECTORY
### Loading packages
library(foreign) # If you don't have this package already, use install.packages() to install it to your copy of R
library(psych) # If you don't have this package already, use install.packages() to install it to your copy of R
library(pastecs) # If you don't have this package already, use install.packages() to install it to your copy of R
library(ggplot2) # If you don't have this package already, use install.packages() to install it to your copy of R
## Third, I always run my getting started options
### RStudio has default settings under which it
### will print exponential (scientific) notation when a value is very large or very small,
### and will start removing old printouts when the console output becomes too long.
### As such, we want to override the default options. The lines below allow us to do that
### "scipen" penalizes scientific notation, so R avoids it unless a number would otherwise be about 100 digits long
### "digits" tells R to print values to 12 significant digits
options(scipen=100, digits=12)
### "max.print" tells RStudio to print up to the specified number of entries (here, 500,000)
options(max.print = 500000)
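### A quick illustration of what these options do (the values are arbitrary):
0.000000123 # printed in full rather than as 1.23e-07, thanks to scipen
1/3 # printed to 12 significant digits, thanks to digits = 12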
## This marks the end of the getting started sub-code
# Loading data ####
## Datasets from various sources can be loaded into RStudio,
## but it is always important to check how the data are loaded into the environment (upper right pane)
## The most common, and most widely accepted, format is a CSV
### when specifying a location on a Windows machine, you state "drive:/user/location/filename.csv"
### when specifying a location on a Mac, you state "~/location/filename.csv"
ds1<-read.csv("/data.1.csv", sep = ",") # for csv
### The code above automatically makes a dataframe (which is required to conduct analyses)
### additional read-in functions include read.dta() from the foreign package
### for each of these, additional arguments might need to be specified
stata.e<-read.dta("/data.1.dta") # for stata
### The code above automatically makes a dataframe (which is often required to conduct analyses)
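### Note: foreign's read.dta() only reads Stata files up to version 12.
### For newer .dta files, the haven package is one alternative (a sketch,
### assuming haven is installed; the path is the same placeholder as above):
# library(haven)
# stata.h<-read_dta("/data.1.dta")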
# Describing a dataframe (Descriptive Statistics) ####
## Using variables from a data frame is quite easy.
## We will focus on "ds1"
## First, let's see the variables contained in ds1.
names(ds1)
## Focusing on x1
### Let's see all the scores on x1
table(ds1$x1)
### Let's visualize the scores
ggplot()+ # Specifying to use ggplot
geom_histogram(aes(ds1$x1), binwidth = .1) # Specifying for the plot to be a histogram
ggplot()+ # All of the code below makes the plot look prettier. ggplot2 has nearly unlimited options and requires a lot of Google searches
geom_histogram(aes(ds1$x1), fill=1, alpha = .5, binwidth = .1)+
scale_y_continuous(limits = c(0,70),breaks=seq(0,70, by = 15))+scale_x_continuous(limits = c(-4,4),breaks=seq(-4,4, by = 1))+
ylab("frequency")+ xlab("x1")+
theme(axis.line = element_line(colour = "black"), panel.border= element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), panel.background = element_blank(),axis.text=element_text(size=20), text=element_text(size=20))
### Let's find the mean of x1
mean(ds1$x1)
### Let's produce all the descriptive statistics for x1
stat.desc(ds1$x1) # produces nbr.val, nbr.null, nbr.na, min, max, range, sum,
# median, mean, SE.mean, CI.mean, var, std.dev, coef.var
skew(ds1$x1) # produces skewness
kurtosi(ds1$x1) # produces kurtosis
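### psych's describe() bundles most of these statistics (n, mean, sd, median,
### skew, kurtosis, se, and more) into a single call:
describe(ds1$x1)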
## What about the other variables?
### Let's find the mean of x2 and x9
colMeans(ds1[, c("x2","x9")]) # variable names must be quoted and wrapped in c() for this function
### Let's find the mean of x2 through x9
names(ds1) # looking at the names again
colMeans(ds1[, c(2:9)])
### Let's produce all the descriptive statistics for x2 through x9
stat.desc(ds1[, c(2:9)]) # produces nbr.val, nbr.null, nbr.na, min, max, range, sum,
# median, mean, SE.mean, CI.mean, var, std.dev, coef.var
skew(ds1[, c(2:9)]) # produces skewness
kurtosi(ds1[, c(2:9)]) # produces kurtosis
# Bivariate differences and associations (t-tests, covariances, correlations) ####
## After exploring data descriptively we can
## start to explore the bivariate associations between our variables
## t-tests for mean differences
t.test(ds1$x1,ds1$x2) # independent samples, variances unequal (Welch's t-test, R's default)
t.test(ds1$x1,ds1$x2, paired = T) # paired samples t-test
### But what if the variances are equal?
var.test(ds1$x1,ds1$x2) # tests whether the variances are in fact equal
t.test(ds1$x1,ds1$x2, var.equal = T) # independent samples, variances equal (Student's t-test)
t.test(ds1$x1,ds1$x2, paired = T, var.equal = T) # paired samples t-test (var.equal is ignored when paired = T)
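### For completeness, a one-sample t-test against a hypothesized mean
### (mu = 0 here is an arbitrary choice):
t.test(ds1$x1, mu = 0)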
## Anova
anova.test<-aov(ds1$x1~ds1$group) # Simple ANOVA without any post-hoc tests
### notice that printing anova.test by itself doesn't tell us much?
summary(anova.test) # this function requires summary()
# to produce the proper output (the full ANOVA table)
### Post-hoc tests
#### pairwise t.tests
pairwise.t.test(ds1$x1, ds1$group, p.adjust="none", pool.sd = T) # no p-value adjustments were made
pairwise.t.test(ds1$x1, ds1$group, p.adjust="bonferroni", pool.sd = T) # Bonferroni p-value adjustments were made
pairwise.t.test(ds1$x1, ds1$group, p.adjust="holm", pool.sd = T) # Holm p-value adjustments were made
pairwise.t.test(ds1$x1, ds1$group, p.adjust="BH", pool.sd = T) # Benjamini & Hochberg p-value adjustments were made
pairwise.t.test(ds1$x1, ds1$group, p.adjust="BY", pool.sd = T) # Benjamini & Yekutieli p-value adjustments were made
#### Tukey posthoc test
TukeyHSD(anova.test, conf.level=.95)
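#### The Tukey confidence intervals can also be plotted with base-R graphics:
plot(TukeyHSD(anova.test, conf.level=.95))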
## Covariance Matrix
CovT<-cov(ds1, use = "pairwise.complete.obs")
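### As a check, cov2cor() converts a covariance matrix into a correlation matrix:
cov2cor(CovT)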
## Correlation Matrix
### For continuous constructs.
CorT<-corr.test(ds1, use = "pairwise.complete.obs", method = c("pearson", "kendall", "spearman")) # Note: the method argument takes a single method; when the full vector is supplied, only the first element ("pearson") is used
print(CorT, digits = 5) # printing results with 5 digits
### For continuous, ordinal, and dichotomous constructs.
MCorT<-mixedCor(ds1, use = "pairwise.complete.obs",
method = c("pearson", "kendall", "spearman")) # For all levels of measurement
#### "mixedCor" function automatically detects level of measurement
#### or can have the user specify the level of measurement.
MCorP<-corr.p(MCorT$rho, n=1000) # corr.p() computes p-values and confidence intervals for the correlations, here assuming n = 1000 observations
print(MCorT, digits = 5) # printing results with 5 digits
print(MCorP, digits = 5) # printing results with 5 digits
# Basic Regression Models (OLS, Generalized linear model (Binary logistic Regression)) ####
## OLS regression
### the "lm" function is used for linear OLS modeling. This technique uses classical (non-robust) standard errors
Reg.1<-lm(y1~x2+x3, data = ds1, na.action = na.omit) # data specifies the data
# na.action specifies what to do with missing cases
### to view the results of Reg.1 we would use the summary command
Reg.1
summary(Reg.1) # summary should provide (Estimate; Std.Error; t-value; p-value;
# residuals; r-squared; adjusted r-squared; r-squared p-value; Degrees of freedom)
#### to test for multicollinearity we can use the olsrr package
library(olsrr) # a comprehensive diagnostics package for linear models in R
ols_coll_diag(Reg.1) # ols_coll_diag() provides us with all of the information we need to
# identify collinearity within our model
### Additional commands must be specified to estimate the results with robust standard errors
### Robust standard errors adjust for heteroskedasticity in the data; a cluster-robust variant (see below) is commonly used for clustered data.
library(lmtest)
library(sandwich)
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC0"))
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC1")) # Type descriptions can be found on pg. 23 & 28 of the sandwich manual; "HC1" should match Stata output
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC2"))
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC3")) # "HC3" is the default for vcovHC()
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC4"))
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC5")) # only very minor differences can be observed between the styles of robust standard error
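### Since clustering was mentioned above: if observations are clustered (here
### hypothetically by ds1$group), sandwich's vcovCL() gives cluster-robust
### standard errors (a sketch; with missing data, the cluster variable must
### match the cases retained by the model):
coeftest(Reg.1, vcov = vcovCL(Reg.1, cluster = ds1$group))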
### Additional commands must be specified to estimate the standardized coefficients
library(lm.beta)
Reg.1.b<-lm.beta(Reg.1) # "lm.beta" is pulled from the lm.beta package.
# This does not have additional specifications.
# "lm.beta" can be used in combination with the robust standard errors code ^
summary(Reg.1.b) # When summarized the standardized coefficients are added to the normal output
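### An equivalent check: standardizing the variables by hand with scale()
### and refitting should reproduce the standardized slopes:
Reg.1.z<-lm(scale(y1) ~ scale(x2) + scale(x3), data = ds1, na.action = na.omit)
summary(Reg.1.z)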
### Now lets produce our confidence intervals for all of the analyses
confint(Reg.1, level = .95) # "confint" allows us to produce the confidence intervals for the point estimates
# by specifying "Reg.1" confint produces confidence intervals for the unstandardized point estimates
confint(Reg.1.b, level = .95)
# by specifying "Reg.1.b" confint produces confidence intervals for the standardized point estimates
coefci(Reg.1, level = 0.95, vcov = vcovHC(Reg.1, type="HC1")) # "coefci" allows us to produce the confidence intervals for the point estimates with robust standard errors
# Note that "Reg.1" is specified again, and the "vcov = vcovHC(Reg.1, type="HC1")" specification is made again
### Let's plot the regression line between y1 and x2!
#### Graphing just the line using the linear OLS regression model
ggplot(ds1, aes(x=x2, y=y1)) + # the predictor (x2) goes on the x-axis and the outcome (y1) on the y-axis, matching Reg.1
geom_smooth(method = lm, se = T, fullrange=F, level=.95,
linetype = 1, color = "black", fill = "firebrick1")
#### Graphing the line and data points using the linear OLS regression model
ggplot(ds1, aes(x=x2, y=y1))+
geom_point(size= 2, shape = 18, color = "gray70", alpha = .4) +
geom_smooth(method = lm, se = T, fullrange=F, level=.95,
linetype = 1, color = "black", fill = "firebrick1")
#### Graphing the line and data points using the linear OLS regression model and making it look good
ggplot(ds1, aes(x=x2, y=y1))+
geom_point(size= 2, shape = 18, color = "gray70", alpha = .4) +
geom_smooth(method = lm, se = T, fullrange=F, level=.95,
linetype = 1, color = "black", fill = "firebrick1") +
ylab("")+ylim(-4,4)+xlim(-4, 4)+
theme(axis.line = element_line(colour = "black"),panel.border= element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),axis.title.x=element_blank(), panel.background = element_blank(),
axis.text=element_text(size=20), text=element_text(size=20),plot.margin=unit(c(1,1,1,1),"cm"))
### Generalized Linear Model: binary logistic regression
#### The code for a binary logistic regression is very similar to the lm specification,
#### except for the added "family" argument
#### the "family" argument allows for the specification of numerous generalized linear models, except negative binomial (for that, see glm.nb in the MASS package)
#### see the specifications at https://www.rdocumentation.org/packages/stats/versions/3.5.3/topics/family
Reg.2<-glm(yd1~x2+x3, data = ds1, na.action = na.omit, family=binomial(link=logit)) # Family allows us to specify binary data (i.e., binomial)
# With a logit function (i.e., link=logit)
#### glm code provides less information than the lm code, due to the variety of models it can encompass
#### summary should provide the Estimate; Std. Error; z-value (for the binomial family); p-value; residuals; degrees of freedom
summary(Reg.2) # to produce detailed results the summary function must be used
#### to produce the 95 percent confidence intervals for the point estimates
#### the "confint" function is used again. "confint" works with both lm and glm specifications
confint(Reg.2, level = .95)
#### to produce odds ratios and 95 percent ci the following code must be used
exp(cbind("odds ratio" = coef(Reg.2), confint(Reg.2, level = .95))) # provides both the odds ratios and the odds ratios 95 percent CI
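#### Predicted probabilities for each observation can be obtained with predict();
#### type = "response" returns probabilities rather than log-odds:
head(predict(Reg.2, type = "response"))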
#### to produce the Nagelkerke R-squared and p-value the rcompanion package must be installed
install.packages("rcompanion")
library(rcompanion)
nagelkerke(Reg.2)
#### Graphing just the line using the binary logistic regression model
ggplot(ds1, aes(x=x2, y=yd1)) + # the binary outcome (yd1) must be on the y-axis for a binomial fit
geom_smooth(method = "glm", se = T, fullrange=F, level=.95, method.args = list(family = binomial(link = "logit")), # geom_smooth is how we specify the regression technique in ggplot; current ggplot2 passes the family through method.args
linetype = 1, color = "black", fill = "firebrick1")
#### Graphing the line and data points using the binary logistic regression model
ggplot(ds1, aes(x=x2, y=yd1))+
geom_point(size= 2, shape = 18, color = "gray70", alpha = .4) +
geom_smooth(method = "glm", se = T, fullrange=F, level=.95, method.args = list(family = binomial(link = "logit")),
linetype = 1, color = "black", fill = "firebrick1")
#### Graphing the line and data points using the binary logistic regression model and making it look good
ggplot(ds1, aes(x=x2, y=yd1))+
geom_point(size= 2, shape = 18, color = "gray70", alpha = .4) +
geom_smooth(method = "glm", se = T, fullrange=F, level=.95, method.args = list(family = binomial(link = "logit")),
linetype = 1, color = "black", fill = "firebrick1") +
ylab("")+ylim(0,1)+xlim(-4, 4)+
theme(axis.line = element_line(colour = "black"),panel.border= element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),axis.title.x=element_blank(), panel.background = element_blank(),
axis.text=element_text(size=20), text=element_text(size=20),plot.margin=unit(c(1,1,1,1),"cm"))
# You have now learned the basics of R, up through regression analysis ####
License: Creative Commons Attribution 4.0 International (CC BY 4.0)