Basics of R

The R-Code provided below is the brief introduction I provide to students and others scholars unfamiliar with R and Rstudio. I thought this would best serve the broader community if it was publicly available (Code: R-Code; Data: .csv, .dta).

# Welcome to the basic of R written by Ian A. Silver 
# If you don't have R, please download it now at: www.r-project.org.
# If you don't have rstudio, please download it now at: www.rstudio.com.
# The datasets for this seminar should have been made available to you.


#___________________#
# Basics of R syntax: 
#___________________#
# Saving stuff to the environment:

# "<-" allows you to save information to the environment >^. 
# An "=" might work as well, but that gets confusing with more complex code

## Lets test it out:

x<-12
y<-43

## Basic Math:

xy<-x*y

## typing in the name of an item in the environment will make the value appear in the console below:

xy

# We can create vectors using a "c()" as well

z<- c(1,2,3,4,5,6,7,8,9,10)

z

## or we can simply state 

z <-c(1:10)

z

# We can create a matrix using the matrix function as well

a<-matrix( 
   c(1:9),  # the data elements 
   nrow=3,               # number of rows 
   ncol=3,               # number of columns 
   byrow = TRUE)  

a
## Creating a second matrix 

b<-matrix( 
   c(9:1),  # the data elements 
   nrow=3,               # number of rows 
   ncol=3,               # number of columns 
   byrow = TRUE) 


b

## Matrix Arithmetics 

ab1<- a+b # Element wise addition (Basic addition)
ab2<- a-b # Element wise subtraction (Basic subtraction)
ab3<- a*b # Element wise multiplication (Basic Mulitplication)
aa1<- a %*% a # matrix multiplication (Basic Mulitplication)
ab4<- a %*% b # matrix multiplication (Basic Mulitplication)


ab1
ab2
ab3
aa1
ab4

# Other matrix code 

a # a
t(a) # transpose of a, otherwise known as a'

crossprod(a) # a'*a inverse of a
crossprod(a,b) # a'*b inverse of a
crossprod(b,a) # b'*a inverse of a
solve(a)

a # a
rowMeans(a) #row means of a
rowSums(a)  #row sums of a
colMeans(a) #column means of a
colSums(a)  #column sums of a
diag(a)     #provides the primary diagonal values


# Printing Results
sink("") # MUST ADD DIRECTORY
print(rowMeans(a), digits = 2)
sink()

# Structuring syntax:
  # 4-# at the end of a line allows you to open or close a portion of your syntax. 





# Like This ####
## Open

## Every line underneath this break will close until the next break

## Underneath this heading, I like to start with ## instead of 1. 


# Getting Started ####
## I always start with a getting started break that has two sets of code.
## First, clearing queue:

rm(list = ls())

## Second, installing or loading the neccessary packages.
### Installing packages (Third Layer of Code)

install.packages("psych")
install.packages("pastecs")

setwd("") # MUST ADD DIRECTORY

### Loading packages

library(foreign) # If you dont have this package already, use install.packages to install the package to your copy of R
library(psych) # If you dont have this package already, use install.packages to install the package to your copy of R
library(pastecs) # If you dont have this package already, use install.packages to install the package to your copy of R
library(ggplot2) # If you dont have this package already, use install.packages to install the package to your copy of R

## Third, I always run my getting started options 

### R-studio has default settings that when it 
### will produce exponential numbers when a value is too large or small and
### and will start removing old printouts when the console becomes too long
### As such, we want to override the default options. The lines below allow us to do that

### "scipen" tells R-studio to not produce exponential numbers unless the number is 100 digits long
### "digits" tells R-studio  to only produce values up till 12 digits.

options(scipen=100, digits=12)

### "max.print" tells R-studio to produce up till i number of lines of results
options(max.print = 500000)

## This marks the end of the getting started sub-code 

# Loading data ####

## datasets from various sources can be loaded into R-studio, 
## but it is always important to check how the data is loaded into the environment >^

## The most common form, and most acceptable form is a CSV



### in specifying the location on a desktop you state "drive/user/location/filename.csv"
### in specifying the location on a mac you state "~/location/filename.csv"


ds1<-read.csv("/data.1.csv", sep = ",") # for csv

### The code above automatically makes a dataframe (which is required to conduct analyses)


### addtional read ins include read.dta()
### for each of these additional specifications might need to be made

stata.e<-read.dta("/data.1.dta") # for stata 

### The code above automatically makes a dataframe (which is often required to conduct analyses)





# Describing a dataframe (Descriptive Statistics) ####
## Using variables from a data frame is quite easy. 
## We will focus on "ds1"

## First, lets see the variables contained in ds1.

names(ds1)

## Focusing on x1

### Let's see all the scores on x1

table(ds1$x1)

### Lets visualize the scores ()

ggplot()+ # Specifying to use ggplot
  geom_histogram(aes(ds1$x1), binwidth = .1) # Specifying for the plot to be a histogram

ggplot()+ # All of the code below makes the plot look prettier. ggplot2 has unlimited options and requires a lot of google searches 
  geom_histogram(aes(ds1$x1), fill=1, alpha = .5, binwidth = .1)+ 
  scale_y_continuous(limits = c(0,70),breaks=seq(0,70, by = 15))+scale_x_continuous(limits = c(-4,4),breaks=seq(-4,4, by = 1))+ 
  ylab("frequency")+ xlab("x1")+
  theme(axis.line = element_line(colour = "black"), panel.border= element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), panel.background = element_blank(),axis.text=element_text(size=20), text=element_text(size=20))


### Lets find the mean of x1

mean(ds1$x1)

### Lets produce all the descriptive statistics for x1

stat.desc(ds1$x1) # produces nbr.val, nbr.null, nbr.na, min max, range, sum, 
                  # median, mean, SE.mean, CI.mean, var, std.dev, coef.var
skew(ds1$x1)      # produces skewness
kurtosi(ds1$x1)   # produces kurtosis 

## What about the other variables?

### Let's find the mean of x2 and x9

colMeans(ds1[, c("x2","x9")]) # all value lables must be put in parenteses for this function

### Let's find the mean of x2 through x9

names(ds1) # looking at the names again

colMeans(ds1[, c(2:9)]) 

### Let's produce all the descriptive statistics for x2 through x9

stat.desc(ds1[, c(2:9)])  # produces nbr.val, nbr.null, nbr.na, min max, range, sum, 
                         # median, mean, SE.mean, CI.mean, var, std.dev, coef.var
skew(ds1[, c(2:9)])      # produces skewness
kurtosi(ds1[, c(2:9)])   # produces kurtosis 


# Bivariate differences and associations (t-tests, covariances, correlations) ####

## After exploring data descriptively we can 
## start to explore the bivariate associations between our variables 

## t.tests for mean differences 

t.test(ds1$x1,ds1$x2) # independent samples variance unequal 
t.test(ds1$x1,ds1$x2, paired = T) # paired samples t-test variance unequal 

### But what if the variances are equal? 

var.test(ds1$x1,ds1$x2) # test to see if the variances are infact equal.

t.test(ds1$x1,ds1$x2, var.equal = T) # independent samples variance equal 
t.test(ds1$x1,ds1$x2, paired = T, var.equal = T) # paired samples t-test variance equal 
 
## Anova

anova.test<-aov(ds1$x1~ds1$group) # Simple Anova without any posthoc tests

### notice the results don't make much sense?

summary(anova.test) # This function requires the summary 
                    # function to produce the proper output.

### Post-hoc tests

#### pairwise t.tests 

pairwise.t.test(ds1$x1, ds1$group, p.adjust="none", pool.sd = T) # no p-value adjustments were made
pairwise.t.test(ds1$x1, ds1$group, p.adjust="bonferroni", pool.sd = T) # Bonferroni p-value adjustments were made
pairwise.t.test(ds1$x1, ds1$group, p.adjust="holm", pool.sd = T) # Holm p-value adjustments were made
pairwise.t.test(ds1$x1, ds1$group, p.adjust="BH", pool.sd = T) # Benjamini & Hochberg p-value adjustments were made
pairwise.t.test(ds1$x1, ds1$group, p.adjust="BY", pool.sd = T) # no p-value adjustments were made

#### Tukey posthoc test
TukeyHSD(anova.test, conf.level=.95) 



## Covariance Matrix 

CovT<-cov(ds1, use = "pairwise.complete.obs")

## Correlation Matrix

### For continuous constructs. 

CorT<-corr.test(ds1, use = "pairwise.complete.obs", method = c("pearson", "kendall", "spearman")) # By Specifying multiple methods we allow the computer to specify the correlation estimation technique

print(CorT, digits = 5) # printing results with 5 digits

### For continuous, ordinal, and dichotomous constructs. 

MCorT<-mixedCor(ds1, use = "pairwise.complete.obs", 
                         method = c("pearson", "kendall", "spearman")) # For all levels of measurement

#### "mixedCor" function automatically detects level of measurement 
#### or can have the user specify the level of measurement.

MCorP<-corr.p(MCorT$rho, n=1000) # p-values lower than pearson values

print(MCorT, digits = 5) # printing results with 5 digits
print(MCorP, digits = 5) # printing results with 5 digits

# Basic Regression Models (OLS, Generalized linear model (Binary logistic Regression)) ####

## OLS regression

### "lm" function is used for linear OLS modeling, This technique uses normal standard errors

Reg.1<-lm(y1~x2+x3, data = ds1, na.action = na.omit) # data specifies the data
                                                     # na.action specifies what to do with missing cases

### to view the results of Reg.1 we would use the summary command

Reg.1
summary(Reg.1) # summary should provide (Estimate; Std.Error; t-value; p-value; 
               # residuals; r-squared; adjusted r-squared; r-squared p-value; Degrees of freedom)


#### to test for multicolinearity we must use the olsrr package

library(olsrr) # This is the best diagnostics package for R offered when running linear models

ols_coll_diag(Reg.1) # ols coll diagnostics provides us with all of the information we need to 
                     # identify collinearity within our model 

### Additional commands must be specified to estimate the results with robust standard errors
### Robust standard errors adjust for heteroscedasticity in the data; most commonly used for clustering.


library(lmtest)
library(sandwich)

coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC0")) 
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC1")) # Type desciption can be found on pg.23 & 28  of sandwich manual ("HC1") should match stata output
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC2"))
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC3")) # Type desciption can be found on pg.23 & 28 of sandwich manual ("HC3") is the default for "vcovHC"
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC4"))
coeftest(Reg.1, vcov = vcovHC(Reg.1, type="HC5")) # very minor differences exist between the style of robust standard error can be observed

### Additional commands must be specified to estimate the standardized coefficients 

library(lm.beta) 

Reg.1.b<-lm.beta(Reg.1) # "lm.beta" is pulled from the lm.beta package. 
                           # This does not have additional specifications.
                           # "lm.beta" can be used in combination with the robust standard errors code ^

summary(Reg.1.b) # When summarized the standardized coefficients are added to the normal output  

### Now lets produce our confidence intervals for all of the analyses

confint(Reg.1, level = .95) # "confint" allows us to produce the confident intervals for the unstandardized point estimates
                            #  by specifying "Reg.1" confint produces confidence intervals for the unstandardized point estimates


confint(Reg.1.b, level = .95)  # "confint" allows us to produce the confident intervals for the standardized point estimates
                               # by specifying "Reg.1.b" confint produces confidence intervals for the standardized point estimates

coefci(Reg.1, level = 0.95, vcov = vcovHC(Reg.1, type="HC1")) # "coefci" allows us to produce the confident intervals for the point estimates with robust standard errors
                                                              # Note that "Reg.1" is specified again, and the "vcov = vcovHC(Reg.1, type="HC1")" specification is made again


### Lets plot the association for the regression line between Y1 and X2! I will 


#### Graphing just the line using the linear OLS regression model 

ggplot(ds1, aes(x=y1, y=x2)) + 
  geom_smooth(method = lm, se = T, fullrange=F, level=.95, 
              linetype = 1, color = "black", fill = "firebrick1")

#### Graphing the line and data points using the linear OLS regression model 
ggplot(ds1, aes(x=y1, y=x2))+ 
  geom_point(size= 2, shape = 18, color = "gray70", alpha = .4) + 
  geom_smooth(method = lm, se = T, fullrange=F, level=.95, 
              linetype = 1, color = "black", fill = "firebrick1")

#### Graphing the line and data points using the linear OLS regression model and making it look good

ggplot(ds1, aes(x=y1, y=x2))+ 
  geom_point(size= 2, shape = 18, color = "gray70", alpha = .4) + 
  geom_smooth(method = lm, se = T, fullrange=F, level=.95, 
              linetype = 1, color = "black", fill = "firebrick1") +
  ylab("")+ylim(-4,4)+xlim(-4, 4)+
  theme(axis.line = element_line(colour = "black"),panel.border= element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),axis.title.x=element_blank(), panel.background = element_blank(), 
        axis.text=element_text(size=20), text=element_text(size=20),plot.margin=unit(c(1,1,1,1),"cm"))

### Generalized Linear Model: binary logistic regression
#### The code for a binary logistic regression is very similar to the lm specification
#### accept for the added inclusion of the "family" function
#### the "family" function allows for the specification of numerous Generalized Linear Models, except negative binomial
#### see the specifications at https://www.rdocumentation.org/packages/stats/versions/3.5.3/topics/family

Reg.2<-glm(yd1~x2+x3, data = ds1, na.action = na.omit, family=binomial(link=logit)) # Family allows us to specify binary data (i.e., binomial)
                                                                                    # With a logit function (i.e., link=logit)

#### glm code provides less information than the lm code, due to the variety of models it can encompass
#### summary should provide the Estimate; Std.Error; t-value; p-value; residuals; Degrees of freedom

summary(Reg.2) # to produce detailed results the summary function must be used


#### to produce the 95 percent confidence intervals for the point estimates
#### the "confint" function is used again. "confint" works with both lm and glm specifications 

confint(Reg.2, level = .95)

#### to produce odds ratios and 95 percent ci the following code must be used

exp(cbind("odds ratio" = coef(Reg.2), confint(Reg.2, level = .95))) # provides both the odds ratios and the odds ratios 95 percent CI

#### to produce the nagelkere R-squared and p-value the rcompanion package must be installed

install.packages("rcompanion")
library(rcompanion)

nagelkerke(Reg.2)

#### Graphing just the line using the linear binary logistic regression model 

ggplot(ds1, aes(x=yd1, y=x2)) + 
  geom_smooth(method = "glm", se = T, fullrange=F, level=.95, family = binomial(link = "logit"), # geom_smooth is how we specify the regression technique in ggplot
              linetype = 1, color = "black", fill = "firebrick1")

#### Graphing the line and data points using the linear OLS regression model 
ggplot(ds1, aes(x=yd1, y=x2))+ 
  geom_point(size= 2, shape = 18, color = "gray70", alpha = .4) + 
  geom_smooth(method = glm, se = T, fullrange=F, level=.95, family = binomial(link = "logit"),
              linetype = 1, color = "black", fill = "firebrick1")

#### Graphing the line and data points using the linear OLS regression model and making it look good

ggplot(ds1, aes(x=yd1, y=x2))+ 
  geom_point(size= 2, shape = 18, color = "gray70", alpha = .4) + 
  geom_smooth(method = lm, se = T, fullrange=F, level=.95, family = binomial(link = "logit"),
              linetype = 1, color = "black", fill = "firebrick1") +
  ylab("")+ylim(-4,4)+xlim(0, 1)+
  theme(axis.line = element_line(colour = "black"),panel.border= element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),axis.title.x=element_blank(), panel.background = element_blank(), 
        axis.text=element_text(size=20), text=element_text(size=20),plot.margin=unit(c(1,1,1,1),"cm"))



# You have now learned the basics of r up until regression analysis ####

License: Creative Commons Attribution 4.0 International (CC By 4.0)