# Intro to R for Data Analysis
# by Dr. Martin Calvino
# March 24, 2022

# WHY R for data analysis?
# is free / extensive data analysis and graphic capabilities /
# can import data from many sources / integration with Python

# a VARIABLE is a container in which a value may be stored
# a value is assigned to a variable using the assignment operator ' <- '
# 'Option' plus '-' keys in Mac // 'Alt' plus '+' or '-' on Windows
# to type the assignment operator

# Bank of America loan data in NJ for 2020 as 'boa'
# file.choose() opens an interactive file picker; its result is the path
# that read.csv() loads into a data frame
boa <- read.csv(file.choose()) # load dataset
?read.csv # inspect description of function
?file.choose # functions are reusable pieces of code
# preview dataset: columns are called VARIABLES & rows are called OBSERVATIONS
View(boa)
str(boa) # inspect the structure of dataset
is.data.frame(boa) # we have saved dataset as data frame in R

# DATA STRUCTURES in R

# VECTORS > one dimensional array of numeric data / character data /
# logical data
# data in a vector must be of one type only
# the combine function c() is used to form a vector

# loan amount as 'la'
# select loan_amount variable from dataset and store it in the vector 'la'
la <- boa$loan_amount
la[1:10] # inspect the first 10 values stored in vector 'la'
la[10] # a single position: the 10th value
la[c(3, 9)] # several positions at once: the 3rd and 9th values
la.1 <- c(55000, 105000, 505000) # vector built by hand with c()
la.2 <- la[1:3] # subsetting already returns a vector, no c() needed
la.1
la.2

# your turn!
# select variable from 'boa' that contains character data and save it into
# a vector

# DATA FRAMES > two dimensional arrays in which different variables (columns)
# can contain different modes of data (numeric, character, and so on)
# a data frame can be created with the data.frame() function

# create reduced 'boa' dataset as the data frame 'r.boa'
# save variables of choice from 'boa' as vectors, then combine vectors to
# create 'r.boa'
pv <- boa$property_value # pv is a vector holding property_value from boa
# inc is a vector holding the variable income from boa; we multiply income
# by 1000 to have the same scale as property values
inc <- boa$income * 1000
se <- boa$derived_sex # se is a vector holding derived_sex from boa
et <- boa$derived_ethnicity # et is a vector holding derived_ethnicity from boa
ra <- boa$derived_race # ra is a vector holding derived_race from boa

# construct data frame with vectors as columns
# ('la' is the loan amount vector created earlier from boa$loan_amount)
r.boa <- data.frame(la, pv, inc, se, et, ra)
View(r.boa)
str(r.boa)

# how to access elements of a data frame
r.boa[1:3] # access first three variables (columns)
r.boa[c("pv", "ra")] # access second and last variables (columns)
# access the first 10 values from the variable (column) sex in r.boa
r.boa$se[1:10]
head(r.boa, n = 10) # first 10 observations (rows)
tail(r.boa, n = 25) # last 25 observations (rows)

# access quick summary of a data frame
summary(r.boa)

# your turn!
# select 4 variables of interest from 'boa' dataframe and store each of them
# into a vector
# create your own data frame by combining these vectors as variables (columns)
# using data.frame() function
# inspect the summary of your newly created data frame using summary() function

# FACTORS > nominal, ordinal or continuous variables
# Nominal Variables are categorical without implied order
#   (sex, ethnicity, race variables in r.boa)
# Ordinal Variables are categorical implying order but not amount
#   (loan_purpose and action_taken variables in boa)
# Continuous Variables can take any value within a range
#   (median_age_of_housing_units variable in boa)
# Categorical-nominal and categorical-ordinal variables are called FACTORS in R
# the function factor() stores categorical variables as vectors
sex <- factor(boa$derived_sex)
levels(sex)
ethnicity <- factor(boa$derived_ethnicity)
levels(ethnicity)

# your turn! create a vector holding categorical values for the variable ra
# (race) from r.boa dataframe
# use the factor() function

# MISSING VALUES (NAs)
# inspecting r.boa we realize that we need to recode
# the "... Not Available" labels as NA.
# %in% is used for the comparison because it always yields TRUE/FALSE,
# even for entries that are already NA.

# "Sex Not Available" in the variable 'se' as NA
r.boa$se[r.boa$se %in% "Sex Not Available"] <- NA
head(r.boa$se, n = 50)

# "Ethnicity Not Available" in the variable 'et' as NA
r.boa$et[r.boa$et %in% "Ethnicity Not Available"] <- NA
head(r.boa$et, n = 50)

# "Race Not Available" in the variable 'ra' as NA
r.boa$ra[r.boa$ra %in% "Race Not Available"] <- NA
head(r.boa$ra, n = 50)

View(r.boa) # we did it!
# PACKAGES (libraries containing collections of functions)
# VIM is a package that visualizes missing values in your dataframe
# install package VIM (only when it is not installed yet, so re-running the
# script does not download it again)
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM", dependencies = TRUE)
}
library(VIM) # load the package to your working session
browseVignettes("VIM") # access documentation on VIM
aggr(r.boa, prop = TRUE, numbers = TRUE)
# all variables in data frame have missing values except 'la' (loan amount)
?aggr

# REMOVE missing values with na.omit() function
r.boa <- na.omit(r.boa) # remove all rows with missing values (NAs) from r.boa
?na.omit
aggr(r.boa, prop = TRUE, numbers = TRUE) # no missing values!
View(r.boa)

# identify and remove outliers
# (the psych package is installed and loaded here; the outlier detection
# below is done with boxplot())
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych", dependencies = TRUE)
}
library(psych)
browseVignettes("psych") # access documentation on psych
boxplot(r.boa[, 1:3])$out # identify outliers for numeric variables only

if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", dependencies = TRUE)
}
library(tidyverse)
browseVignettes("tidyverse") # access documentation on tidyverse

outliers.pv <- boxplot(r.boa$pv)$out # identify outliers for property value
# remove outliers: keep the rows whose property value is NOT an outlier.
# A logical mask is used instead of r.boa[-which(...), ] because which()
# returns integer(0) when nothing matches, and indexing with -integer(0)
# would drop every row of the data frame.
r.boa <- r.boa[!(r.boa$pv %in% outliers.pv), ]
# outliers gone from property value variable in r.boa dataframe
boxplot(r.boa[, 1:3])$out

outliers.inc <- boxplot(r.boa$inc)$out # identify outliers for income
r.boa <- r.boa[!(r.boa$inc %in% outliers.inc), ] # same masking as above
boxplot(r.boa[, 1:3])$out
# boxplots explained >
# https://towardsdatascience.com/understanding-boxplots-5e2df7bcbd51

# VISUALIZING DATA
# we are using ggplot2 package to create graphs for data visualization
# ggplot2 is part of the tidyverse package that we just installed
browseVignettes("ggplot2") # access documentation on ggplot2

# in ggplot2 you construct your graphs by adding layers
# these layers are based on the GRAMMAR OF GRAPHICS:
# DATA > layer1
# AESTHETICS > layer2
# GEOMETRIES > layer3
# STATISTICS > layer4
# FACETS > layer5
# COORDINATES > layer6
# THEME > layer7

# guiding questions that help visualizing
# data!
# what type of variation occurs within my variables in r.boa?
# what type of co-variation occurs between my variables?

# Visualize distribution > continuous variables > use HISTOGRAMS
summary(r.boa[, 1:3]) # inspect upper limits of continuous variables

# basic visualization
ggplot(data = r.boa) +
  geom_histogram(mapping = aes(x = la))

# granular visualization
# - binwidth groups loan amounts into 25000-wide bins
# - coord_cartesian() zooms the x axis without discarding observations
# - breaks are generated with seq(); each break is paired with the label
#   at the same position in the labels vector
ggplot(data = r.boa) +
  geom_histogram(mapping = aes(x = la), binwidth = 25000) +
  labs(
    title = "Loan Amount: applications Bank of America - 2020 - New Jersey",
    x = "Loan Amounts in Thousand of Dollars",
    y = "Number of Applications"
  ) +
  coord_cartesian(xlim = c(25000, 1115000)) +
  scale_x_continuous(
    breaks = seq(100000, 1200000, by = 100000),
    labels = c(
      "$100K", "$200K", "$300K", "$400K", "$500K", "$600K",
      "$700K", "$800K", "$900K", "$1M", "$1.1M", "$1.2M"
    )
  ) +
  scale_y_continuous(
    breaks = seq(250, 1750, by = 250),
    labels = c("250", "500", "750", "1,000", "1,250", "1,500", "1,750")
  )

# your turn!
# visualize the distribution pv (property_value) and inc (income) variables
# for r.boa data frame
# use the basic visualization approach

# Visualize distribution > categorical variables > use BAR CHARTS
# basic visualization
ggplot(data = r.boa) +
  geom_bar(mapping = aes(x = ra)) +
  coord_flip()

# granular visualization
# fill = ra colors each bar by race; the legend is hidden because the
# axis labels already name each category
ggplot(data = r.boa) +
  geom_bar(mapping = aes(x = ra, fill = ra)) +
  coord_flip() +
  theme(legend.position = "none") +
  labs(
    title = "Race of home loan applicants - Bank of America - 2020 - NJ",
    y = "Number of Applicants",
    x = ""
  )

# Visualize covariation (the behavior between variables)

# continuous - categorical (use boxplots)
# A plot can have only one coordinate system: adding coord_flip() after
# coord_cartesian() replaces it and discards its ylim. The zoom on property
# value is therefore passed to coord_flip() directly.
ggplot(data = r.boa) +
  geom_boxplot(mapping = aes(x = ra, y = pv)) +
  scale_y_continuous(
    breaks = c(200000, 400000, 600000, 800000, 1000000, 1200000),
    labels = c("$200K", "$400K", "$600K", "$800K", "$1M", "$1.2M")
  ) +
  coord_flip(ylim = c(5000, 1275000)) +
  labs(
    title = "Property Value in Loan Applications by Race",
    x = "Race of Applicant",
    y = "Property Value"
  )

# continuous - continuous (use scatterplots)
# geom_jitter() draws the points; geom_smooth(method = "lm") overlays a
# dashed red linear fit
ggplot(data = r.boa, aes(x = inc, y = pv, alpha = inc)) +
  geom_jitter() +
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  scale_x_continuous(
    breaks = c(50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000),
    labels = c(
      "$50K", "$100K", "$150K", "$200k",
      "$250K", "$300K", "$350K", "$400K"
    )
  ) +
  scale_y_continuous(
    breaks = c(200000, 400000, 600000, 800000, 1000000, 1200000),
    labels = c("$200K", "$400K", "$600K", "$800K", "$1M", "$1.2M")
  ) +
  labs(
    title = "Income relative to Property Value in Home Loan Applications",
    x = "Income",
    y = "Property Value"
  )

# categorical - categorical (use dotplots)
# geom_count() sizes each point by the number of observations at that
# sex/race combination
ggplot(data = r.boa) +
  geom_count(mapping = aes(x = se, y = ra)) +
  labs(
    title = "Sex of home loan applicants per race",
    x = "Sex",
    y = "Race"
  )