Getting started with R LanguageData framesReading and writing tabular data in plain-text files (CSV, TSV, etc.)Pipe operators (%>% and others)Linear Models (Regression)data.tableboxplotFormulaSplit functionCreating vectorsFactorsPattern Matching and ReplacementRun-length encodingDate and TimeSpeeding up tough-to-vectorize codeggplot2ListsIntroduction to Geographical MapsBase PlottingSet operationstidyverseRcppRandom Numbers GeneratorString manipulation with stringi packageParallel processingSubsettingDebuggingInstalling packagesArima ModelsDistribution FunctionsShinyspatial analysissqldfCode profilingControl flow structuresColumn wise operationJSONRODBClubridateTime Series and Forecastingstrsplit functionWeb scraping and parsingGeneralized linear modelsReshaping data between long and wide formsRMarkdown and knitr presentationScope of variablesPerforming a Permutation TestxgboostR code vectorization best practicesMissing valuesHierarchical Linear ModelingClassesIntrospection*apply family of functions (functionals)Text miningANOVARaster and Image AnalysisSurvival analysisFault-tolerant/resilient codeReproducible RUpdating R and the package libraryFourier Series and Transformations.RprofiledplyrcaretExtracting and Listing Files in Compressed ArchivesProbability Distributions with RR in LaTeX with knitrWeb Crawling in RArithmetic OperatorsCreating reports with RMarkdownGPU-accelerated computingheatmap and heatmap.2Network analysis with the igraph packageFunctional programmingGet user inputroxygen2HashmapsSpark API (SparkR)Meta: Documentation GuidelinesI/O for foreign tables (Excel, SAS, SPSS, Stata)I/O for database tablesI/O for geographic data (shapefiles, etc.)I/O for raster imagesI/O for R's binary formatReading and writing stringsInput and outputRecyclingExpression: parse + evalRegular Expressions (regex)CombinatoricsPivot and unpivot with data.tableInspecting packagesSolving ODEs in RFeature Selection in R -- Removing Extraneous FeaturesBibliography in RMDWriting 
functions in RColor schemes for graphicsHierarchical clustering with hclustRandom Forest AlgorithmBar ChartCleaning dataRESTful R ServicesMachine learningVariablesThe Date classThe logical classThe character classNumeric classes and storage modesMatricesDate-time classes (POSIXct and POSIXlt)Using texreg to export models in a paper-ready wayPublishingImplement State Machine Pattern using S4 ClassReshape using tidyrModifying strings by substitutionNon-standard evaluation and standard evaluationRandomizationObject-Oriented Programming in RRegular Expression Syntax in RCoercionStandardize analyses by writing standalone R scriptsAnalyze tweets with RNatural language processingUsing pipe assignment in your own package %<>%: How to ?R Markdown Notebooks (from RStudio)Updating R versionAggregating data framesData acquisitionR memento by examplesCreating packages with devtools

xgboost

Other topics

Cross Validation and Tuning with xgboost

library(caret) # for dummyVars
library(RCurl) # download https data
library(Metrics) # calculate errors
library(xgboost) # model

###############################################################################
# Load the "Adult" census-income data from the UCI Machine Learning Repository
# (http://archive.ics.uci.edu/ml/datasets.html).
urlfile <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
x <- getURL(urlfile, ssl.verifypeer = FALSE)  # RCurl handles the https download
adults <- read.csv(textConnection(x), header = FALSE)

# Alternative without RCurl:
# adults <- read.csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=F)

# The raw file has no header row; assign descriptive column names.
names(adults) <- c('age','workclass','fnlwgt','education','educationNum',
                   'maritalStatus','occupation','relationship','race',
                   'sex','capitalGain','capitalLoss','hoursWeek',
                   'nativeCountry','income')

# Clean up data: encode the outcome as 0/1.
# Note the leading space in ' <=50K' -- values in the raw file are space-padded.
adults$income <- ifelse(adults$income == ' <=50K', 0, 1)

# Binarize (one-hot encode) all factor columns with caret::dummyVars.
# (caret is already loaded at the top of the script.)
dmy <- dummyVars(" ~ .", data = adults)
adultsTrsf <- data.frame(predict(dmy, newdata = adults))
###############################################################################
###############################################################################

# Outcome we are predicting: whether an adult makes more than 50k (0/1 above).
outcomeName <- c('income')
# Features: every column of the transformed frame except the outcome itself.
predictors <- setdiff(names(adultsTrsf), outcomeName)

# Play around with settings of xgboost - eXtreme Gradient Boosting (Tree) library.
# Parameter reference: https://xgboost.readthedocs.io/en/stable/parameter.html
# max.depth - maximum depth of the tree
# nrounds   - the max number of boosting iterations

# Take only the first 10% of the rows for the parameter sweep below.
trainPortion <- floor(nrow(adultsTrsf) * 0.1)

# Split that 10% slice in half: first half trains, second half tests.
halfPortion <- floor(trainPortion / 2)
trainSet <- adultsTrsf[seq_len(halfPortion), ]
testSet  <- adultsTrsf[(halfPortion + 1):trainPortion, ]

# Simple grid search over tree depth and boosting rounds on the small
# train/test split above.  Track the lowest RMSE seen so far and print each
# new best as "<depth> <rounds> <error>".
smallestError <- Inf  # was 100; Inf guarantees the very first model is recorded
for (depth in 1:10) {
        for (rounds in 1:20) {

                # train (verbose = 0 silences per-iteration logging).
                # `nrounds`/`max_depth` are the canonical argument/parameter
                # names; the original `nround`/`max.depth` only worked via
                # partial matching / dot-alias conversion.
                bst <- xgboost(data = as.matrix(trainSet[, predictors]),
                               label = trainSet[, outcomeName],
                               max_depth = depth, nrounds = rounds,
                               # "reg:linear" is a deprecated alias of
                               # "reg:squarederror" in xgboost >= 1.0
                               objective = "reg:linear", verbose = 0)
                gc()  # reclaim memory between the many short-lived models

                # predict on the held-out half; outputmargin returns raw scores
                predictions <- predict(bst, as.matrix(testSet[, predictors]), outputmargin = TRUE)
                err <- rmse(as.numeric(testSet[, outcomeName]), as.numeric(predictions))

                if (err < smallestError) {
                        smallestError <- err
                        print(paste(depth, rounds, err))
                }
        }
}

# Cross-validated grid search: the same depth/rounds sweep, but each setting
# is scored by the mean RMSE over `nFolds` sequential chunks of the data.
nFolds <- 30                              # renamed from `cv` so the fold loop
trainSet <- adultsTrsf[1:trainPortion, ]  # variable can no longer shadow it
cvDivider <- floor(nrow(trainSet) / (nFolds + 1))

smallestError <- Inf  # Inf (not 100) so the first setting is always recorded
for (depth in 1:10) {
        for (rounds in 1:20) {
                totalError <- numeric(nFolds)  # preallocate; no c() growth
                for (fold in seq_len(nFolds)) {
                        # Chunk `fold` becomes the test set.  1-based and
                        # non-overlapping: rows (fold-1)*cvDivider+1 .. fold*cvDivider.
                        # The original (cv*cvDivider):(cv*cvDivider+cvDivider)
                        # skipped the first chunk entirely and overlapped
                        # adjacent folds by one row.
                        dataTestIndex <- ((fold - 1) * cvDivider + 1):(fold * cvDivider)
                        dataTest <- trainSet[dataTestIndex, ]
                        # everything else goes to train
                        dataTrain <- trainSet[-dataTestIndex, ]

                        bst <- xgboost(data = as.matrix(dataTrain[, predictors]),
                                       label = dataTrain[, outcomeName],
                                       max_depth = depth, nrounds = rounds,
                                       # deprecated alias of "reg:squarederror"
                                       objective = "reg:linear", verbose = 0)
                        gc()
                        predictions <- predict(bst, as.matrix(dataTest[, predictors]), outputmargin = TRUE)

                        totalError[fold] <- rmse(as.numeric(dataTest[, outcomeName]),
                                                 as.numeric(predictions))
                }
                if (mean(totalError) < smallestError) {
                        smallestError <- mean(totalError)
                        print(paste(depth, rounds, smallestError))
                }
        }
}
 
###########################################################################
# Test both winning parameter settings out on the full data set:
# train on the first `trainPortion` rows, evaluate on all remaining rows.

trainSet <- adultsTrsf[1:trainPortion, ]

# assign everything else to test
testSet <- adultsTrsf[(trainPortion + 1):nrow(adultsTrsf), ]

# Best setting found by the simple train/test sweep (depth 4, 19 rounds).
# `nrounds`/`max_depth` are the canonical names; `nround`/`max.depth`
# only worked via partial matching / dot-alias conversion.
bst <- xgboost(data = as.matrix(trainSet[, predictors]),
               label = trainSet[, outcomeName],
               max_depth = 4, nrounds = 19, objective = "reg:linear", verbose = 0)
pred <- predict(bst, as.matrix(testSet[, predictors]), outputmargin = TRUE)
rmse(as.numeric(testSet[, outcomeName]), as.numeric(pred))

# Best setting found by cross-validation (depth 3, 20 rounds).
bst <- xgboost(data = as.matrix(trainSet[, predictors]),
               label = trainSet[, outcomeName],
               max_depth = 3, nrounds = 20, objective = "reg:linear", verbose = 0)
pred <- predict(bst, as.matrix(testSet[, predictors]), outputmargin = TRUE)
rmse(as.numeric(testSet[, outcomeName]), as.numeric(pred))

Contributors

Topic Id: 3239

Example Ids: 11116

This site is not affiliated with any of the contributors.