Plot column(s) of a polygon vector map

NOTE: I recently posted a tutorial on the cartography R package, which makes mapping columns of a data frame much less troublesome. You may want to look at that instead.

If you have a polygon vector map in R and want to quickly map the values in one or more columns of its attribute table, you can use the plotMapColumns function below. There's an option to rasterize the map before plotting (TRUE by default), which can be considerably faster, but you'll need an appropriate raster.upscale value: the number by which the range of coordinates is divided to get the number of pixels in the plotted maps. It's advisable to first check range(coordinates(map)) and see for yourself which divisor will produce raster maps of a reasonably computable size. For geographical lat-lon coordinates, a raster.upscale of 1 will usually work (you'll get at most 360 x 180 pixels; you may even want to lower it to 0.5 or 0.3 if you need more detailed maps), but for a UTM projection (whose coordinates can be much larger numbers) you may need a raster.upscale of 10000 to get a reasonably computable number of pixels.
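
For example, you could eyeball an adequate raster.upscale like this (a quick sketch, assuming 'map' is your imported SpatialPolygons* object):

library(sp)
rng <- apply(coordinates(map), 2, range)  # 2x2 matrix: rows = min, max; columns = x, y
(rng[2, ] - rng[1, ]) / 1      # approximate pixel numbers with raster.upscale = 1
(rng[2, ] - rng[1, ]) / 10000  # e.g. for UTM coordinates in metres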

plotMapColumns <- function(map, # SpatialPolygons* object
                           columns, # index(es) of the column(s) of map@data containing the values to plot (there will be one output map per column)
                           rasterize = TRUE, # whether to rasterize the map before plotting (often considerably faster)
                           raster.upscale = 1, # number by which the difference between maximum and minimum coordinates is divided to get the number of pixels (used only if rasterize = TRUE); e.g., for geographical lat-lon an upscale factor of 1 may work, but for a UTM projection you may need an upscale factor of 10,000!
                           ...) # additional arguments for the (sp)plot function
  {

  stopifnot(is.null(raster.upscale) || raster.upscale > 0,
            rasterize == FALSE || require(raster)
            )

  if (!all(columns %in% 1:length(names(map)))) stop("index out of bounds; 'columns' must exist in map@data.")
 
  if (rasterize) {
    xmin <- min(coordinates(map)[ , 1])
    xmax <- max(coordinates(map)[ , 1])
    ymin <- min(coordinates(map)[ , 2])
    ymax <- max(coordinates(map)[ , 2])
    wdth <- round((xmax - xmin) / raster.upscale)
    hght <- round((ymax - ymin) / raster.upscale)
    message("plotting map(s) with ", wdth, "x", hght, " pixels; consider raising 'raster.upscale' if this is taking too long, or lowering it if the resulting maps are too coarse.")
    rast <- raster(nrows = hght, ncols = wdth, xmn = xmin, xmx = xmax, ymn = ymin, ymx = ymax)
  }  # end if rasterize
 
  require(sp)
 
  n.cols <- length(columns)
  col.count <- 0
  for (i in columns) {
    col.count <- col.count + 1
    message("Plotting column ", col.count, " of ", n.cols, "...")
    if (rasterize) {
      map.rast <- raster::rasterize(x = map, y = rast, field = names(map)[i], fun = 'last')
      plot(map.rast, main = names(map)[i], ...)
    } else {
      print(spplot(map, zcol = names(map)[i], main = names(map)[i], ...))
    }
  }  # end for i
  message("Finished!")
}


Usage example:

# download, unzip and import a map of countries:
download.file("http://biogeo.ucdavis.edu/data/world/countries_shp.zip", destfile = "countries_shp.zip")
unzip("countries_shp.zip")
countries <- rgdal::readOGR(dsn = ".", layer = "countries")
 
# see the data in the attributes table:
head(countries)
names(countries)
 
# use plotMapColumns with and without rasterizing:
plotMapColumns(countries, columns = 17:18, rasterize = TRUE, raster.upscale = 1)
plotMapColumns(countries, columns = 18, rasterize = FALSE)  # slower

You can add arguments for the (sp)plot function, e.g. to get different colour schemes. The plotMapColumns function is not (yet) included in a package.
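
For example, here's a hypothetical call reusing the 'countries' map imported above, passing a different colour palette through the '...' argument (hcl.colors is in base R from version 3.6 on):

plotMapColumns(countries, columns = 18, rasterize = TRUE, raster.upscale = 1, col = hcl.colors(100, "viridis"))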

Analyse and compare stepwise model predictions

This function builds a generalized linear model with forward stepwise inclusion of variables, using AIC as the selection criterion, and gives the values predicted at each step, as well as their correlation with the final model predictions. This can be useful to see if a model with just the first few variables could already provide predictions very close to the final ones (see e.g. Fig. 3 in Muñoz et al., 2005). This function is now included in the fuzzySim package (Barbosa, 2015; version 1.3 just released). The correlation coefficient (cor.method) is “pearson” by default, but can be set to “spearman” or “kendall” as well.

stepByStep <- function(data, # name of the dataset
                       sp.col, # index of the species (target variable) column
                       var.cols, # indices of the variables columns
                       family = binomial(link = "logit"),
                       Favourability = FALSE, # used only if family = binomial(logit)
                       trace = 0, # argument for 'step'
                       cor.method = "pearson") {

  # version 1.3 (22 Jul 2015)

  n.init <- nrow(data)
  data <- na.omit(data)
  na.loss <- n.init - nrow(data)
  if (na.loss > 0) message(na.loss, " cases excluded due to missing values.")

  response <- colnames(data)[sp.col]
  null.model.formula <- as.formula(paste(response, "~", 1))
  scope.formula <- as.formula(paste("~", paste(colnames(data)[var.cols], collapse = "+")))
  mod <- step(glm(null.model.formula, family = family, data = data), scope = scope.formula, direction = "forward", trace = trace)
  pred.final <- mod$fitted.values

  if (!all(c("binomial", "logit") %in% mod$family)) Favourability <- FALSE
  if (Favourability) fav.final <- Fav(model = mod)

  model.vars.split <- sapply(mod$anova[ , 1], strsplit, split = " ")
  model.vars <- lapply(model.vars.split, `[`, 2)
  model.vars <- as.character(model.vars)[-1]
  n.steps <- length(model.vars)

  preds <- favs <- as.data.frame(matrix(nrow = nrow(data), ncol = n.steps))
  cor.P <- cor.F <- vector("numeric", n.steps)
  names(model.vars) <- names(preds) <- names(favs) <- names(cor.P) <- names(cor.F) <- paste0("step", 1:n.steps)

  for (s in 1:n.steps) {
    step.vars <- model.vars[1:s]
    mod.step <- glm(as.formula(paste(response, "~", paste(step.vars, collapse = "+"))), family = family, data = data)
    if (Favourability) {
      favs[ , s] <- Fav(model = mod.step)
      cor.F[s] <- cor(favs[ , s], fav.final, method = cor.method)
    } else {
      preds[ , s] <- mod.step$fitted.values
      cor.P[s] <- cor(preds[ , s], pred.final, method = cor.method)
    }
  }; rm(s)

  if (Favourability) result <- list(predictions = favs, correlations = cor.F, variables = model.vars, model = mod)
  else result <- list(predictions = preds, correlations = cor.P, variables = model.vars, model = mod)

  return(result)
}

Usage example:

library(fuzzySim)
data(rotif.env)
 
stepByStep(data = rotif.env, sp.col = 18, var.cols = 5:17, cor.method = "spearman")
 
stepByStep(data = rotif.env, sp.col = 18, var.cols = 5:17, Favourability = TRUE, cor.method = "pearson")
 
stepByStep(data = rotif.env, sp.col = 18, var.cols = 5:17, family = poisson)
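
The 'correlations' element of the output shows how quickly the stepwise predictions approach those of the final model; here's a quick sketch reusing the rotif.env example above:

res <- stepByStep(data = rotif.env, sp.col = 18, var.cols = 5:17)
plot(res$correlations, type = "b", xlab = "Model step", ylab = "Correlation with final predictions")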


Only forward selection is implemented for now. I may or may not eventually find the time to implement other selection methods.

REFERENCES

Barbosa A.M. (2015) fuzzySim: applying fuzzy logic to binary similarity indices in ecology. Methods in Ecology and Evolution, 6: 853-858

Muñoz A.R., Real R., Barbosa A.M. & Vargas J.M. (2005) Modelling the distribution of Bonelli's Eagle in Spain: implications for conservation planning. Diversity and Distributions, 11: 477-486

New publications on modTools functions

Two articles have recently been published based on modTools material: a short one in the Journal of Brief Ideas featuring function standard01, and another in Methods in Ecology and Evolution featuring package fuzzySim (which now includes modelling functions such as multGLM, FDR, multicol, etc. — see previous post). Please cite the articles when you use the functions!

Barbosa, A.M. (2015) Re-scaling of model evaluation measures to allow direct comparison of their values. Journal of Brief Ideas, 18 Feb 2015, DOI: 10.5281/zenodo.15487

Barbosa A.M. (2015) fuzzySim: applying fuzzy logic to binary similarity indices in ecology. Methods in Ecology and Evolution, 6: 853-858

Several modEvA functions moved to fuzzySim

Just a heads-up to let you know that the modelling functions formerly included in the modEvA package have been moved to package fuzzySim for operative and editorial reasons. These functions are: multGLM, modelTrim, Fav, getPreds, FDR, percentTestData, integerCols, multConvert and multicol. The rotif.env sample dataset has also moved to fuzzySim. Package modEvA now has dataset rotif.mods and keeps only functions strictly related to model evaluation and analysis, and not to model building.

The new modEvA version (currently 0.9) also includes some bug fixes, including one that made the packaged varPart function calculate the ABC overlap incorrectly – thanks to Jurica Levatic for spotting this.

Calculate zonal statistics from rasters in multiple zip files

This is a wrapper for the zonalFromZip function published in the previous post, for when you have multiple zip files with multiple raster files each (as in the WorldClim paleo-climate database), and you want to extract zonal statistics for them all automatically. To use it, you’ll need to have the zonalFromZip function loaded, as well as the raster R package.

zonalsFromZips <- function(zip.files, zones.map, rast.file.ext = ".tif", aux.file.ext = NULL, verbosity = 1, ...) {
  # v2.0 (2018/07/11)
  results <- vector("list", length(zip.files))
  names(results) <- basename(tools::file_path_sans_ext(zip.files))
  for (f in 1:length(zip.files)) {
    message("\nUNZIPPING FILE ", f, " OF ", length(zip.files), " (", basename(zip.files[f]), ")...")
    results[[f]] <- zonalFromZip(zip.file = zip.files[f], zones.map = zones.map, rast.file.ext = rast.file.ext, aux.file.ext = aux.file.ext, verbosity = verbosity, ...)
  }; rm(f)
  message("\nFINISHED ALL!")
  return(results)
}  # end zonalsFromZips function

The result is a list of data frames, each containing the zonal stats for one of the .zip files of rasters. Usage example:

LGM.zonals <- zonalsFromZips(zip.files = list.files("/home/joe/LGM", full.names = TRUE), zones.map = provinces)
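
Each element of the result has a 'zone' column (see zonalFromZip below), so you can merge them into a single table; a minimal sketch (mind that merge adds suffixes if variable names repeat across zips):

LGM.all <- Reduce(function(x, y) merge(x, y, by = "zone"), LGM.zonals)
head(LGM.all)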


Calculate zonal statistics from rasters in a zip file

Imagine you have a zipped folder with a bunch of raster maps containing variables (e.g. the ones you can download from WorldClim or from CliMond), and you need to calculate zonal statistics from each of these rasters. The zonalFromZip function, provided below, automates this process without the need to unzip the whole archive. It extracts one raster at a time from the .zip, imports it to R, calculates zonal statistics for your zones map (the 'mean' function is used by default, but you can provide any other argument accepted by the zonal function of the R raster package), and then deletes the unzipped file before unzipping the next one, therefore requiring minimal disk space.

zonalFromZip <- function(zip.file, zones.map, fun = mean, rast.file.ext = ".tif", aux.file.ext = NULL, delete.unzipped = TRUE, verbosity = 2, ...) {
  # version 2.1 (updated 11/12/2018)
  # zip.file: path to the zip containing the raster maps to calculate zonal stats from
  # zones.map: map (in your R workspace) containing the spatial units to calculate zonal stats to; must be of class 'RasterLayer', 'SpatialPolygons' or 'SpatialPolygonsDataFrame' and have the same CRS (and resolution, if raster) as the maps in 'zip.file'
  # fun: function to calculate (mean by default)
  # rast.file.ext: file extension of the raster maps in 'zip.file'
  # aux.file.ext: file extension of the auxiliary files (e.g. ".hdr" for .bil raster files, or ".rdc" for Idrisi .rst files)
  # ...: additional arguments to pass to 'raster::zonal' (if 'zones.map' is a raster) or 'raster::extract' (if 'zones.map' is a SpatialPolygons* map), such as na.rm = TRUE

  require(raster)
  rast.files <- unzip(zip.file, list = TRUE)$Name
  var.names <- unique(tools::file_path_sans_ext(rast.files))
  n.var <- length(var.names)
  zonal.stats <- vector("list", length(var.names))
  names(zonal.stats) <- var.names

  for (i in 1:n.var) {
    if (verbosity >= 1) message("Getting variable ", i, " of ", n.var)
    if (verbosity >= 2) message(" - unzipping file...")
    unzip(zip.file, files = paste0(var.names[i], rast.file.ext))
    if (!is.null(aux.file.ext)) unzip(zip.file, files = paste0(var.names[i], aux.file.ext))
    var.rast <- raster(paste0(var.names[i], rast.file.ext))
    if (inherits(zones.map, "RasterLayer") && !compareRaster(var.rast, zones.map, stopiffalse = FALSE)) {
      if (verbosity >= 2) message(" - cropping to zones raster...")
      var.rast <- crop(var.rast, zones.map)
    }

    if (verbosity >= 2) message(" - calculating zonal stats...")
    if (inherits(zones.map, "RasterLayer"))
      zonal.stats[[i]] <- raster::zonal(var.rast, zones.map, fun = fun, ...)
    else if (inherits(zones.map, "SpatialPolygons"))
      zonal.stats[[i]] <- raster::extract(var.rast, zones.map, fun = fun, df = TRUE, ...)
    else stop("'zones.map' must be of class 'RasterLayer', 'SpatialPolygons' or 'SpatialPolygonsDataFrame'")

    if (verbosity >= 2) message(" - deleting unzipped file...")
    if (delete.unzipped) {
      unlink(paste0(var.names[i], rast.file.ext))
      if (!is.null(aux.file.ext)) unlink(paste0(var.names[i], aux.file.ext))
    }  # end if delete
  }  # end for i

  if (verbosity >= 1) message("Converting results to data frame...")
  zonal.stats <- as.data.frame(zonal.stats)
  zonal.stats <- subset(zonal.stats, select = c(1, seq(2, ncol(zonal.stats), 2)))
  colnames(zonal.stats)[1] <- "zone"
  colnames(zonal.stats)[-1] <- var.names
  if (verbosity >= 1) message("Finished!")
  return(zonal.stats)
}

Mind that you need the raster R package installed for this, and a map (raster or SpatialPolygons*) of the spatial units (zones) to which you want to aggregate the raster variables. Usage examples:

LGM.CCSM4.utm10 <- zonalFromZip(zip.file = "LGM/CCSM4.zip", zones.map = utm10, rast.file.ext = ".tif", aux.file.ext = NULL)
head(LGM.CCSM4.utm10)
 
WClim.utm10 <- zonalFromZip(zip.file = "bio_2-5m_bil.zip", zones.map = utm10, rast.file.ext = ".bil", aux.file.ext = ".hdr")
head(WClim.utm10)
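
Since 'zones.map' can also be a SpatialPolygons* map (in which case raster::extract is used under the hood), a hypothetical call could look like this, with 'provinces' being a SpatialPolygonsDataFrame in the same CRS as the zipped rasters:

WClim.provinces <- zonalFromZip(zip.file = "bio_2-5m_bil.zip", zones.map = provinces, rast.file.ext = ".bil", aux.file.ext = ".hdr", fun = mean, na.rm = TRUE)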

Example for several .zip files within a folder at once (DEPRECATED – see next post’s zonalsFromZips function instead):

# "LGM" is the folder containing the zip files to extract zonal stats from:
for (f in list.files("LGM")) {
  name <- paste("LGM", tools::file_path_sans_ext(f), "utm10", sep = ".")
  zonstat <- zonalFromZip(zip.file = paste("LGM", f, sep = "/"), zones.map = utm10, rast.file.ext = ".tif", fun = mean)
  assign(name, zonstat)
}

Download several zip files automatically

I recently found out that a bunch of new climate scenarios were made available on WorldClim, at least for past climates. While there may be more efficient ways to do this, here's the R function I wrote to download several of them automatically, based on the URLs (link locations) of the .zip files:

downloadZips <- function(zip.urls, zip.names = NULL, dir.name = NULL, unzip = FALSE) {
# zip.names: names to give the downloaded zip files (if different from the original ones)
# dir.name: name of the directory folder in which to store the downloaded files
# unzip: logical, whether or not to unzip the files after downloading
  stopifnot(is.null(zip.names) | length(zip.names) == length(zip.urls))
  if (!is.null(dir.name)) {
    dir.create(dir.name)
    setwd(dir.name)
    on.exit(setwd("../"))
  }
  if (is.null(zip.names))  zip.names <- tools::file_path_sans_ext(basename(zip.urls))
  for (z in 1:length(zip.urls)) {
    message("\nDownloading zip ", z, " of ", length(zip.urls), " (", zip.names[z], ")...\n")
    zip.file <- paste(zip.names[z], "zip", sep = ".")
    download.file(zip.urls[z], zip.file)
    if (unzip) {
      dir.create(zip.names[z])
      message("Unzipping file to folder '", zip.names[z], "'...")
      unzip(zip.file, exdir = zip.names[z])
    }  # end if unzip
  }  # end for z
  message("Finished!")
}  # end downloadZips function

Usage examples (mind that these large files take quite a while to download):

# LGM:
downloadZips(zip.urls = c("http://biogeo.ucdavis.edu/data/climate/cmip5/lgm/cclgmbi_2-5m.zip", "http://biogeo.ucdavis.edu/data/climate/cmip5/lgm/mrlgmbi_2-5m.zip", "http://biogeo.ucdavis.edu/data/climate/cmip5/lgm/melgmbi_2-5m.zip"), zip.names = c("CCSM4", "MIROC_ESM", "MPI_ESM_P"), dir.name = "LGM")
 
# Mid Holocene:
downloadZips(zip.urls = c("http://biogeo.ucdavis.edu/data/climate/cmip5/mid/bcmidbi_2-5m.zip", "http://biogeo.ucdavis.edu/data/climate/cmip5/mid/ccmidbi_2-5m.zip", "http://biogeo.ucdavis.edu/data/climate/cmip5/mid/cnmidbi_2-5m.zip", "http://biogeo.ucdavis.edu/data/climate/cmip5/mid/hgmidbi_2-5m.zip", "http://biogeo.ucdavis.edu/data/climate/cmip5/mid/hemidbi_2-5m.zip", "http://biogeo.ucdavis.edu/data/climate/cmip5/mid/ipmidbi_2-5m.zip", "http://biogeo.ucdavis.edu/data/climate/cmip5/mid/mrmidbi_2-5m.zip", "http://biogeo.ucdavis.edu/data/climate/cmip5/mid/memidbi_2-5m.zip", "http://biogeo.ucdavis.edu/data/climate/cmip5/mid/mgmidbi_2-5m.zip"), zip.names = c("BCC_CSM1_1", "CCSM4", "CNRM_CM5", "HadGEM2_CC", "HadGEM2_ES", "IPSL_CM5A_LR", "MIROC_ESM", "MPI_ESM_P", "MRI_CGCM3"), dir.name = "MidHol")
 
# Last Inter-Glacial:
downloadZips(zip.urls = "http://biogeo.ucdavis.edu/data/climate/worldclim/1_4/grid/pst/lig/lig_30s_bio.zip", zip.names = "LIG_30s", dir.name = "LIG")


Miller calibration statistics

The MillerCalib function, now included in the modEvA package (version 0.7.2 just released), calculates Miller's (1991) calibration statistics for a logistic regression model, namely the intercept and slope of the regression of the binary response variable on the logit of predicted probabilities. Optionally (and by default), it also plots the corresponding regression line (black) over the reference diagonal (grey dashed line).

Calibration (or reliability) measures how a model's predicted probabilities relate to observed species prevalence, i.e. the proportion of presences in the modelled data. If predictions are perfectly calibrated, the slope will equal 1 and the intercept will equal 0, so the model's calibration line will perfectly overlap the reference diagonal. Note that Miller's statistics assess the model globally: a model is well calibrated if the average of all predicted probabilities equals the proportion of presences in the modelled data. Good calibration is therefore always attained on the same data used for building the model (Miller 1991); the function is thus most useful for checking whether a model remains well calibrated when extrapolated to a new dataset.

MillerCalib <- function(model = NULL, obs = NULL, pred = NULL, plot = TRUE, plot.values = TRUE, digits = 4, xlab = "", ylab = "", main = "Miller calibration", ...) {
  # version 1.3 (24 Jun 2015)
 
  model.provided <- ifelse(is.null(model), FALSE, TRUE)
 
  if (model.provided) {
    if (!is.null(pred)) message("Argument 'pred' ignored in favour of 'model'.")
    if (!is.null(obs)) message("Argument 'obs' ignored in favour of 'model'.")
    obs <- model$y
    pred <- model$fitted.values
  } else { # if model not provided
    if (is.null(obs) | is.null(pred)) stop("You must provide either 'obs' and 'pred', or a 'model' object of class 'glm'")
  }
 
  stopifnot(
    length(obs) == length(pred),
    obs %in% c(0,1),
    pred >= 0,
    pred <= 1
  )
 
  logit <- log(pred / (1 - pred))
  mod <- glm(obs ~ logit, family = binomial)
  intercept <- mod$coef[[1]]
  slope <- mod$coef[[2]]
  std.err <- summary(mod)$coefficients["logit", "Std. Error"]
  z <- (slope - 1) / std.err  # test against the null hypothesis that slope = 1 (Paternoster et al. 1998; http://stats.stackexchange.com/questions/55501/test-a-significant-difference-between-two-slope-values)
  slope.p <- 2 * pnorm(-abs(z))  # two-tailed p-value
 
  if (plot) {
    ymin <- min(0, intercept)
    ymax <- max(1, intercept + 0.1)
    plot(c(0, 1), c(ymin, ymax), type = "n", xlab = xlab, ylab = ylab, main = main)
    abline(0, 1, col = "lightgrey", lty = 2)
    abline(intercept, slope)
    if (plot.values) {
      plotext <- paste("intercept =" , round(intercept, digits), "\nslope =", round(slope, digits), "\nslope p-value =", round(slope.p, digits))
      text(x = 0, y = ymax - 0.25, adj = 0, labels = plotext)
    }
  }
  return(list(intercept = intercept, slope = slope, slope.pvalue = slope.p))
}  # end MillerCalib function


Input data can be either a glm model object with binomial family and logit link, or two vectors of observed binary (0 or 1) values and the corresponding predicted probabilities. The output is a named list with the calibration intercept, the slope, and the p-value for the difference between the slope and 1.
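
Here's a minimal usage sketch with simulated data (a hypothetical 'dat', just to illustrate the interface):

# simulate a binary response and fit a logistic regression model:
set.seed(1)
dat <- data.frame(x = rnorm(300))
dat$y <- rbinom(300, size = 1, prob = plogis(0.5 + 1.5 * dat$x))
mod <- glm(y ~ x, family = binomial, data = dat)

MillerCalib(model = mod)  # calibration on the training data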

REFERENCES

Miller M.E., Hui S.L. & Tierney W.M. (1991) Validation techniques for logistic regression models. Statistics in Medicine, 10: 1213-1226

Wintle B.A., Elith J. & Potts J.M. (2005) Fauna habitat modelling and mapping: A review and case study in the Lower Hunter Central Coast region of NSW. Austral Ecology, 30: 719-738

R-squared measures for generalized linear models

There are several ways of calculating (pseudo) R-squared values for logistic regression models, with no consensus about which is best. The RsqGLM function, now included in the modEvA package, calculates those of McFadden (1974), Cox & Snell (1989), Nagelkerke (1991), Tjur (2009), and the squared Pearson correlation between observed and predicted values:

RsqGLM <- function(obs = NULL, pred = NULL, model = NULL) {
  # version 1.2 (3 Jan 2015)
 
  model.provided <- ifelse(is.null(model), FALSE, TRUE)
 
  if (model.provided) {
    if (!("glm" %in% class(model))) stop ("'model' must be of class 'glm'.")
    if (!is.null(pred)) message("Argument 'pred' ignored in favour of 'model'.")
    if (!is.null(obs)) message("Argument 'obs' ignored in favour of 'model'.")
    obs <- model$y
    pred <- model$fitted.values
 
  } else { # if model not provided
    if (is.null(obs) | is.null(pred)) stop ("You must provide either 'obs' and 'pred', or a 'model' object of class 'glm'")
    if (length(obs) != length(pred)) stop ("'obs' and 'pred' must be of the same length (and in the same order).")
    if (!all(obs %in% c(0, 1)) | any(pred < 0) | any(pred > 1)) stop ("Sorry, 'obs' and 'pred' options currently only implemented for binomial GLMs (binary response variable with values 0 or 1) with logit link.")
    logit <- log(pred / (1 - pred))
    model <- glm(obs ~ logit, family = "binomial")
  }
 
  null.mod <- glm(obs ~ 1, family = family(model))
  loglike.M <- as.numeric(logLik(model))
  loglike.0 <- as.numeric(logLik(null.mod))
  N <- length(obs)
 
  # based on Nagelkerke 1991:
  CoxSnell <- 1 - exp(-(2 / N) * (loglike.M - loglike.0))
  Nagelkerke <- CoxSnell / (1 - exp((2 * N ^ (-1)) * loglike.0))
 
  # based on Allison 2014:
  McFadden <- 1 - (loglike.M / loglike.0)
  Tjur <- mean(pred[obs == 1]) - mean(pred[obs == 0])
  sqPearson <- cor(obs, pred) ^ 2
 
  return(list(CoxSnell = CoxSnell, Nagelkerke = Nagelkerke, McFadden = McFadden, Tjur = Tjur, sqPearson = sqPearson))
}


Input data can be either a glm model object or two vectors of observed binary (0 or 1) values and the corresponding predicted probabilities (only binomial-logit GLMs admitted in this case). The output is a named list of the calculated R-squared values. See also the Dsquared function.
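
Here's a minimal usage sketch with simulated data (hypothetical 'x' and 'y', just to illustrate the interface):

# simulate a binary response and fit a logistic regression model:
set.seed(1)
x <- rnorm(300)
y <- rbinom(300, size = 1, prob = plogis(0.5 + 1.5 * x))
mod <- glm(y ~ x, family = binomial)

RsqGLM(model = mod)
RsqGLM(obs = y, pred = mod$fitted.values)  # same result from vectors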

 

REFERENCES

Cox, D.R. & Snell E.J. (1989) The Analysis of Binary Data, 2nd ed. Chapman and Hall, London

McFadden, D. (1974) Conditional logit analysis of qualitative choice behavior. In: Zarembka P. (ed.) Frontiers in Economics. Academic Press, New York

Nagelkerke, N.J.D. (1991) A note on a general definition of the coefficient of determination. Biometrika, 78: 691-692

Tjur T. (2009) Coefficients of determination in logistic regression models – a new proposal: the coefficient of discrimination. The American Statistician, 63: 366-372

Search or replace a text string in all files within a folder

Imagine you have your package skeleton, with its functions in separate files in the ‘R‘ folder and their help files in the ‘man‘ folder. Now imagine you change your mind about the name of a function or argument which may appear in several of these files. It can be painfully dull and error-prone to replace the name manually or file-by-file… The searchStringInFiles function below searches, and optionally replaces, a character string globally within the texts of all files in a given directory (it can also be used to search and replace text in files other than R skeletons).

searchStringInFiles <- function(path, pattern, replacement = NULL, recursive = TRUE, ...) {
  # version 2.0 (17 Apr 2015)

  odir <- getwd()
  on.exit(setwd(odir))  # restore the original working directory even on error
  setwd(path)
  files <- list.files(".", recursive = recursive)

  if (is.null(replacement)) {
    for (f in files) {
      if (length(grep(pattern, readLines(f))) > 0) {
        cat("\n", f, "\n", sep = "")
        cat(">>> contains '", pattern, "'!\n", sep = "")
      }  # end if length
    }  # end for f

  } else {

    for (f in files) {
      in.text <- readLines(f)
      out.text <- gsub(pattern = pattern, replacement = replacement, x = in.text, ...)
      cat(out.text, sep = "\n", file = f)
    }  # end for f
  }  # end else
}  # end searchStringInFiles function


Load the function and give it your (package) folder path, the character string you want to search for, and (optionally) the replacement for this string:

searchStringInFiles(path = "/home/user/Documents/myPackages/supeR", pattern = "MYfunction")

searchStringInFiles(path = "/home/user/Documents/myPackages/supeR", pattern = "MYfunction", replacement = "myFunction")

Mind that global text replacements can get a bit out of control if not used and double-checked carefully. Beware, and back up your package before trying to pull off something like this! (In principle, a replacement can be undone by running the same command with the pattern and replacement arguments swapped, as long as the replacement string didn't already occur elsewhere in the files.)
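
For instance, here's a quick way to make such a backup from within R (a sketch with hypothetical paths):

# copy the whole package folder to a backup location before replacing:
backup.dir <- "/home/user/Documents/myPackages/supeR_backup"
dir.create(backup.dir)
file.copy(from = "/home/user/Documents/myPackages/supeR", to = backup.dir, recursive = TRUE)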