Like the model evaluation measures calculated by the threshMeasures function, the area under the ROC curve (AUC) assesses the discrimination performance of a model; but unlike them, it does not require the choice of a particular threshold above which to consider that a model predicts species presence, but rather averages discrimination performance over all possible thresholds at a given interval. Mind that there has been criticism of the AUC (Lobo et al. 2008, Jiménez-Valverde et al. 2013), although it is still one of the most widely used metrics in model evaluation. It is highly correlated with species prevalence, so this value is also provided by the AUC function (if simplif = FALSE, the default) for reference.
Although there are functions to calculate the AUC in other R packages (such as ROCR, PresenceAbsence and verification), the AUC function below is more compatible with the remaining functions in this blog and in the modEvA package where it is included (Barbosa et al. 2014), and can be applied not only to a set of presence-absence versus predicted values, but also directly to a model object of class glm.
AUC <- function(model = NULL, obs = NULL, pred = NULL, simplif = FALSE, interval = 0.01, FPR.limits = c(0, 1), curve = "ROC", method = "rank", plot = TRUE, diag = TRUE, diag.col = "grey", diag.lty = 1, curve.col = "black", curve.lty = 1, curve.lwd = 2, plot.values = TRUE, plot.digits = 3, plot.preds = FALSE, grid = FALSE, xlab = "auto", ylab = "auto", ...) {
# version 2.2 (17 Jan 2020)
# Computes the area under the ROC (or precision-recall) curve, either from a
# binomial-logit 'glm' object ('model') or from paired vectors of observed
# binary (0/1) values ('obs') and predicted values ('pred').
# method: "rank" (Mann-Whitney U; exact, ROC only), "trapezoid" (on the curve
# sampled every 'interval'), or "integrate" (numerical integration of an
# interpolation of that curve).
# Returns just the numeric AUC if simplif = TRUE; otherwise a list with the
# per-threshold measures table, N, prevalence, AUC and AUCratio (= AUC / 0.5).

  # 'all.equal' can return a character vector describing the mismatch,
  # so test it with isTRUE() rather than '!= TRUE':
  if (!isTRUE(all.equal(FPR.limits, c(0, 1)))) stop ("Sorry, 'FPR.limits' not yet implemented. Please use default values.")
  if (length(obs) != length(pred)) stop ("'obs' and 'pred' must be of the same length (and in the same order).")

  if (!is.null(model)) {
    if (!("glm" %in% class(model) && model$family$family == "binomial" && model$family$link == "logit")) stop ("'model' must be an object of class 'glm' with 'binomial' family and 'logit' link.")
    if (!is.null(obs)) message("Argument 'obs' ignored in favour of 'model'.")
    if (!is.null(pred)) message("Argument 'pred' ignored in favour of 'model'.")
    obs <- model$y
    pred <- model$fitted.values
  }  # end if model

  # drop observation pairs with missing data (warning how many were removed):
  dat <- data.frame(obs, pred)
  n.in <- nrow(dat)
  dat <- na.omit(dat)
  n.out <- nrow(dat)
  if (n.out < n.in) warning (n.in - n.out, " observations removed due to missing data; ", n.out, " observations actually evaluated.")
  obs <- dat$obs
  pred <- dat$pred

  stopifnot(
    obs %in% c(0, 1),
    interval > 0,
    interval < 1,
    curve %in% c("ROC", "PR"),
    method %in% c("rank", "trapezoid", "integrate")
  )

  n1 <- sum(obs == 1)  # number of observed presences
  n0 <- sum(obs == 0)  # number of observed absences

  # the rank method is only equivalent to the area under the ROC curve,
  # so fall back on the trapezoid rule for other curves:
  if (curve != "ROC" && method == "rank") {
    method <- "trapezoid"
  }

  if (method == "rank") {
    # Mann-Whitney U statistic rescaled to [0, 1]
    # (next 3 lines from Wintle et al. 2005 supp. mat. "roc" function):
    xy <- c(pred[obs == 0], pred[obs == 1])
    rnk <- rank(xy)
    AUC <- ((n0 * n1) + ((n0 * (n0 + 1))/2) - sum(rnk[seq_len(n0)])) / (n0 * n1)
    if (simplif && !plot) return(AUC)
  }

  if (any(pred < 0) || any(pred > 1)) warning("Some of your predicted values are outside the [0, 1] interval within which thresholds are calculated.")

  N <- length(obs)
  thresholds <- seq(0, 1, by = interval)
  Nthresh <- length(thresholds)

  # confusion-matrix-derived measures at each threshold:
  true.positives <- true.negatives <- sensitivity <- specificity <- precision <- false.pos.rate <- n.preds <- prop.preds <- numeric(Nthresh)
  for (t in seq_len(Nthresh)) {
    true.positives[t] <- sum(obs == 1 & pred >= thresholds[t])
    true.negatives[t] <- sum(obs == 0 & pred < thresholds[t])
    sensitivity[t] <- true.positives[t] / n1
    specificity[t] <- true.negatives[t] / n0
    precision[t] <- true.positives[t] / sum(pred >= thresholds[t], na.rm = TRUE)
    false.pos.rate[t] <- 1 - specificity[t]
    # predictions falling exactly on this threshold, rounded to the
    # number of decimal places implied by 'interval':
    n.preds[t] <- sum(round(pred, nchar(Nthresh) - 1) == thresholds[t])
    prop.preds[t] <- n.preds[t] / length(pred)
  }

  # curve coordinates:
  if (curve == "ROC") {
    xx <- false.pos.rate
    yy <- sensitivity
  } else if (curve == "PR") {
    xx <- sensitivity
    yy <- precision
  } else {
    stop ("'curve' must be either 'ROC' or 'PR'.")  # unreachable (checked above); kept as a safeguard
  }

  if (method == "trapezoid") {
    xy <- na.omit(data.frame(xx, yy))
    xx <- xy$xx
    yy <- xy$yy
    # trapezoid rule (adapted from https://stackoverflow.com/a/22418496);
    # xx decreases as the threshold increases, so the signed sum is
    # negative and must be negated:
    AUC <- -sum(diff(xx) * (yy[-1] + yy[-length(yy)]) / 2)
  }

  if (method == "integrate") {
    xx.interp <- stats::approx(x = thresholds, y = xx, n = length(thresholds))
    yy.interp <- stats::approx(x = thresholds, y = yy, n = length(thresholds))
    f <- approxfun(x = xx.interp$y, y = yy.interp$y)
    AUC <- integrate(f, lower = min(thresholds), upper = max(thresholds))$value
  }

  if (plot) {
    # identical() (rather than '==') so a user-supplied length-2 label
    # does not break the scalar 'if':
    if (curve == "ROC") {
      if (identical(xlab, "auto")) xlab <- c("False positive rate", "(1-specificity)")
      if (identical(ylab, "auto")) ylab <- c("True positive rate", "(sensitivity)")
    }
    if (curve == "PR") {
      if (identical(xlab, "auto")) xlab <- c("Recall", "(sensitivity)")
      if (identical(ylab, "auto")) ylab <- c("Precision", "(positive predictive value)")
    }
    d <- if (diag) "l" else "n"  # to plot the 0.5 diagonal (or not if diag=FALSE)
    if (curve == "ROC") plot(x = c(0, 1), y = c(0, 1), type = d, xlab = xlab, ylab = ylab, col = diag.col, lty = diag.lty, ...)
    if (curve == "PR") plot(x = c(0, 1), y = c(1, 0), type = d, xlab = xlab, ylab = ylab, col = diag.col, lty = diag.lty, ...)
    if (grid) abline(h = thresholds, v = thresholds, col = "lightgrey")
    lines(x = xx, y = yy, col = curve.col, lty = curve.lty, lwd = curve.lwd)
    if (isTRUE(plot.preds)) plot.preds <- c("curve", "bottom")  # for back-compatibility
    if ("bottom" %in% plot.preds) {
      # circle size proportional to the proportion of predictions at each threshold:
      points(x = thresholds, y = rep(0, Nthresh), cex = 100 * prop.preds, col = "darkgrey")
    }
    if ("curve" %in% plot.preds) {
      points(x = xx, y = yy, cex = 100 * prop.preds, col = "darkgrey")
    }
    if (plot.values) {
      place <- if (curve == "ROC") 0.4 else 1
      text(1, place, adj = 1, substitute(paste(AUC == a), list(a = round(AUC, plot.digits))))
    }  # end if plot.values
  }  # end if plot

  if (simplif) return (AUC)

  thresholds.df <- data.frame(thresholds, true.positives, true.negatives, sensitivity, specificity, precision, false.pos.rate, n.preds, prop.preds)
  rownames(thresholds.df) <- thresholds
  # prevalence is only needed in the full output, so it is computed here
  # (not earlier), keeping the simplif = TRUE path independent of it:
  return (list(thresholds = thresholds.df, N = N, prevalence = prevalence(obs), AUC = AUC, AUCratio = AUC / 0.5))
}
EDIT: since modEvA version 1.7 (and as per updated code above), the AUC function can also compute the precision-recall curve.
Usage examples:
AUC(obs = mydata$myspecies, pred = mydata$myspecies_P, simplif = TRUE)
AUC(obs = mydata$myspecies, pred = mydata$myspecies_P)
AUC(model = mymodel, curve = "PR")
If you want the AUC values for a list of models produced e.g. by the multGLM function, you can do:
mymodels.auc <- vector("numeric", length(mymodels$models))
names(mymodels.auc) <- names(mymodels$models)
for (i in 1:length(mymodels$models)) {
mymodels.auc[i] <- AUC(model = mymodels$models[[i]], simplif = TRUE)
}
mymodels.auc
References:
Barbosa A.M., Brown J.A. & Real R. (2014) modEvA – an R package for model evaluation and analysis. R package, version 0.1.
Lobo, J.M., Jiménez-Valverde, A. & Real, R. (2008). AUC: a misleading measure of the performance of predictive distribution models. Global Ecology and Biogeography 17: 145-151
Jiménez-Valverde, A., Acevedo, P., Barbosa, A.M., Lobo, J.M. & Real, R. (2013). Discrimination capacity in species distribution models depends on the representativeness of the environmental domain. Global Ecology and Biogeography 22: 508–516
