--- title: "Saving MHW Results to NetCDF" author: "Robert W Schlegel" date: "`r Sys.Date()`" description: "This vignette demonstrates how to save marine heatwaves (MHWs) detected from gridded data as a NetCDF file." output: rmarkdown::html_vignette: fig_caption: yes vignette: > %\VignetteIndexEntry{Saving MHW Results to NetCDF} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r global_options, include = FALSE} knitr::opts_chunk$set(fig.width = 4, fig.align = 'center', echo = TRUE, warning = FALSE, message = FALSE, eval = FALSE, tidy = FALSE) ``` ## Overview In the previous vignette we saw how to [detect marine heatwaves (MHWs) in gridded data](https://robwschlegel.github.io/heatwaveR/articles/gridded_event_detection.html). In this vignette we will use those gridded MHW results to see how to save them as a NetCDF file. ```{r load-pkg} library(dplyr) # For basic data manipulation library(ncdf4) # For creating NetCDF files library(tidync) # For easily dealing with NetCDF data library(ggplot2) # For visualising data library(doParallel) # For parallel processing ``` ## Loading data Please see the [downloading and preparing OISST data](https://robwschlegel.github.io/heatwaveR/articles/OISST_preparation.html) and [detecting marine heatwaves (MHWs) in gridded data](https://robwschlegel.github.io/heatwaveR/articles/gridded_event_detection.html) vignettes first if you have not yet worked through them. We will be using the MHW results created from those two vignettes below. ```{r load} MHW_res_grid <- readRDS("~/Desktop/MHW_result.Rds") ``` ## Prepare the data The main sticking point between MHW results and the NetCDF file format in R is that NetCDF files will only accept the file type in R known as "arrays", and most R outputs are data.frames. So the majority of what we need to do is to convert our data from data.frames to arrays. There are many ways to do this and a search of the interwebs will produce a plethora of results. Over the years I have settled into an approach that I use operationally for the MHW Tracker that I will also use here. It is not necessarily the fastest method, nor does it use the fewest lines of code possible, but it does follow the **`tidyverse`** approach to programming, and I think it is about as transparent as this process can be. We will create two functions below that will help us along in this process. ```{r data-prep} # Function for creating arrays from data.frames df_acast <- function(df, lon_lat){ # Force grid res <- df %>% right_join(lon_lat, by = c("lon", "lat")) %>% arrange(lon, lat) # Convert date values to integers if they are present if(lubridate::is.Date(res[1,4])) res[,4] <- as.integer(res[,4]) # Create array res_array <- base::array(res[,4], dim = c(length(unique(lon_lat$lon)), length(unique(lon_lat$lat)))) dimnames(res_array) <- list(lon = unique(lon_lat$lon), lat = unique(lon_lat$lat)) return(res_array) } # Wrapper function for last step before data are entered into NetCDF files df_proc <- function(df, col_choice){ # Determine the correct array dimensions lon_step <- mean(diff(sort(unique(df$lon)))) lat_step <- mean(diff(sort(unique(df$lat)))) lon <- seq(min(df$lon), max(df$lon), by = lon_step) lat <- seq(min(df$lat), max(df$lat), by = lat_step) # Create full lon/lat grid lon_lat <- expand.grid(lon = lon, lat = lat) %>% data.frame() # Acast only the desired column dfa <- plyr::daply(df[c("lon", "lat", "event_no", col_choice)], c("event_no"), df_acast, .parallel = T, lon_lat = lon_lat) return(dfa) } # We must now run this function on each column of data we want to add to the NetCDF file doParallel::registerDoParallel(cores = 7) prep_dur <- df_proc(MHW_res_grid, "duration") prep_max_int <- df_proc(MHW_res_grid, "intensity_max") prep_cum_int <- df_proc(MHW_res_grid, "intensity_cumulative") prep_peak <- df_proc(MHW_res_grid, "date_peak") ``` ## Create NetCDF shell With our data prepared into a series of arrays, we now need to set the stage for our NetCDF file. The important thing here is that the dimensions of the arrays we created match up to the dimensions of the NetCDF file. ```{r nc-shell} # Get file attributes lon_step <- mean(diff(sort(unique(MHW_res_grid$lon)))) lat_step <- mean(diff(sort(unique(MHW_res_grid$lat)))) lon <- seq(min(MHW_res_grid$lon), max(MHW_res_grid$lon), by = lon_step) lat <- seq(min(MHW_res_grid$lat), max(MHW_res_grid$lat), by = lat_step) event_no <- seq(min(MHW_res_grid$event_no), max(MHW_res_grid$event_no), by = 1) tunits <- "days since 1970-01-01" # Length of each attribute nlon <- length(lon) nlat <- length(lat) nen <- length(event_no) ``` ## Create NetCDF file The last step in this process is to add the prepared data to the NetCDF shell we have constructed. The **`ncdf4`** package helps to make this all a relatively straight forward process. ```{r nc-make} # Path and file name # NB: We are net setting time dimensions here because we are using event_no as our "time" dimension ncpath <- "~/Desktop/" ncname <- "MHW_results" ncfname <- paste0(ncpath, ncname, ".nc") # dname <- "tmp" # Define dimensions londim <- ncdim_def("lon", "degrees_east", lon) latdim <- ncdim_def("lat", "degrees_north", lat) endim <- ncdim_def("event_no", "event_number", event_no) # timedim <- ncdim_def("time", tunits, as.double(time)) # Define variables fillvalue <- 1e32 def_dur <- ncvar_def(name = "duration", units = "days", dim = list(endim, latdim, londim), missval = fillvalue, longname = "duration of MHW", prec = "double") def_max_int <- ncvar_def(name = "max_int", units = "deg_c", dim = list(endim, latdim, londim), missval = fillvalue, longname = "maximum intensity during MHW", prec = "double") def_cum_int <- ncvar_def(name = "cum_int", units = "deg_c days", dim = list(endim, latdim, londim), missval = fillvalue, longname = "cumulative intensity during MHW", prec = "double") def_peak <- ncvar_def(name = "date_peak", units = tunits, dim = list(endim, latdim, londim), missval = 0, longname = "date of peak temperature anomaly during MHW", prec = "integer") # Create netCDF file and prepare space for our arrays ncout <- nc_create(ncfname, list(def_peak, def_dur, def_max_int, def_cum_int), force_v4 = TRUE) # Add the actual data ncvar_put(ncout, def_dur, prep_dur) ncvar_put(ncout, def_max_int, prep_max_int) ncvar_put(ncout, def_cum_int, prep_cum_int) ncvar_put(ncout, def_peak, prep_peak) # Put additional attributes into dimension and data variables ncatt_put(ncout, "lon", "axis", "X") #,verbose=FALSE) #,definemode=FALSE) ncatt_put(ncout, "lat", "axis", "Y") ncatt_put(ncout, "event_no", "axis", "event_no") # Add global attributes # These are useful for whomever else may want to use your NetCDF file ncatt_put(ncout, 0, "title", paste0("MHW results from lon: ", min(lon)," to ",max(lon), " and lat: ",min(lat)," to ",max(lat))) ncatt_put(ncout, 0, "institution", "Your institution here!") ncatt_put(ncout, 0, "source", "NOAA OISST v2.1") ncatt_put(ncout,0, "references", "Banzon et al. (2020) J. Atmos. Oce. Tech. 37:341-349") history <- paste0("Your name here!, ", date()) ncatt_put(ncout, 0, "history", history) ncatt_put(ncout, 0, "Conventions", "Hobday et al. (2016)") # Assuming one has used the Hobday definition # Get a summary of the created file: ncout ``` ## Visualising the results Let's not stop there. To ensure that the NetCDF file was created correctly we want to load it back into our workspace and plot the results. ```{r res-vis} # Convenience function for comparing files quick_grid <- function(df, var_choice){ df %>% filter(event_no == 13) %>% ggplot(aes(x = lon, y = lat)) + geom_raster(aes_string(fill = var_choice)) + coord_cartesian(expand = F) + scale_fill_viridis_c() + theme(legend.position = "bottom") } # Thanks to the tidync package, loading the gridded data is very simple MHW_res_nc <- tidync("~/Desktop/MHW_results.nc") %>% hyper_tibble() %>% mutate(date_peak = as.Date(date_peak, origin = "1970-01-01")) # Plot the duration results quick_grid(MHW_res_grid, "duration") quick_grid(MHW_res_nc, "duration") # Cumulative intensity quick_grid(MHW_res_grid, "intensity_cumulative") quick_grid(MHW_res_nc, "cum_int") ```