1 Setup

Install required packages (only if the packages have not been installed before):

installed <- rownames(installed.packages())
required <- c("tidyverse",
              "tidylog",
              "here",
              "readxl",
              "sf",
              "DBI",
              "RSQLite",
              "digest",
              "testthat"
)
if (!all(required %in% installed)) {
  install.packages(required[!required %in% installed])
}

Load packages:

library(tidyverse)      # To do data science
library(tidylog)        # To provide feedback on dplyr functions
library(here)           # To find files
library(readxl)         # To read Excel files
library(sf)             # To convert coordinate systems
library(DBI)            # To work with databases
library(RSQLite)        # To work with SQLite databases in R
library(digest)         # To create hashes
library(testthat)       # To perform tests

2 Read source data

Create a data frame occurrences from the source data:

occurrences <- readxl::read_excel(
  path = here("data", "raw", "rivierkreeften_ANB september 2022.xlsx")) %>%
  mutate(datum = as.character(.data$datum),
         x = as.character(x))

Preview data:

occurrences %>% head(n = 5)

3 Process source data

Define vector with the IAS crayfishes of interest:

ias_crayfishes <- c(
  "Pontastacus leptodactylus", 
  "Pacifastacus leniusculus", 
  "Faxonius limosus",
  "Faxonius rusticus",
  "Procambarus clarkii",
  "Procambarus acutus",
  "Procambarus fallax"
)

3.1 Tidy data

Pivot data to convert the columns Pontastacus leptodactylus, Pacifastacus leniusculus, Faxonius limosus, Faxonius rusticus, Procambarus clarkii, Procambarus acutus and Procambarus fallax in two columns called species and n:

occurrences <- 
  occurrences %>%
  tidyr::pivot_longer(
    cols = one_of(ias_crayfishes), 
    names_to = "species", 
    values_to = "n"
  )

Set n to 0 if NA:

occurrences <- 
  occurrences %>%
  mutate(n = if_else(is.na(.data$n), 0, .data$n))

Example tidy transformation:

occurrences %>% 
  select(locatie, species, n) %>%
  head(20)

3.2 Improve location identifiers

Some location identifiers are just numbers. Notice that they all have a non empty description (column omschrijving):

occurrences %>%
  filter(!grepl("\\D", locatie)) %>%
  distinct(locatie, omschrijving)

Create column location composed of the first capital letter of each word in omschrijving and the number in locatie:

occurrences <-
  occurrences %>%
  rowwise() %>%
  mutate(
    location = if_else(
      !grepl("\\D", locatie),
      paste0(
        paste0(unlist(stringr::str_extract_all(omschrijving, 
                                              pattern = "[A-Z]")),
               collapse = ""),
        locatie, collapse = ""),
      locatie
    )) %>%
  ungroup()

Improved locations:

occurrences %>%
  filter(!grepl("\\D", locatie)) %>%
  distinct(locatie, omschrijving, location)

Other locations should not be changed. Example:

occurrences %>%
  filter(grepl("\\D", locatie)) %>% 
  distinct(locatie, omschrijving, location) %>%
  head(10)

3.3 Clean geographical coordinates

Some rows contain suspicious coordinates:

occurrences %>%
  filter(!grepl("\\D", locatie)) %>%
  distinct(x, y)

The decimal separator (.), if missing, has to be set after six digits from left:

# add dot after six digits
occurrences$x_improved <- gsub("^(.{6})(.*)$",
                      "\\1\\.\\2",
                      occurrences$x)
occurrences$y_improved <- gsub("^(.{6})(.*)$",
                      "\\1\\.\\2",
                      occurrences$y)
# remove duplicate dots
occurrences$x_improved <- gsub("(\\.)\\1+", "\\1", occurrences$x_improved)
occurrences$y_improved <- gsub("(\\.)\\1+", "\\1", occurrences$y_improved)
# remove dot at the end
occurrences$x_improved <- gsub("\\.$", "", occurrences$x_improved)
occurrences$y_improved <- gsub("\\.$", "", occurrences$y_improved)

See changes:

occurrences %>% distinct(x, y, x_improved, y_improved)

3.4 Change coordinate system

Data are provided in Belgian Lambert72 (EPSG 31370) coordinates. DwC accepts data in WGS84 (EPSG 4326) only.

occurrences <- 
  occurrences %>%
  # remove x and y as they contain errors
  select(-c(x, y)) %>%
  # create duplicate of x_improved and y_improved using lambert as suffix
  mutate(x_lambert = x_improved,
         y_lambert = y_improved) %>%
  # transform to numeric
  mutate(across(ends_with("improved"), as.numeric)) %>%
  # transform to a geospatial dataframe
  st_as_sf(crs = st_crs(31370), coords = c("x_improved", "y_improved")) %>%
  # transform corodinate reference system
  st_transform(crs = 4326)
# retrieve coordinates from geospatial dataframe
coords <- as_tibble(st_coordinates(occurrences))
# convert back to standard data.frame attaching new coordinates
occurrences <- 
  as_tibble(occurrences) %>% 
  bind_cols(coords) %>%
  select(-.data$geometry)

Preview:

occurrences %>% head(5)

3.5 Generate hashes for species names

We generate a hash based on the species name as saved in column species. This is needed to create a unique occurrenceID of the form eventID:hash where eventID will be later defined during the mapping. As long as the species name doesn’t change, the hash and so the occurrenceID will be stable:

vdigest <- Vectorize(digest)
# Generate hashes
occurrences <-
  occurrences %>% 
  mutate(species_name_hash = vdigest(.data$species, algo = "md5"))

Preview:

occurrences %>%
  select(starts_with("species")) %>%
  distinct()

4 Create database

Create a SQLite database with the source data, so it can be queried with SQL in the next steps:

message("Create in memory SQLite database...")
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
# import occurrences
DBI::dbWriteTable(con, "occurrences", occurrences)
message("DONE")

5 Darwin Core mapping

Create Event core:

dwc_event_sql <- glue::glue_sql(
  readr::read_file(here::here("sql", "dwc_event.sql")),
  .con = con
)
dwc_event <- DBI::dbGetQuery(con, dwc_event_sql)

Create Occurrence extension:

message("Map occurrences to DwC...")
dwc_occurrence_sql <- glue::glue_sql(
  readr::read_file(here::here("sql", "dwc_occurrence.sql")), 
  .con = con
)
dwc_occurrence <- DBI::dbGetQuery(con, dwc_occurrence_sql)
message("DONE")

Close connection to sqlite database:

DBI::dbDisconnect(con)

6 Save data to CSV

write_csv(dwc_event, here::here("data", "processed", "event.csv"), na = "")
write_csv(dwc_occurrence, here::here("data", "processed", "occurrence.csv"),
          na = ""
)

7 Test output

Load tests and run them to validate the DwC mapping:

source(here("tests", "test_dwc_event_occurrence.R"))
## Rows: 256 Columns: 21
## ── Column specification ──────
## Delimiter: ","
## chr  (15): type, language, license, rightsHolder, datasetID, institutionCode...
## dbl   (5): decimalLatitude, decimalLongitude, coordinateUncertaintyInMeters,...
## date  (1): eventDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1792 Columns: 9
## ── Column specification ──────
## Delimiter: ","
## chr (8): eventID, basisOfRecord, occurrenceID, occurrenceStatus, occurrenceR...
## dbl (1): individualCount
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Test passed 😀
## Test passed 🎉
## Test passed 🎉
## Test passed 😸
## Test passed 😀
## Test passed 🥳
## Test passed 🥇
## Test passed 🌈
## Test passed 😸
## Test passed 🎉
## Test passed 🎊
## Test passed 🌈
## Test passed 🥳
## Test passed 😸
## Test passed 😸
## Test passed 🥇
## Test passed 🌈
## Test passed 🌈
## Test passed 🎊