1 Setup

Install required libraries (only if the libraries have not been installed before):

installed <- rownames(installed.packages())
required <- c("tidyverse",
              "tidylog",
              "here",
              "readxl",
              "sf",
              "DBI",
              "RSQLite",
              "digest",
              "testthat"
)
if (!all(required %in% installed)) {
  install.packages(required[!required %in% installed])
}

Load libraries:

library(tidyverse)      # To do data science
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
library(tidylog)        # To provide feedback on dplyr functions
library(here)           # To find files
library(readxl)         # To read Excel files
library(sf)             # To convert coordinate systems
## Warning: package 'sf' was built under R version 4.2.2
library(DBI)            # To work with databases
library(RSQLite)        # To work with SQLite databases in R
## Warning: package 'RSQLite' was built under R version 4.2.2
library(digest)         # To create hashes
## Warning: package 'digest' was built under R version 4.2.2

2 Read source data

Create a data frame occurrences from the source data:

occurrences <- readxl::read_excel(
  path = here("data", "raw", "Plantenmonitoring RIPARIAS (Vlaanderen).xlsx"),
  sheet = "Data_inventarisaties_opgekuist",
  col_types = "text")

Preview data:

occurrences %>% head(n = 10)

3 Process source data

3.1 Clean data

Abundance values:

occurrences %>% count(abundantie)

The value O / Occasional / >5%O / Occasional / >5% is of course a typo. We improve it:

occurrences$abundantie <-  recode(
  occurrences$abundantie,
  "O / Occasional / >5%O / Occasional / >5%" = "O / Occasional / >5%"
)

Check improved values:

occurrences %>% count(abundantie)

3.2 Add scientific name where missing

Column soort contains vernacular names in Dutch, sometimes with the scientific name as prefix:

occurrences %>%
  distinct(soort)

We add the scientific name where missing: - roze waterlelie: Nymphaea Γ—marliacea - brede waterpest: Elodea canadensis - smalle waterpest: Elodea nuttallii

occurrences <-
  occurrences %>%
  mutate(soort = recode(
    soort,
    "roze waterlelie" = "roze waterlelie / Nymphaea Γ—marliacea",
    "brede waterpest" = "brede waterpest / Elodea canadensis",
    "smalle waterpest" = "smalle waterpest / Elodea nuttallii"
  )
)

Show distinct values of column soort:

occurrences %>% 
  distinct(soort)

3.3 Tidy data

Extract vernacular names from column soort:

occurrences <- 
  occurrences %>%
  mutate(
    vernacularName = stringr::str_extract(
      string = .data$soort,
      pattern = "(.*?)(?=\\/)")) %>%
  mutate(vernacularName = stringr::str_trim(.data$vernacularName))

Extract scientific names from column soort:

occurrences <- 
  occurrences %>%
  mutate(
    scientificName = stringr::str_extract(
      string = .data$soort,
      pattern = "(?<=\\/)(.*)")) %>%
  mutate(scientificName = stringr::str_trim(scientificName))

Mapped values:

occurrences %>% distinct(scientificName, vernacularName)

4 Create database

Create a SQLite database with the source data, so it can be queried with SQL in the next steps:

message("Create in memory SQLite database...")
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
# import occurrences
DBI::dbWriteTable(con, "occurrences", occurrences)
message("DONE")

5 Darwin Core mapping

Create Event core:

dwc_event_sql <- glue::glue_sql(
  readr::read_file(here::here("sql", "dwc_event.sql")),
  .con = con
)
dwc_event <- DBI::dbGetQuery(con, dwc_event_sql)

Create Occurrence extension:

message("Map occurrences to DwC...")
dwc_occurrence_sql <- glue::glue_sql(
  readr::read_file(here::here("sql", "dwc_occurrence.sql")), 
  .con = con
)
dwc_occurrence <- DBI::dbGetQuery(con, dwc_occurrence_sql)
message("DONE")

Create Measurement Or Facts extension:

dwc_mof_sql <- glue::glue_sql(
  readr::read_file(here::here("sql", "dwc_mof.sql")), 
  .con = con
)
dwc_mof <- DBI::dbGetQuery(con, dwc_mof_sql)

Disconnect from SQLite database:

DBI::dbDisconnect(con)

6 Save data to CSV

write_csv(
  x = dwc_event,
  file = here::here("data", "processed", "event.csv"),
  na = ""
)
write_csv(
  x = dwc_occurrence, 
  file = here::here("data", "processed", "occurrence.csv"),
  na = ""
)
write_csv(
  x = dwc_mof, 
  file = here::here("data", "processed", "mof.csv"),
  na = ""
)

7 Test output

Load tests and run them to validate the DwC mapping:

source(here::here("tests", "test_dwc_event_occurrence_mof.R"))
## Warning: package 'testthat' was built under R version 4.2.2
## Test passed 😸
## Test passed πŸŽ‰
## Test passed πŸŽ‰
## Test passed 🎊
## Test passed πŸŽ‰
## Test passed πŸ˜€
## Test passed πŸ˜€
## Test passed πŸŽ‰
## Test passed 😸
## Test passed πŸ₯‡
## Test passed πŸŽ‰
## Test passed πŸ₯³
## Test passed 😸
## Test passed 😸
## Test passed πŸ₯³
## Test passed πŸ₯³
## Test passed πŸ₯³
## Test passed πŸŽ‰
## Test passed πŸ₯³
## Test passed 🎊
## Test passed πŸ₯³
## Test passed πŸŽ‰
## Test passed 😸
## Test passed 🎊
## Test passed πŸ˜€
## Test passed πŸŽ‰
## Test passed πŸ˜€
## Test passed 😸
## Test passed πŸ₯‡
## Test passed πŸ₯‡
## Test passed πŸ˜€
## Test passed πŸŽ‰
## Test passed πŸŽ‰
## Test passed 🌈
## Test passed πŸ˜€