Install required packages (only if the packages have not been installed before):
installed <- rownames(installed.packages())
required <- c("tidyverse",
"tidylog",
"here",
"readxl",
"sf",
"DBI",
"RSQLite",
"digest",
"testthat"
)
if (!all(required %in% installed)) {
install.packages(required[!required %in% installed])
}
Load packages:
library(tidyverse) # To do data science
library(tidylog) # To provide feedback on dplyr functions
library(here) # To find files
library(readxl) # To read Excel files
library(sf) # To convert coordinate systems
library(DBI) # To work with databases
library(RSQLite) # To work with SQLite databases in R
library(digest) # To create hashes
library(testthat) # To perform tests
Create a data frame occurrences
from the source
data:
occurrences <- readxl::read_excel(
path = here("data", "raw", "rivierkreeften_ANB september 2022.xlsx")) %>%
mutate(datum = as.character(.data$datum),
x = as.character(x))
Preview data:
occurrences %>% head(n = 5)
Define vector with the IAS crayfishes of interest:
ias_crayfishes <- c(
"Pontastacus leptodactylus",
"Pacifastacus leniusculus",
"Faxonius limosus",
"Faxonius rusticus",
"Procambarus clarkii",
"Procambarus acutus",
"Procambarus fallax"
)
Pivot data to convert the columns
Pontastacus leptodactylus
,
Pacifastacus leniusculus
, Faxonius limosus
,
Faxonius rusticus
, Procambarus clarkii
,
Procambarus acutus
and Procambarus fallax
in
two columns called species
and n
:
occurrences <-
occurrences %>%
tidyr::pivot_longer(
cols = one_of(ias_crayfishes),
names_to = "species",
values_to = "n"
)
Set n
to 0 if NA
:
occurrences <-
occurrences %>%
mutate(n = if_else(is.na(.data$n), 0, .data$n))
Example tidy transformation:
occurrences %>%
select(locatie, species, n) %>%
head(20)
Some location identifiers are just numbers. Notice that they all have
a non empty description (column omschrijving
):
occurrences %>%
filter(!grepl("\\D", locatie)) %>%
distinct(locatie, omschrijving)
Create column location
composed of the first capital
letter of each word in omschrijving
and the number in
locatie
:
occurrences <-
occurrences %>%
rowwise() %>%
mutate(
location = if_else(
!grepl("\\D", locatie),
paste0(
paste0(unlist(stringr::str_extract_all(omschrijving,
pattern = "[A-Z]")),
collapse = ""),
locatie, collapse = ""),
locatie
)) %>%
ungroup()
Improved locations:
occurrences %>%
filter(!grepl("\\D", locatie)) %>%
distinct(locatie, omschrijving, location)
Other locations should not be changed. Example:
occurrences %>%
filter(grepl("\\D", locatie)) %>%
distinct(locatie, omschrijving, location) %>%
head(10)
Some rows contain suspicious coordinates:
occurrences %>%
filter(!grepl("\\D", locatie)) %>%
distinct(x, y)
The decimal separator (.
), if missing, has to be set
after six digits from left:
# add dot after six digits
occurrences$x_improved <- gsub("^(.{6})(.*)$",
"\\1\\.\\2",
occurrences$x)
occurrences$y_improved <- gsub("^(.{6})(.*)$",
"\\1\\.\\2",
occurrences$y)
# remove duplicate dots
occurrences$x_improved <- gsub("(\\.)\\1+", "\\1", occurrences$x_improved)
occurrences$y_improved <- gsub("(\\.)\\1+", "\\1", occurrences$y_improved)
# remove dot at the end
occurrences$x_improved <- gsub("\\.$", "", occurrences$x_improved)
occurrences$y_improved <- gsub("\\.$", "", occurrences$y_improved)
See changes:
occurrences %>% distinct(x, y, x_improved, y_improved)
Data are provided in Belgian Lambert72 (EPSG 31370) coordinates. DwC accepts data in WGS84 (EPSG 4326) only.
occurrences <-
occurrences %>%
# remove x and y as they contain errors
select(-c(x, y)) %>%
# create duplicate of x_improved and y_improved using lambert as suffix
mutate(x_lambert = x_improved,
y_lambert = y_improved) %>%
# transform to numeric
mutate(across(ends_with("improved"), as.numeric)) %>%
# transform to a geospatial dataframe
st_as_sf(crs = st_crs(31370), coords = c("x_improved", "y_improved")) %>%
# transform corodinate reference system
st_transform(crs = 4326)
# retrieve coordinates from geospatial dataframe
coords <- as_tibble(st_coordinates(occurrences))
# convert back to standard data.frame attaching new coordinates
occurrences <-
as_tibble(occurrences) %>%
bind_cols(coords) %>%
select(-.data$geometry)
Preview:
occurrences %>% head(5)
We generate a hash based on the species name as saved in column
species
. This is needed to create a unique
occurrenceID
of the form eventID:hash
where
eventID
will be later defined during the mapping. As long
as the species name doesn’t change, the hash and so the
occurrenceID
will be stable:
vdigest <- Vectorize(digest)
# Generate hashes
occurrences <-
occurrences %>%
mutate(species_name_hash = vdigest(.data$species, algo = "md5"))
Preview:
occurrences %>%
select(starts_with("species")) %>%
distinct()
Create a SQLite database with the source data, so it can be queried with SQL in the next steps:
message("Create in memory SQLite database...")
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
# import occurrences
DBI::dbWriteTable(con, "occurrences", occurrences)
message("DONE")
Create Event core:
dwc_event_sql <- glue::glue_sql(
readr::read_file(here::here("sql", "dwc_event.sql")),
.con = con
)
dwc_event <- DBI::dbGetQuery(con, dwc_event_sql)
Create Occurrence extension:
message("Map occurrences to DwC...")
dwc_occurrence_sql <- glue::glue_sql(
readr::read_file(here::here("sql", "dwc_occurrence.sql")),
.con = con
)
dwc_occurrence <- DBI::dbGetQuery(con, dwc_occurrence_sql)
message("DONE")
Close connection to sqlite database:
DBI::dbDisconnect(con)
write_csv(dwc_event, here::here("data", "processed", "event.csv"), na = "")
write_csv(dwc_occurrence, here::here("data", "processed", "occurrence.csv"),
na = ""
)
Load tests and run them to validate the DwC mapping:
source(here("tests", "test_dwc_event_occurrence.R"))
## Rows: 256 Columns: 21
## ── Column specification ──────
## Delimiter: ","
## chr (15): type, language, license, rightsHolder, datasetID, institutionCode...
## dbl (5): decimalLatitude, decimalLongitude, coordinateUncertaintyInMeters,...
## date (1): eventDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1792 Columns: 9
## ── Column specification ──────
## Delimiter: ","
## chr (8): eventID, basisOfRecord, occurrenceID, occurrenceStatus, occurrenceR...
## dbl (1): individualCount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Test passed 😀
## Test passed 🎉
## Test passed 🎉
## Test passed 😸
## Test passed 😀
## Test passed 🥳
## Test passed 🥇
## Test passed 🌈
## Test passed 😸
## Test passed 🎉
## Test passed 🎊
## Test passed 🌈
## Test passed 🥳
## Test passed 😸
## Test passed 😸
## Test passed 🥇
## Test passed 🌈
## Test passed 🌈
## Test passed 🎊