1 Setup

Install required libraries (only if the libraries have not been installed before):

installed <- rownames(installed.packages())
required <- c(
  "tidyverse",
  "tidylog",
  "magrittr",
  "here",
  "janitor",
  "digest"
)
# Install whichever required packages are not in the local library yet
if (any(!required %in% installed)) {
  install.packages(setdiff(required, installed))
}

Load libraries:

library(tidyverse)      # To do data science
library(tidylog)        # To provide feedback on dplyr functions
library(magrittr)       # To use %<>% pipes
library(here)           # To find files
library(janitor)        # To clean input data
library(digest)         # To generate hashes

2 Read source data

Create a data frame input_data from the source data:

# Read the raw tab-separated list. Only entry_into_force gets an explicit
# type: it is parsed as a date written as day, full month name and year
# (format "%d %B %Y"), using English month names.
input_data <-
  here("data", "raw", "list_of_invasive_alien_species_of_union_concern.tsv") %>%
  read_tsv(
    col_types = cols(entry_into_force = col_date(format = "%d %B %Y")),
    locale = locale(date_names = "en"),
    show_col_types = FALSE
  )

Preview data:

input_data %>% head(n = 5)

3 Process source data

3.1 Tidy data

The column scientific_name_synonym contains synonym names. We’ll create new rows for the values in this column, keeping their association with the original scientific name (scientific_name_accepted):

# Stack the name and its synonym into a single scientific_name column, while
# remembering for every resulting row which name was the original one.
input_data <-
  input_data %>%
  mutate(scientific_name_accepted = scientific_name) %>%
  pivot_longer(
    cols = c("scientific_name", "scientific_name_synonym"),
    names_to = "col_name", # helper column, dropped in the next step
    values_to = "scientific_name"
  ) %>%
  select(-col_name) %>%
  relocate(scientific_name) %>%
  # Rows without a name (e.g. an empty synonym cell) are removed
  filter(!is.na(scientific_name))
## mutate: new variable 'scientific_name_accepted' (character) with 88 unique values and 0% NA
## pivot_longer: reorganized (scientific_name_synonym) into (col_name) [was 88x9, now 176x9]
## select: dropped one variable (col_name)
## relocate: columns reordered (scientific_name, english_name, dutch_name, french_name, german_name, …)
## filter: removed 78 rows (44%), 98 rows remaining

Preview result of this operation:

input_data %>% head()

3.2 Taxon IDs

To link taxa with information in the extension(s), each taxon needs a unique and relatively stable taxonID. Here we create one in the form of dataset_shortname:taxon:hash, where hash is a unique code based on scientific name and kingdom (that will remain the same as long as scientific name and kingdom remain the same):

# Wrap digest() so it can be applied to a whole vector of values
vdigest <- Vectorize(digest)

# taxon_id = "union-list:taxon:<md5 hash>", where the hash is computed from
# the scientific name and the kingdom pasted together
input_data <- input_data %>%
  mutate(
    taxon_id = paste(
      "union-list",
      "taxon",
      vdigest(paste(scientific_name, kingdom), algo = "md5"),
      sep = ":"
    )
  )
## mutate: new variable 'taxon_id' (character) with 98 unique values and 0% NA

Join on scientific_name_accepted to get the taxon_id_accepted:

# Look up, for every row, the taxon_id that belongs to its accepted name.
# The lookup table is input_data itself, reduced to name and ID; the empty
# suffix keeps taxon_id unchanged and names the joined column
# taxon_id_accepted.
input_data <- input_data %>%
  left_join(
    input_data %>% select("scientific_name", "taxon_id"),
    by = c("scientific_name_accepted" = "scientific_name"),
    suffix = c("", "_accepted")
  )
## select: dropped 7 variables (english_name, dutch_name, french_name, german_name, entry_into_force, …)
## left_join: added one column (taxon_id_accepted)
##            > rows only in x    0
##            > rows only in y  (10)
##            > matched rows     98
##            >                 ====
##            > rows total       98

3.3 Preview data

Show the number of taxa and rows per kingdom:

# Per kingdom: number of distinct taxon IDs and number of rows,
# followed by a totals row
input_data %>%
  group_by(kingdom) %>%
  summarize(`# taxa` = n_distinct(taxon_id), `# rows` = n()) %>%
  adorn_totals(where = "row")
## group_by: one grouping variable (kingdom)
## summarize: now 3 rows and 3 columns, ungrouped

Preview data:

input_data %>% head()

4 Taxon core

Create a taxon dataframe starting from input_data:

taxon <- input_data

4.1 Term mapping

Map the data to Darwin Core Taxon.

Start with record-level terms which contain metadata about the dataset (which is generally the same for all records).

4.1.1 language

taxon %<>% mutate(dwc_language = "en")
## mutate: new variable 'dwc_language' (character) with one unique value and 0% NA

4.1.2 license

taxon %<>% mutate(dwc_license = "http://creativecommons.org/publicdomain/zero/1.0/")
## mutate: new variable 'dwc_license' (character) with one unique value and 0% NA

4.1.3 rightsHolder

taxon %<>% mutate(dwc_rightsHolder = "INBO")
## mutate: new variable 'dwc_rightsHolder' (character) with one unique value and 0% NA

4.1.4 institutionCode

taxon %<>% mutate(dwc_institutionCode = "INBO")
## mutate: new variable 'dwc_institutionCode' (character) with one unique value and 0% NA

4.1.5 datasetID

taxon %<>% mutate(dwc_datasetID = "https://doi.org/10.15468/97aucj")
## mutate: new variable 'dwc_datasetID' (character) with one unique value and 0% NA

4.1.6 datasetName

taxon %<>% mutate(dwc_datasetName = "List of Invasive Alien Species of Union concern")
## mutate: new variable 'dwc_datasetName' (character) with one unique value and 0% NA

The following terms contain information about the taxon:

4.1.7 taxonID

taxon %<>% mutate(dwc_taxonID = taxon_id)
## mutate: new variable 'dwc_taxonID' (character) with 98 unique values and 0% NA

4.1.8 acceptedNameUsageID

taxon %<>% mutate(dwc_acceptedNameUsageID = taxon_id_accepted)
## mutate: new variable 'dwc_acceptedNameUsageID' (character) with 88 unique values and 0% NA

4.1.9 scientificName

taxon %<>% mutate(dwc_scientificName = scientific_name)
## mutate: new variable 'dwc_scientificName' (character) with 98 unique values and 0% NA

4.1.10 acceptedNameUsage

taxon %<>% mutate(dwc_acceptedNameUsage = scientific_name_accepted)
## mutate: new variable 'dwc_acceptedNameUsage' (character) with 88 unique values and 0% NA

4.1.11 kingdom

Inspect values:

# Number of rows for each kingdom value
count(group_by(taxon, kingdom))
## group_by: one grouping variable (kingdom)
## count: now 3 rows and 2 columns, one group variable remaining (kingdom)

Map values:

taxon %<>% mutate(dwc_kingdom = kingdom)
## mutate: new variable 'dwc_kingdom' (character) with 3 unique values and 0% NA

4.1.12 taxonRank

Map values by recoding to the GBIF rank vocabulary:

taxon <- taxon %>%
  mutate(
    dwc_taxonRank = case_when(
      # Infraspecific names are listed explicitly. They have to come before
      # the generic pattern at the end, which they would otherwise match too.
      scientific_name %in% c(
        "Lespedeza juncea var. sericea (Thunb.) Lace & Hauech",
        "Pueraria montana (Lour.) Merr. var. lobata (Willd.)"
      ) ~ "variety",
      scientific_name == "Procambarus fallax (Hagen, 1870) f. virginalis" ~ "form",
      scientific_name == "Vespa velutina nigrithorax de Buysson, 1905" ~ "subspecies",
      # Scientific names often end with authors, so the regex only considers
      # the beginning: a capitalised word followed by a lowercase word
      str_detect(scientific_name, "^[A-Z][a-z]+ [a-z]+") ~ "species"
    )
  )
## mutate: new variable 'dwc_taxonRank' (character) with 4 unique values and 0% NA

All taxa should have a rank:

# TRUE when no row is left without a rank
nrow(filter(taxon, is.na(dwc_taxonRank))) == 0
## filter: removed all rows (100%)
## [1] TRUE

Show mapped values:

taxon %>% select(dwc_scientificName, dwc_taxonRank)
## select: dropped 20 variables (scientific_name, english_name, dutch_name, french_name, german_name, …)

4.1.13 taxonomicStatus

# A name is "accepted" when it points to itself as accepted name; anything
# else (including a comparison that yields NA) is a "synonym"
taxon <- taxon %>%
  mutate(
    dwc_taxonomicStatus = if_else(
      taxon_id == taxon_id_accepted,
      true = "accepted",
      false = "synonym",
      missing = "synonym"
    )
  )
## mutate: new variable 'dwc_taxonomicStatus' (character) with 2 unique values and 0% NA

4.2 Post-processing

Only keep the Darwin Core columns:

taxon %<>% select(starts_with("dwc_"))
## select: dropped 10 variables (scientific_name, english_name, dutch_name, french_name, german_name, …)

Drop the dwc_ prefix:

# Strip the helper prefix from the column names. The pattern is anchored so
# that only a leading "dwc_" is removed, never a "dwc_" that occurs further
# on in a column name.
colnames(taxon) <- str_remove(colnames(taxon), "^dwc_")

Preview data:

taxon %>% head()

Save to CSV:

write_csv(taxon, here("data", "processed", "taxon.csv"), na = "")

5 Map vernacular names extension

5.1 Pre-processing

Create a vernacular dataframe starting from input_data:

vernacular <- input_data

5.2 Tidy data

# One row per taxon and language: the four name columns are stacked into
# vernacular_name, and the originating column name is translated into a
# two-letter language code (NA for any column name not listed).
vernacular <- vernacular %>%
  pivot_longer(
    cols = c("english_name", "dutch_name", "french_name", "german_name"),
    names_to = "language",
    values_to = "vernacular_name"
  ) %>%
  mutate(
    language = unname(
      c(
        english_name = "en",
        dutch_name = "nl",
        french_name = "fr",
        german_name = "de"
      )[language]
    )
  )
## pivot_longer: reorganized (english_name, dutch_name, french_name, german_name) into (language, vernacular_name) [was 98x10, now 392x8]
## mutate: changed 392 values (100%) of 'language' (0 new NA)

Preview:

vernacular %>% head(10)

Some taxa have two vernacular names in the same language, separated by a semicolon. Show these rows, then split them so that each name gets its own row:

# Rows whose vernacular name contains a ";", with name and language up front
vernacular %>%
  filter(str_detect(vernacular_name, ";")) %>%
  relocate(vernacular_name, language)
## filter: removed 386 rows (98%), 6 rows remaining
## relocate: columns reordered (vernacular_name, language, scientific_name, entry_into_force, kingdom, …)
# Give every vernacular name its own row: names sharing a cell are separated
# by ";". After splitting, surrounding whitespace is trimmed and rows without
# a name are dropped, so that no record with an empty vernacularName ends up
# in the vernacular names extension.
vernacular <- vernacular %>%
  separate_longer_delim(vernacular_name, delim = ";") %>%
  mutate(vernacular_name = str_trim(vernacular_name)) %>%
  filter(!is.na(vernacular_name), vernacular_name != "")

Preview:

# Spot-check a few names (presumably ones that shared a cell before the
# split): list the distinct name/language combinations found
vernacular %>%
  filter(
    vernacular_name %in%
      c("gewone gunnera", "reuzenrabarber", "Salvinia moss", "kariba weed")
  ) %>%
  distinct(vernacular_name, language)
## filter: removed 392 rows (98%), 6 rows remaining
## distinct: removed 2 rows (33%), 4 rows remaining

5.3 Term mapping

Map the data to Vernacular Names.

5.3.1 taxonID

vernacular %<>% mutate(dwc_taxonID = taxon_id)
## mutate: new variable 'dwc_taxonID' (character) with 98 unique values and 0% NA

5.3.2 vernacularName

vernacular %<>% mutate(dwc_vernacularName = vernacular_name)
## mutate: new variable 'dwc_vernacularName' (character) with 354 unique values and <1% NA

5.3.3 language

vernacular %<>% mutate(dwc_language = language)
## mutate: new variable 'dwc_language' (character) with 4 unique values and 0% NA

5.4 Post-processing

Only keep the Darwin Core columns:

vernacular %<>% select(starts_with("dwc_"))
## select: dropped 8 variables (scientific_name, entry_into_force, kingdom, scientific_name_accepted, taxon_id, …)

Drop the dwc_ prefix:

# Strip the helper prefix from the column names. The pattern is anchored so
# that only a leading "dwc_" is removed, never a "dwc_" that occurs further
# on in a column name.
colnames(vernacular) <- str_remove(colnames(vernacular), "^dwc_")

Preview data:

vernacular %>% head()

Save to CSV:

write_csv(vernacular, here("data", "processed", "vernacularname.csv"), na = "")

6 Map description extension

In the description extension we want to include the date a species has been added to the list.

6.1 Pre-processing

Create a description dataframe starting from input_data:

description <- input_data

6.2 Entry into force

The column entry_into_force contains the date each species has been added to the list. We use it to create a description column:

description %<>% mutate(description = entry_into_force)
## mutate: new variable 'description' (Date) with 8 unique values and 0% NA

Create a type field to indicate the type of description:

description %<>% mutate(type = "entry into force")
## mutate: new variable 'type' (character) with one unique value and 0% NA

6.3 Term mapping

Map the data to Taxon Description:

6.3.1 taxonID

description %<>% mutate(dwc_taxonID = taxon_id)
## mutate: new variable 'dwc_taxonID' (character) with 98 unique values and 0% NA

6.3.2 description

description %<>% mutate(dwc_description = description)
## mutate: new variable 'dwc_description' (Date) with 8 unique values and 0% NA

6.3.3 type

description %<>% mutate(dwc_type = type)
## mutate: new variable 'dwc_type' (character) with one unique value and 0% NA

6.3.4 language

description %<>% mutate(dwc_language = "en")
## mutate: new variable 'dwc_language' (character) with one unique value and 0% NA

6.4 Post-processing

Only keep the Darwin Core columns:

description %<>% select(starts_with("dwc_"))
## select: dropped 12 variables (scientific_name, english_name, dutch_name, french_name, german_name, …)

Drop the dwc_ prefix:

# Strip the helper prefix from the column names. The pattern is anchored so
# that only a leading "dwc_" is removed, never a "dwc_" that occurs further
# on in a column name.
colnames(description) <- str_remove(colnames(description), "^dwc_")

Preview description:

description %>% head()

Save to CSV:

write_csv(description, here("data", "processed", "description.csv"), na = "")