1 Setup

Install required libraries (only if the libraries have not been installed before):

# Packages this script depends on
required <- c(
  "tidyverse", "tidylog", "magrittr", "here", "janitor", "digest", "stringr"
)
# Packages already available in the local library
installed <- rownames(installed.packages())
# Install only the packages that are still missing
if (any(!required %in% installed)) {
  install.packages(setdiff(required, installed))
}

Load libraries:

library(tidyverse)      # To do data science
library(tidylog)        # To provide feedback on dplyr functions
library(magrittr)       # To use %<>% pipes
library(here)           # To find files
library(janitor)        # To clean input data
library(digest)         # To generate hashes
library(stringr)        # To work with strings

2 Read source data

Create a data frame input_data from the source data:

# Read the tab-separated source file without printing the column specification
input_data <-
  here("data", "raw", "input_taxa.tsv") %>%
  read_tsv(show_col_types = FALSE)

Preview data:

head(input_data, n = 5)

3 Process source data

3.1 Improve matching to GBIF Backbone

Replace Hydrocotyle ranunculoides with Hydrocotyle ranunculoides L.fil. to improve the match to GBIF Backbone (see issue #17):

# Append the authorship to the Hydrocotyle name; all other names are kept as is
input_data %<>% mutate(
  scientific_name = ifelse(
    scientific_name == "Hydrocotyle ranunculoides",
    "Hydrocotyle ranunculoides L.fil.",
    scientific_name
  )
)
## mutate: changed one value (3%) of 'scientific_name' (0 new NA)

For the same reason, we add the authorship to Zizania latifolia (for more information, see issue #4718 in the GBIF portal feedback repository on GitHub):

# Append the authorship to the Zizania name; all other names are kept as is
input_data %<>% mutate(
  scientific_name = ifelse(
    scientific_name == "Zizania latifolia",
    "Zizania latifolia (Griseb.) Hance ex F.Muell.",
    scientific_name
  )
)
## mutate: changed one value (3%) of 'scientific_name' (0 new NA)

3.2 Tidy data

We reshape the vernacular names columns by creating two columns: vernacular_name and language:

# One row per taxon and language: the suffix of each vernacular_name_* column
# becomes the language code, its content the vernacular name
input_data <-
  input_data %>%
  pivot_longer(
    cols = starts_with("vernacular"),
    names_to = "language",
    names_prefix = "vernacular_name_",
    values_to = "vernacular_name"
  )
## pivot_longer: reorganized (vernacular_name_en, vernacular_name_nl, vernacular_name_fr) into (language, vernacular_name) [was 30x11, now 90x10]

Preview:

head(input_data, n = 10)

Some taxa have multiple vernacular names divided by " / ":

# Rows whose vernacular name holds several names separated by " / "
input_data %>%
  filter(str_detect(vernacular_name, " / "))
## filter: removed 86 rows (96%), 4 rows remaining

We split them in separate rows:

# Put every vernacular name on its own row
input_data <- separate_rows(input_data, vernacular_name, sep = " / ")

The input data are now tidy:

# Print the tidied data frame for inspection
input_data

3.3 Taxon IDs

To link taxa with information in the extension(s), each taxon needs a unique and relatively stable taxonID. Here we create one in the form of dataset_shortname:taxon:hash, where hash is a unique code based on the scientific name and kingdom (it remains the same as long as the scientific name and kingdom remain the same):

# Wrap digest() so that it can be applied to every element of a vector
vdigest <- Vectorize(digest)
# taxon_id = fixed prefix + md5 hash of "<scientific name> <kingdom>"
input_data <-
  input_data %>%
  mutate(
    taxon_id = paste0(
      "riparias-target-list:taxon:",
      vdigest(paste(scientific_name, kingdom), algo = "md5")
    )
  )
## mutate: new variable 'taxon_id' (character) with 30 unique values and 0% NA

3.4 Preview data

Show the number of taxa and rows (vernacular names) per kingdom:

# Per kingdom: number of distinct taxa and number of rows, plus a totals row
input_data %>%
  group_by(kingdom) %>%
  summarize(`# taxa` = n_distinct(taxon_id), `# rows` = n()) %>%
  adorn_totals(where = "row")
## group_by: one grouping variable (kingdom)
## summarize: now 2 rows and 3 columns, ungrouped

Preview data:

head(input_data)

4 Taxon core

4.1 Pre-processing

Create a data frame with unique taxa only (ignoring the multiple vernacular name rows per taxon):

# Keep the first row of every taxon_id, with all its columns
taxon <- distinct(input_data, taxon_id, .keep_all = TRUE)
## distinct: removed 64 rows (68%), 30 rows remaining

4.2 Term mapping

Map the data to Darwin Core Taxon.

Start with record-level terms which contain metadata about the dataset (which is generally the same for all records).

4.2.1 language

taxon <- mutate(taxon, dwc_language = "en")
## mutate: new variable 'dwc_language' (character) with one unique value and 0% NA

4.2.2 license

taxon <- mutate(
  taxon,
  dwc_license = "https://creativecommons.org/publicdomain/zero/1.0/"
)
## mutate: new variable 'dwc_license' (character) with one unique value and 0% NA

4.2.3 rightsHolder

taxon <- mutate(taxon, dwc_rightsHolder = "INBO")
## mutate: new variable 'dwc_rightsHolder' (character) with one unique value and 0% NA

4.2.4 datasetID

taxon <- mutate(taxon, dwc_datasetID = "https://doi.org/10.15468/p4ugqr")
## mutate: new variable 'dwc_datasetID' (character) with one unique value and 0% NA

4.2.5 institutionCode

taxon <- mutate(taxon, dwc_institutionCode = "INBO")
## mutate: new variable 'dwc_institutionCode' (character) with one unique value and 0% NA

4.2.6 datasetName

taxon <- mutate(
  taxon,
  dwc_datasetName = "Checklist of LIFE RIPARIAS target species"
)
## mutate: new variable 'dwc_datasetName' (character) with one unique value and 0% NA

The following terms contain information about the taxon:

4.2.7 taxonID

taxon <- mutate(taxon, dwc_taxonID = taxon_id)
## mutate: new variable 'dwc_taxonID' (character) with 30 unique values and 0% NA

4.2.8 scientificName

taxon <- mutate(taxon, dwc_scientificName = scientific_name)
## mutate: new variable 'dwc_scientificName' (character) with 30 unique values and 0% NA

4.2.9 kingdom

Inspect values:

# Count the number of taxa per kingdom to see which values occur
taxon %>%
  group_by(kingdom) %>%
  count()
## group_by: one grouping variable (kingdom)
## count: now 2 rows and 2 columns, one group variable remaining (kingdom)

Map values:

taxon <- mutate(taxon, dwc_kingdom = kingdom)
## mutate: new variable 'dwc_kingdom' (character) with 2 unique values and 0% NA

4.2.10 taxonRank

Map values by recoding to the GBIF rank vocabulary:

# Assign the rank "species" to every name that is either a bare binomial or
# one of the names that received an authorship in section 3.1. Names matching
# neither rule get NA and are listed by the "unmapped values" check below.
taxon %<>% mutate(dwc_taxonRank = case_when(
  # Binomial without authorship: capitalised genus + lower-case epithet
  str_detect(dwc_scientificName, "^[A-Z][a-z]+ [a-z]+$") ~ "species",
  # More specific mappings can be added here
  # Names with authorship are compared literally: used as regular expressions
  # their "." would match any character and the unanchored pattern would also
  # match longer names (e.g. infraspecific taxa)
  dwc_scientificName %in% c(
    "Hydrocotyle ranunculoides L.fil.",
    "Zizania latifolia (Griseb.) Hance ex F.Muell."
  ) ~ "species"
))
## mutate: new variable 'dwc_taxonRank' (character) with one unique value and 0% NA

Show unmapped values:

# Names that did not get a rank assigned
taxon %>%
  filter(is.na(.data$dwc_taxonRank)) %>%
  select("dwc_scientificName")
## filter: removed all rows (100%)
## select: dropped 20 variables (scientific_name, gbif_key, kingdom, is_project_species, is_alert_species, …)

4.3 Post-processing

Only keep the Darwin Core columns:

taxon <- select(taxon, starts_with("dwc_"))
## select: dropped 11 variables (scientific_name, gbif_key, kingdom, is_project_species, is_alert_species, …)

Drop the dwc_ prefix:

# Remove the first literal "dwc_" from every column name
names(taxon) <- sub("dwc_", "", names(taxon), fixed = TRUE)

Preview data:

head(taxon)

Save to CSV:

# Write the taxon core; missing values become empty fields
write_csv(
  x = taxon,
  file = here("data", "processed", "taxon.csv"),
  na = ""
)

5 Map vernacular names extension

5.1 Pre-processing

Create a vernacular data.frame from input_data containing only vernacular name information and corresponding taxon IDs:

# Keep only the taxon identifier and the vernacular name information
vernacular <- select(input_data, taxon_id, language, vernacular_name)
## select: dropped 8 variables (scientific_name, gbif_key, kingdom, is_project_species, is_alert_species, …)
# Print the vernacular names data frame for inspection
vernacular

Remove rows with missing vernacular name:

vernacular <- filter(vernacular, !is.na(vernacular_name))
## filter: no rows removed

5.2 Term mapping

Map the data to Vernacular Names.

5.2.1 taxonID

vernacular <- mutate(vernacular, dwc_taxonID = taxon_id)
## mutate: new variable 'dwc_taxonID' (character) with 30 unique values and 0% NA

5.2.2 vernacularName

vernacular <- mutate(vernacular, dwc_vernacularName = vernacular_name)
## mutate: new variable 'dwc_vernacularName' (character) with 94 unique values and 0% NA

5.2.3 language

vernacular <- mutate(vernacular, dwc_language = language)
## mutate: new variable 'dwc_language' (character) with 3 unique values and 0% NA

5.3 Post-processing

Only keep the Darwin Core columns:

vernacular <- select(vernacular, starts_with("dwc_"))
## select: dropped 3 variables (taxon_id, language, vernacular_name)

Drop the dwc_ prefix:

# Remove the first literal "dwc_" from every column name
names(vernacular) <- sub("dwc_", "", names(vernacular), fixed = TRUE)

Preview data:

head(vernacular)

Save to CSV:

# Write the vernacular names extension; missing values become empty fields
write_csv(
  x = vernacular,
  file = here("data", "processed", "vernacularname.csv"),
  na = ""
)