Install required libraries (only if the libraries have not been installed before):
# Packages this script depends on
required <- c(
  "tidyverse",
  "tidylog",
  "magrittr",
  "here",
  "janitor",
  "digest"
)
# Names of the packages that are already available
installed <- rownames(installed.packages())
# Install only what is still missing, leaving existing installations untouched
missing_packages <- setdiff(required, installed)
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
Load libraries:
library(tidyverse) # To do data science
library(tidylog) # To provide feedback on dplyr functions
library(magrittr) # To use %<>% pipes
library(here) # To find files
library(janitor) # To clean input data
library(digest) # To generate hashes
Create a data frame `input_data` from the source data:
# Read the tab-separated source file; entry_into_force is parsed as a date
# written as day, month name, year, using English month names
input_data <-
  here("data", "raw", "list_of_invasive_alien_species_of_union_concern.tsv") %>%
  read_tsv(
    col_types = cols(entry_into_force = col_date(format = "%d %B %Y")),
    locale = locale(date_names = "en"),
    show_col_types = FALSE
  )
Preview data:
head(input_data, n = 5)
The column `scientific_name_synonym` contains synonym names. We’ll create new
rows for the values in this column, keeping their association with the
original scientific name (`scientific_name_accepted`):
input_data <- input_data %>%
  # Remember the accepted name before accepted names and synonyms are stacked
  mutate(scientific_name_accepted = scientific_name) %>%
  # One row per name: the accepted name and, where present, its synonym
  pivot_longer(
    cols = c(scientific_name, scientific_name_synonym),
    names_to = "col_name", # help column
    values_to = "scientific_name"
  ) %>%
  select(-col_name) %>%
  relocate(scientific_name) %>%
  # Taxa without a synonym produce an empty row: drop those
  filter(!is.na(scientific_name))
## mutate: new variable 'scientific_name_accepted' (character) with 88 unique values and 0% NA
## pivot_longer: reorganized (scientific_name_synonym) into (col_name) [was 88x9, now 176x9]
## select: dropped one variable (col_name)
## relocate: columns reordered (scientific_name, english_name, dutch_name, french_name, german_name, …)
## filter: removed 78 rows (44%), 98 rows remaining
Preview result of this operation:
head(input_data)
To link taxa with information in the extension(s), each taxon needs a
unique and relatively stable `taxonID`. Here we create one in the form of
`dataset_shortname:taxon:hash`, where `hash` is a unique code based on
scientific name and kingdom (that will remain the same as long as scientific
name and kingdom remain the same):
# digest() hashes a single object; wrap it so it is applied element by element
vdigest <- Vectorize(digest) # Vectorize digest function to work with vectors
# taxon_id = "union-list:taxon:<md5 of 'scientific_name kingdom'>"
input_data <- mutate(
  input_data,
  taxon_id = paste(
    "union-list",
    "taxon",
    vdigest(paste(scientific_name, kingdom), algo = "md5"),
    sep = ":"
  )
)
## mutate: new variable 'taxon_id' (character) with 98 unique values and 0% NA
Join on `scientific_name_accepted` to get the `taxon_id_accepted`:
# Look up the taxon_id of the accepted name for every row (accepted names match
# themselves). Each row may match at most one row of the lookup table:
# relationship = "many-to-one" makes the join fail loudly, instead of silently
# duplicating rows, should the same scientific name ever occur twice.
input_data %<>%
  left_join(
    select(input_data, "scientific_name", "taxon_id"),
    by = c("scientific_name_accepted" = "scientific_name"),
    suffix = c("", "_accepted"),
    relationship = "many-to-one"
  )
## select: dropped 7 variables (english_name, dutch_name, french_name, german_name, entry_into_force, …)
## left_join: added one column (taxon_id_accepted)
## > rows only in x 0
## > rows only in y (10)
## > matched rows 98
## > ====
## > rows total 98
Show the number of taxa and rows per kingdom:
# Distinct taxa and number of rows per kingdom, with a totals row appended
input_data %>%
  group_by(kingdom) %>%
  summarize(`# taxa` = n_distinct(taxon_id), `# rows` = n()) %>%
  adorn_totals(where = "row")
## group_by: one grouping variable (kingdom)
## summarize: now 3 rows and 3 columns, ungrouped
Preview data:
head(input_data)
Create a `taxon` dataframe starting from `input_data`:
# Copy input_data as the starting point for the Darwin Core Taxon mapping
taxon <- input_data
Map the data to Darwin Core Taxon.
Start with record-level terms which contain metadata about the dataset (which is generally the same for all records).
# Record-level terms: the same constant value for every record
taxon <- mutate(taxon, dwc_language = "en")
## mutate: new variable 'dwc_language' (character) with one unique value and 0% NA
taxon <- mutate(taxon, dwc_license = "http://creativecommons.org/publicdomain/zero/1.0/")
## mutate: new variable 'dwc_license' (character) with one unique value and 0% NA
taxon <- mutate(taxon, dwc_rightsHolder = "INBO")
## mutate: new variable 'dwc_rightsHolder' (character) with one unique value and 0% NA
taxon <- mutate(taxon, dwc_institutionCode = "INBO")
## mutate: new variable 'dwc_institutionCode' (character) with one unique value and 0% NA
taxon <- mutate(taxon, dwc_datasetID = "https://doi.org/10.15468/97aucj")
## mutate: new variable 'dwc_datasetID' (character) with one unique value and 0% NA
taxon <- mutate(taxon, dwc_datasetName = "List of Invasive Alien Species of Union concern")
## mutate: new variable 'dwc_datasetName' (character) with one unique value and 0% NA
The following terms contain information about the taxon:
# Identifiers and names: each term is a straight copy of an input column
taxon <- mutate(taxon, dwc_taxonID = taxon_id)
## mutate: new variable 'dwc_taxonID' (character) with 98 unique values and 0% NA
taxon <- mutate(taxon, dwc_acceptedNameUsageID = taxon_id_accepted)
## mutate: new variable 'dwc_acceptedNameUsageID' (character) with 88 unique values and 0% NA
taxon <- mutate(taxon, dwc_scientificName = scientific_name)
## mutate: new variable 'dwc_scientificName' (character) with 98 unique values and 0% NA
taxon <- mutate(taxon, dwc_acceptedNameUsage = scientific_name_accepted)
## mutate: new variable 'dwc_acceptedNameUsage' (character) with 88 unique values and 0% NA
Inspect values:
# Number of rows for each kingdom value
count(group_by(taxon, kingdom))
## group_by: one grouping variable (kingdom)
## count: now 3 rows and 2 columns, one group variable remaining (kingdom)
Map values:
# Kingdom values are used as they are
taxon <- mutate(taxon, dwc_kingdom = kingdom)
## mutate: new variable 'dwc_kingdom' (character) with 3 unique values and 0% NA
Map values by recoding to the GBIF rank vocabulary:
taxon <- taxon %>%
  mutate(dwc_taxonRank = case_when(
    # Infraspecific names are listed explicitly and matched on the full name
    scientific_name %in% c(
      "Lespedeza juncea var. sericea (Thunb.) Lace & Hauech",
      "Pueraria montana (Lour.) Merr. var. lobata (Willd.)"
    ) ~ "variety",
    scientific_name == "Procambarus fallax (Hagen, 1870) f. virginalis" ~ "form",
    scientific_name == "Vespa velutina nigrithorax de Buysson, 1905" ~ "subspecies",
    # Scientific names often end with authors, so regex only considers beginning
    str_detect(scientific_name, "^[A-Z][a-z]+ [a-z]+") ~ "species"
  ))
## mutate: new variable 'dwc_taxonRank' (character) with 4 unique values and 0% NA
All taxa should have a rank:
# TRUE when no row is left without a rank
nrow(filter(taxon, is.na(dwc_taxonRank))) == 0
## filter: removed all rows (100%)
## [1] TRUE
Show mapped values:
select(taxon, dwc_scientificName, dwc_taxonRank)
## select: dropped 20 variables (scientific_name, english_name, dutch_name, french_name, german_name, …)
# A name is accepted when it points to itself; every other name is a synonym
taxon <- taxon %>%
  mutate(dwc_taxonomicStatus = case_when(
    taxon_id == taxon_id_accepted ~ "accepted",
    .default = "synonym"
  ))
## mutate: new variable 'dwc_taxonomicStatus' (character) with 2 unique values and 0% NA
Only keep the Darwin Core columns:
# Everything that is not a mapped Darwin Core term is dropped
taxon <- select(taxon, starts_with("dwc_"))
## select: dropped 10 variables (scientific_name, english_name, dutch_name, french_name, german_name, …)
Drop the `dwc_` prefix:
# Strip the helper prefix so the column names are the plain term names
names(taxon) <- str_remove(names(taxon), "dwc_")
Preview data:
head(taxon)
Save to CSV:
# Missing values are written as empty cells
taxon %>%
  write_csv(file = here("data", "processed", "taxon.csv"), na = "")
Create a `vernacular` dataframe starting from `input_data`:
# One row per taxon and language: the four name columns are stacked into
# `vernacular_name`, and `language` gets the matching two-letter code
vernacular <- input_data %>%
  pivot_longer(
    cols = c(english_name, dutch_name, french_name, german_name),
    names_to = "language",
    values_to = "vernacular_name"
  ) %>%
  mutate(
    language = case_match(
      language,
      "english_name" ~ "en",
      "dutch_name" ~ "nl",
      "french_name" ~ "fr",
      "german_name" ~ "de"
    )
  )
## pivot_longer: reorganized (english_name, dutch_name, french_name, german_name) into (language, vernacular_name) [was 98x10, now 392x8]
## mutate: changed 392 values (100%) of 'language' (0 new NA)
Preview:
head(vernacular, 10)
Some taxa have two vernacular names in the same language:
# Rows whose vernacular name holds more than one name, separated by ";".
# The column is referenced directly (data masking) rather than through
# `.$vernacular_name`, which bypasses the data frame filter() is working on.
vernacular %>%
  filter(str_detect(vernacular_name, ";")) %>%
  relocate(vernacular_name, language)
## filter: removed 386 rows (98%), 6 rows remaining
## relocate: columns reordered (vernacular_name, language, scientific_name, entry_into_force, kingdom, …)
# Give every ";"-separated name a row of its own
vernacular <- separate_longer_delim(vernacular, vernacular_name, delim = ";")
Preview:
# Names that used to share a cell should now appear as separate values
vernacular %>%
  filter(
    vernacular_name %in% c(
      "gewone gunnera",
      "reuzenrabarber",
      "Salvinia moss",
      "kariba weed"
    )
  ) %>%
  distinct(vernacular_name, language)
## filter: removed 392 rows (98%), 6 rows remaining
## distinct: removed 2 rows (33%), 4 rows remaining
Map the data to Vernacular Names.
# Each term is a straight copy of the matching input column
vernacular %<>% mutate(dwc_taxonID = taxon_id)
## mutate: new variable 'dwc_taxonID' (character) with 98 unique values and 0% NA
vernacular %<>% mutate(dwc_vernacularName = vernacular_name)
## mutate: new variable 'dwc_vernacularName' (character) with 354 unique values and <1% NA
vernacular %<>% mutate(dwc_language = language)
## mutate: new variable 'dwc_language' (character) with 4 unique values and 0% NA
# The log above shows that some vernacular names are NA (presumably taxa that
# lack a name in one of the languages). A vernacular name record without a name
# carries no information and would be written as an empty value, so drop them.
vernacular %<>% filter(!is.na(dwc_vernacularName))
Only keep the Darwin Core columns:
# Everything that is not a mapped Darwin Core term is dropped
vernacular <- select(vernacular, starts_with("dwc_"))
## select: dropped 8 variables (scientific_name, entry_into_force, kingdom, scientific_name_accepted, taxon_id, …)
Drop the `dwc_` prefix:
# Strip the helper prefix so the column names are the plain term names
names(vernacular) <- str_remove(names(vernacular), "dwc_")
Preview data:
head(vernacular)
Save to CSV:
# Missing values are written as empty cells
vernacular %>%
  write_csv(file = here("data", "processed", "vernacularname.csv"), na = "")
In the description extension we want to include the date a species has been added to the list.
Create a `description` dataframe starting from `input_data`:
# Copy input_data as the starting point for the description extension
description <- input_data
The column `entry_into_force` contains the date each species has been added
to the list. We use it to create a `description` column:
# The description value is the entry-into-force date itself
description <- mutate(description, description = entry_into_force)
## mutate: new variable 'description' (Date) with 8 unique values and 0% NA
Create a `type` field to indicate the type of description:
# All descriptions in this extension are of the same type
description <- mutate(description, type = "entry into force")
## mutate: new variable 'type' (character) with one unique value and 0% NA
Map the data to Taxon Description:
# Each term is a straight copy of the matching column; language is constant
description <- mutate(description, dwc_taxonID = taxon_id)
## mutate: new variable 'dwc_taxonID' (character) with 98 unique values and 0% NA
description <- mutate(description, dwc_description = description)
## mutate: new variable 'dwc_description' (Date) with 8 unique values and 0% NA
description <- mutate(description, dwc_type = type)
## mutate: new variable 'dwc_type' (character) with one unique value and 0% NA
description <- mutate(description, dwc_language = "en")
## mutate: new variable 'dwc_language' (character) with one unique value and 0% NA
Only keep the Darwin Core columns:
# Everything that is not a mapped Darwin Core term is dropped
description <- select(description, starts_with("dwc_"))
## select: dropped 12 variables (scientific_name, english_name, dutch_name, french_name, german_name, …)
Drop the `dwc_` prefix:
# Strip the helper prefix so the column names are the plain term names
names(description) <- str_remove(names(description), "dwc_")
Preview description:
head(description)
Save to CSV:
# Missing values are written as empty cells
description %>%
  write_csv(file = here("data", "processed", "description.csv"), na = "")