Install required libraries (only if the libraries have not been installed before):
# Packages this script depends on
required <- c(
  "tidyverse",
  "tidylog",
  "magrittr",
  "here",
  "janitor",
  "digest",
  "stringr"
)
# Packages already present in the local library
installed <- rownames(installed.packages())
# Install only the packages that are still missing
if (length(setdiff(required, installed)) > 0) {
  install.packages(setdiff(required, installed))
}
Load libraries:
library(tidyverse) # To do data science
library(tidylog) # To provide feedback on dplyr functions
library(magrittr) # To use %<>% pipes
library(here) # To find files
library(janitor) # To clean input data
library(digest) # To generate hashes
library(stringr) # To work with strings
Create a data frame input_data
from the source data:
# Read the raw tab-separated taxa list; column type messages are silenced
input_data <-
  here("data", "raw", "input_taxa.tsv") %>%
  read_tsv(show_col_types = FALSE)
Preview data:
# Show the first five rows
head(input_data, n = 5)
Replace Hydrocotyle ranunculoides with Hydrocotyle ranunculoides L.fil. to improve the match to GBIF Backbone (see issue #17):
# Append the authorship to Hydrocotyle ranunculoides; every other name is
# left untouched
input_data %<>%
  mutate(
    scientific_name = if_else(
      .data$scientific_name == "Hydrocotyle ranunculoides",
      "Hydrocotyle ranunculoides L.fil.",
      .data$scientific_name
    )
  )
## mutate: changed one value (3%) of 'scientific_name' (0 new NA)
For the same reason, we add the authorship to Zizania latifolia (for more info, see issue #4718 in GBIF’s portal feedback repository on GitHub):
# Append the authorship to Zizania latifolia; every other name is left
# untouched
input_data %<>%
  mutate(
    scientific_name = if_else(
      .data$scientific_name == "Zizania latifolia",
      "Zizania latifolia (Griseb.) Hance ex F.Muell.",
      .data$scientific_name
    )
  )
## mutate: changed one value (3%) of 'scientific_name' (0 new NA)
We reshape the vernacular names columns by creating two columns:
vernacular_name
and language
:
# Go from one vernacular name column per language to one row per language:
# the column suffix (after "vernacular_name_") becomes `language`
input_data <-
  input_data %>%
  pivot_longer(
    cols = starts_with("vernacular"),
    names_prefix = "vernacular_name_",
    names_to = "language",
    values_to = "vernacular_name"
  )
## pivot_longer: reorganized (vernacular_name_en, vernacular_name_nl, vernacular_name_fr) into (language, vernacular_name) [was 30x11, now 90x10]
Preview:
# Show the first ten rows of the reshaped data
head(input_data, 10)
Some taxa have multiple vernacular names divided by
" / "
:
# Show the rows whose vernacular name holds several names separated by " / "
filter(
  input_data,
  str_detect(.data$vernacular_name, pattern = " / ")
)
## filter: removed 86 rows (96%), 4 rows remaining
We split them in separate rows:
# Give every " / "-separated vernacular name its own row
input_data <-
  input_data %>%
  separate_rows(vernacular_name, sep = " / ")
The input data are now tidy:
# Display the data after reshaping and splitting the vernacular names
input_data
To link taxa with information in the extension(s), each taxon needs a
unique and relatively stable taxonID
. Here we create one in
the form of dataset_shortname:taxon:hash
, where
hash
is a unique code based on scientific name and kingdom
(that will remain the same as long as scientific name and kingdom remain
the same):
# Wrap digest() so it can be applied element-wise to a vector
vdigest <- Vectorize(digest)
# Build the taxon ID from a fixed dataset prefix and the md5 hash of
# "<scientific name> <kingdom>"
input_data <-
  input_data %>%
  mutate(
    taxon_id = paste(
      "riparias-target-list",
      "taxon",
      vdigest(paste(scientific_name, kingdom), algo = "md5"),
      sep = ":"
    )
  )
## mutate: new variable 'taxon_id' (character) with 30 unique values and 0% NA
Show the number of taxa and rows per kingdom:
# Per kingdom: count the distinct taxa and the rows, then append a totals row
input_data %>%
  group_by(kingdom) %>%
  summarize(`# taxa` = n_distinct(taxon_id), `# rows` = n()) %>%
  adorn_totals(where = "row")
## group_by: one grouping variable (kingdom)
## summarize: now 2 rows and 3 columns, ungrouped
Preview data:
# Show the first rows, now including taxon_id
head(input_data)
Create a dataframe with unique taxa only (ignoring any additional rows per taxon, e.g. for multiple vernacular names):
# Keep the first row for each taxon_id, with all of its columns
taxon <- distinct(input_data, taxon_id, .keep_all = TRUE)
## distinct: removed 64 rows (68%), 30 rows remaining
Map the data to Darwin Core Taxon.
Start with record-level terms which contain metadata about the dataset (which is generally the same for all records).
# Record-level terms: each one is a constant that applies to every row
taxon <- mutate(taxon, dwc_language = "en")
## mutate: new variable 'dwc_language' (character) with one unique value and 0% NA
taxon <- mutate(
  taxon,
  dwc_license = "https://creativecommons.org/publicdomain/zero/1.0/"
)
## mutate: new variable 'dwc_license' (character) with one unique value and 0% NA
taxon <- mutate(taxon, dwc_rightsHolder = "INBO")
## mutate: new variable 'dwc_rightsHolder' (character) with one unique value and 0% NA
taxon <- mutate(taxon, dwc_datasetID = "https://doi.org/10.15468/p4ugqr")
## mutate: new variable 'dwc_datasetID' (character) with one unique value and 0% NA
taxon <- mutate(taxon, dwc_institutionCode = "INBO")
## mutate: new variable 'dwc_institutionCode' (character) with one unique value and 0% NA
taxon <- mutate(
  taxon,
  dwc_datasetName = "Checklist of LIFE RIPARIAS target species"
)
## mutate: new variable 'dwc_datasetName' (character) with one unique value and 0% NA
The following terms contain information about the taxon:
# Copy the generated identifier into the Darwin Core taxonID term
taxon <- mutate(taxon, dwc_taxonID = taxon_id)
## mutate: new variable 'dwc_taxonID' (character) with 30 unique values and 0% NA
# Copy the scientific name into the Darwin Core scientificName term
taxon <- mutate(taxon, dwc_scientificName = scientific_name)
## mutate: new variable 'dwc_scientificName' (character) with 30 unique values and 0% NA
Inspect values:
# Number of taxa per kingdom value
group_by(taxon, kingdom) %>% count()
## group_by: one grouping variable (kingdom)
## count: now 2 rows and 2 columns, one group variable remaining (kingdom)
Map values:
# The kingdom values are used as they are
taxon <- mutate(taxon, dwc_kingdom = kingdom)
## mutate: new variable 'dwc_kingdom' (character) with 2 unique values and 0% NA
Map values by recoding to the GBIF rank vocabulary:
# Derive the rank from the scientific name. Names that match no rule get NA,
# so they can be inspected afterwards.
taxon %<>% mutate(dwc_taxonRank = case_when(
  # Plain binomial: capitalised genus followed by a lowercase epithet
  str_detect(dwc_scientificName, "^[A-Z][a-z]+ [a-z]+$") ~ "species",
  # More specific mappings can be added here.
  # Names with authorship are compared literally: as a regex, the "." in
  # "L.fil." or "F.Muell." would match any character, and an unanchored
  # pattern would also match longer names that merely contain these strings.
  dwc_scientificName %in% c(
    "Hydrocotyle ranunculoides L.fil.",
    "Zizania latifolia (Griseb.) Hance ex F.Muell."
  ) ~ "species"
))
## mutate: new variable 'dwc_taxonRank' (character) with one unique value and 0% NA
Show unmapped values:
# List the scientific names that did not receive a rank
select(
  filter(taxon, is.na(dwc_taxonRank)),
  dwc_scientificName
)
## filter: removed all rows (100%)
## select: dropped 20 variables (scientific_name, gbif_key, kingdom, is_project_species, is_alert_species, …)
Only keep the Darwin Core columns:
# Drop every column whose name does not start with "dwc_"
taxon <- select(taxon, starts_with("dwc_"))
## select: dropped 11 variables (scientific_name, gbif_key, kingdom, is_project_species, is_alert_species, …)
Drop the dwc_
prefix:
# Strip the leading "dwc_" from the column names. The pattern is anchored
# with "^" so that only a true prefix is removed, never a "dwc_" that occurs
# further inside a name.
colnames(taxon) <- str_remove(colnames(taxon), "^dwc_")
Preview data:
# Show the first rows of the mapped taxon data
head(taxon)
Save to CSV:
# Write the taxon data to data/processed; missing values become empty fields
write_csv(
  x = taxon,
  file = here("data", "processed", "taxon.csv"),
  na = ""
)
Create a vernacular
data.frame from
input_data
containing only vernacular name information and
corresponding taxon IDs:
# Keep only the taxon identifier and the vernacular name information
vernacular <- select(input_data, taxon_id, language, vernacular_name)
## select: dropped 8 variables (scientific_name, gbif_key, kingdom, is_project_species, is_alert_species, …)
# Display the selected vernacular name columns
vernacular
Remove rows with missing vernacular name:
# Discard rows that have no vernacular name
vernacular <- filter(vernacular, !is.na(vernacular_name))
## filter: no rows removed
Map the data to Vernacular Names.
# Copy each source column into its Darwin Core term
vernacular <- mutate(vernacular, dwc_taxonID = taxon_id)
## mutate: new variable 'dwc_taxonID' (character) with 30 unique values and 0% NA
vernacular <- mutate(vernacular, dwc_vernacularName = vernacular_name)
## mutate: new variable 'dwc_vernacularName' (character) with 94 unique values and 0% NA
vernacular <- mutate(vernacular, dwc_language = language)
## mutate: new variable 'dwc_language' (character) with 3 unique values and 0% NA
Only keep the Darwin Core columns:
# Drop every column whose name does not start with "dwc_"
vernacular <- select(vernacular, starts_with("dwc_"))
## select: dropped 3 variables (taxon_id, language, vernacular_name)
Drop the dwc_
prefix:
# Strip the leading "dwc_" from the column names. The pattern is anchored
# with "^" so that only a true prefix is removed, never a "dwc_" that occurs
# further inside a name.
colnames(vernacular) <- str_remove(colnames(vernacular), "^dwc_")
Preview data:
# Show the first rows of the mapped vernacular names
head(vernacular)
Save to CSV:
# Write the vernacular names to data/processed; missing values become empty
# fields
write_csv(
  x = vernacular,
  file = here("data", "processed", "vernacularname.csv"),
  na = ""
)