6 min read

Data quality check

Read in data from 2009 to 2020 in long format.

dat_phenophase
## # A tibble: 4,116,485 × 16
##    site  canopy heat  heat_name water water_name block plot  species barcode
##    <chr> <chr>  <fct> <fct>     <fct> <fct>      <chr> <chr> <chr>     <dbl>
##  1 cfc   closed _     ambient   _     ambient    a     a3    abiba      3148
##  2 cfc   closed _     ambient   _     ambient    a     a3    abiba      3148
##  3 cfc   closed _     ambient   _     ambient    a     a3    abiba      3148
##  4 cfc   closed _     ambient   _     ambient    a     a3    abiba      3148
##  5 cfc   closed _     ambient   _     ambient    a     a3    abiba      3148
##  6 cfc   closed _     ambient   _     ambient    a     a3    abiba      3148
##  7 cfc   closed _     ambient   _     ambient    a     a3    abiba      3148
##  8 cfc   closed _     ambient   _     ambient    a     a3    abiba      3148
##  9 cfc   closed _     ambient   _     ambient    a     a3    abiba      3148
## 10 cfc   closed _     ambient   _     ambient    a     a3    abiba      3148
## # ℹ 4,116,475 more rows
## # ℹ 6 more variables: cohort <dbl>, year <dbl>, doy <dbl>, phenophase <fct>,
## #   status <dbl>, notes <chr>

Question: What do I do about those with non-NA notes?

set.seed(1)
dat_phenophase %>%
  filter(!is.na(notes)) %>%
  group_by(notes) %>%
  select(plot, species, barcode, year, doy, phenophase, status, notes) %>%
  sample_n(1)
## # A tibble: 2,565 × 8
## # Groups:   notes [2,565]
##    plot  species barcode  year   doy phenophase status notes                    
##    <chr> <chr>     <dbl> <dbl> <dbl> <fct>       <dbl> <chr>                    
##  1 l7    rhaca     40843  2019   108 budbreak        0 "\""                     
##  2 k1    pinst      6844  2011   252 senescence      1 "\"\"dupliate deleted ba…
##  3 l2    poptr     40316  2020   310 budbreak        0 "\"not so much scenesced…
##  4 f3    acesa      2209  2011   137 oneleaf         0 "\"row added based on lh…
##  5 f6    fraal     38220  2018   121 leafdrop        0 "\"so close\""           
##  6 j8    rhaca     39201  2019   108 mostleaf        0 "(formerly k13)"         
##  7 h5    picgl     34593  2017   306 oneleaf         0 "(missing measurement? -…
##  8 k6    acesa     39967  2019   108 mostleaf        0 "(now j2)"               
##  9 k7    betpa     40087  2019   108 mostleaf        0 "(now some l11)"         
## 10 a3    picma     31319  2018   121 budbreak        0 "(picma?)"               
## # ℹ 2,555 more rows

Multiple entries conflicting

dat_phenophase_full <- tidy_phenophase(distinct = F)
dat_phenophase_dup_group <- anti_join(dat_phenophase_full, dat_phenophase) %>%
  distinct(site, canopy, heat, heat_name, water, water_name, block, plot, species, barcode, cohort, year, doy, phenophase, notes)

dat_phenophase_dup <- dat_phenophase_full %>%
  right_join(dat_phenophase_dup_group)

dat_phenophase_dup %>%
  arrange(site, canopy, heat, heat_name, water, water_name, block, plot, species, barcode, cohort, year, doy, phenophase, notes) %>%
  select(plot, species, barcode, year, doy, phenophase, status, notes)
## # A tibble: 3,799 × 8
##    plot  species barcode  year   doy phenophase status notes
##    <chr> <chr>     <dbl> <dbl> <dbl> <fct>       <dbl> <chr>
##  1 b8    picgl     32534  2018   114 budbreak        0 <NA> 
##  2 b8    picgl     32534  2018   114 oneleaf         0 <NA> 
##  3 b8    picgl     32534  2018   114 mostleaf        0 <NA> 
##  4 b8    picgl     32534  2018   114 senescence      0 <NA> 
##  5 b8    picgl     32534  2018   114 leafdrop        0 <NA> 
##  6 b8    picgl     32534  2018   121 budbreak        0 <NA> 
##  7 b8    picgl     32534  2018   121 oneleaf         0 <NA> 
##  8 b8    picgl     32534  2018   121 mostleaf        0 <NA> 
##  9 b8    picgl     32534  2018   121 senescence      0 <NA> 
## 10 b8    picgl     32534  2018   121 leafdrop        0 <NA> 
## # ℹ 3,789 more rows

Time of budbreak

dat_phenophase_time <- calc_phenophase_time(dat_phenophase)
p_phenophase_change <- plot_phenophase_change(dat_phenophase_time)
p_phenophase_change$budbreak
  • 2012 seems to have more late outliers.
  • If they were second flush, why were first flush never observed?

Example of outlier.

set.seed(1)
dat_phenophase_outlier <- dat_phenophase_time %>%
  filter(phenophase == "budbreak", doy > 180) %>%
  sample_n(1)
dat_phenophase_outlier %>% select(plot, species, barcode, year, phenophase, doy, notes)
## # A tibble: 1 × 7
##   plot  species barcode  year phenophase   doy notes
##   <chr> <chr>     <dbl> <dbl> <fct>      <dbl> <chr>
## 1 k5    picgl     20035  2013 budbreak     193 <NA>
dat_phenophase %>%
  right_join(dat_phenophase_outlier %>% select(-doy)) %>%
  filter(abs(doy - dat_phenophase_outlier$doy) <= 30) %>%
  select(plot, species, barcode, year, doy, phenophase, status, notes)
## # A tibble: 17 × 8
##    plot  species barcode  year   doy phenophase status notes
##    <chr> <chr>     <dbl> <dbl> <dbl> <fct>       <dbl> <chr>
##  1 k5    picgl     20035  2013   165 budbreak        0 <NA> 
##  2 k5    picgl     20035  2013   168 budbreak        0 <NA> 
##  3 k5    picgl     20035  2013   171 budbreak        0 <NA> 
##  4 k5    picgl     20035  2013   175 budbreak        0 <NA> 
##  5 k5    picgl     20035  2013   179 budbreak        0 <NA> 
##  6 k5    picgl     20035  2013   182 budbreak        0 <NA> 
##  7 k5    picgl     20035  2013   185 budbreak        0 <NA> 
##  8 k5    picgl     20035  2013   189 budbreak        0 <NA> 
##  9 k5    picgl     20035  2013   193 budbreak        1 <NA> 
## 10 k5    picgl     20035  2013   197 budbreak        1 <NA> 
## 11 k5    picgl     20035  2013   200 budbreak        1 <NA> 
## 12 k5    picgl     20035  2013   204 budbreak        1 <NA> 
## 13 k5    picgl     20035  2013   207 budbreak        1 <NA> 
## 14 k5    picgl     20035  2013   211 budbreak        1 <NA> 
## 15 k5    picgl     20035  2013   214 budbreak        0 <NA> 
## 16 k5    picgl     20035  2013   218 budbreak        0 <NA> 
## 17 k5    picgl     20035  2013   221 budbreak        1 <NA>

Decided to remove outliers manually using time window.

dat_phenophase_time_clean <- calc_phenophase_time(dat_phenophase, remove_outlier = T)
dat_phenophase_clean <- tidy_phenophase(remove_outlier = T, dat_phenophase_time = dat_phenophase_time_clean)

setwd("phenologyb4warmed/")
usethis::use_data(dat_phenophase_time_clean, overwrite = T)
usethis::use_data(dat_phenophase_clean, overwrite = T)
setwd("..")
p_phenophase_change <- plot_phenophase_change(dat_phenophase_time_clean)
p_phenophase_change$budbreak
Better now.

Phenophases to ordinal data

dat_phenophase_ordinal <- tidy_phenophase_ordinal(dat_phenophase, season = "spring", keepfirst = F)
dat_phenophase_ordinal %>%
  select(plot, species, barcode, year, doy, budbreak, oneleaf, mostleaf, stage) %>%
  distinct(budbreak, oneleaf, mostleaf, stage, .keep_all = T)
## # A tibble: 8 × 9
##   plot  species barcode  year   doy budbreak oneleaf mostleaf stage   
##   <chr> <chr>     <dbl> <dbl> <dbl>    <dbl>   <dbl>    <dbl> <fct>   
## 1 a3    abiba      3148  2010    82        0       0        0 <NA>    
## 2 a3    abiba      3148  2010   137        1       0        0 budbreak
## 3 a3    abiba      3148  2010   154        1       1        0 oneleaf 
## 4 a3    abiba      3148  2010   168        1       1        1 mostleaf
## 5 a3    abiba      3148  2011   189        0       1        0 oneleaf 
## 6 a3    abiba      3148  2011   208        0       0        1 mostleaf
## 7 a5    picgl      3440  2011   193        1       0        1 mostleaf
## 8 a5    queru      3491  2011   193        0       1        1 mostleaf
  • Does all No mean dormancy? (No.)
  • If not, how to get dormancy? (Can assume that any time after we stopped surveys in the fall and before we started surveys in the spring is the dormancy time.)
  • Yes for multiple categories? (Because some data are filled in, not observed. Use time of first yes only.)

Climate data

dat_climate_daily <- read_climate(option = "daily")
plot_climate(dat_climate_daily, plotoi = "a1")
  • Maximum hourly rainfall seems to be 2160 inches. Was there something wrong? (Now fixed.)
  • Were the rainfall at reduced rainfall plots measured? (No. Now rainfall removal flag implemented.)
  • Difference in aboveground temperature with ambient plot seems big in 2014 and 2019? (Not sure yet.)