tidyseqs workflow

R Markdown

Set folder path and seed:

library(tidyverse)
set.seed(1)

folder_path <- ("/Volumes/Macintosh HD/Users/rof011/symbiodinium/20230120T102936_esampayo")

1. extract sequences

see ?extract_seqs for more details. default output is “absolute”, can specify with type = “relative”

sequences <- extract_seqs(folder_path)

head(sequences)

## # A tibble: 6 × 3
## # Groups:   sample_name [1]
##   sample_name seq.ID    abundance
##   <chr>       <chr>         <int>
## 1 AU18_0062   C3             4060
## 2 AU18_0062   C1              322
## 3 AU18_0062   C21             247
## 4 AU18_0062   2778029_C       230
## 5 AU18_0062   C3.12           164
## 6 AU18_0062   C42.2           161

Only extract the seq.ID in with matches in the its2.profile

sequences <- extract_seqs(folder_path, onlyprofile=TRUE)

head(sequences)

## # A tibble: 6 × 3
## # Groups:   sample_name [2]
##   sample_name seq.ID abundance
##   <chr>       <chr>      <int>
## 1 AU18_0062   C3          4060
## 2 AU18_0062   C1           322
## 3 AU18_0062   C21          247
## 4 AU18_0062   C42.2        161
## 5 AU18_0062   C3an         152
## 6 AU18_0063   C3          3221

make up some metadata and add it to the file

meta_sim <- data.frame(sample_name = unique(sequences$sample_name),
                       host_genera = sample(c("Acropora", "Pocillopora", "Stylophora", "Merulina"), length(unique(sequences$sample_name)), replace = TRUE),
                       location = sample(c("Paris", "New York", "Milan", "Den Haag"), length(unique(sequences$sample_name)), replace = TRUE),
                       latitude = sampled_numbers <- sample(-10:-30, length(unique(sequences$sample_name)), replace = TRUE))


## under normal use import your metadata from a file and pass it to extract_seqs()

# metasim <- read.csv("file/to/meta.csv")

extract sequences with metadata:

sequences <- extract_seqs(folder_path, meta_sim)

head(sequences)

## # A tibble: 6 × 6
##   sample_name seq.ID    abundance host_genera location latitude
##   <chr>       <chr>         <int> <chr>       <chr>       <int>
## 1 AU18_0062   C3             4060 Acropora    New York      -17
## 2 AU18_0062   C1              322 Acropora    New York      -17
## 3 AU18_0062   C21             247 Acropora    New York      -17
## 4 AU18_0062   2778029_C       230 Acropora    New York      -17
## 5 AU18_0062   C3.12           164 Acropora    New York      -17
## 6 AU18_0062   C42.2           161 Acropora    New York      -17

2. filter sequences

a) drop samples by sample_name

sequences <- extract_seqs(folder_path,meta_sim) |>
             filter_seqs(drop_samples=c("AU18_0062","AU18_0063","AU18_0064","AU18_0065"))

unique(sequences$sample_name)

##   [1] "AU18_0067"  "AU18_0068"  "AU18_0069"  "AU18_0070"  "AU18_0071"  "AU18_0072"  "AU18_0087"  "AU18_0100"  "AU18_0108"  "AU18_0111"  "AU18_0112"  "AU18_0113" 
##  [13] "AU18_0115"  "AU18_0150"  "AU18_0162"  "AU18_0163"  "AU18_0164"  "AU18_0165"  "AU18_0170"  "AU18_0219"  "AU18_0220"  "AU18_0232"  "AU18_0234"  "AU18_0237" 
##  [25] "AU18_0241"  "AU18_0242"  "AU18_0243"  "AU18_0245"  "AU18_0248"  "AU18_0251"  "AU18_0254"  "AU18_0256"  "AU18_0257"  "AU18_0260"  "AU18_0281"  "AU18_0288" 
##  [37] "AU18_0311"  "AU18_0319"  "AU18_0331"  "AU18_0332"  "AU18_0352"  "AU18_0359"  "AU18_0387"  "AU18_0388"  "AU18_0389"  "AU18_0390"  "AU18_0432"  "AU18_0456" 
##  [49] "AU18_0457"  "AU18_0458"  "AU18_0459"  "AU18_0460"  "AU18_0463"  "AU18_0468"  "AU18_0469"  "AU18_0470"  "AU18_0472"  "AU18_0474"  "AU18_0482"  "AU18_0483" 
##  [61] "AU18_0484"  "AU18_0485"  "AU18_0520"  "AU18_0583"  "AU18_0586"  "AU18_0587"  "AU18_0588"  "AU18_0589"  "AU18_0590"  "AU18_0592"  "AU18_0596"  "AU18_0597" 
##  [73] "AU18_0599"  "AU18_0612"  "AU18_0614"  "AU18_0615"  "AU18_0617"  "AU18_0632"  "AU18_0685"  "AU18_0686"  "AU18_0688"  "AU18_0692"  "AU18_0695"  "AU18_0696" 
##  [85] "AU18_0700"  "AU18_0702"  "AU18_0703"  "AU18_0737"  "AU18_0742"  "AU18_0743"  "AU18_0744"  "AU18_0745"  "AU18_0747"  "AU18_0748"  "AU18_0821"  "AU18_0824" 
##  [97] "AU18_0876"  "AU18_0895"  "AU18_0908"  "AU18_0910"  "AU18_0961"  "AU18_0962"  "AU18_0963"  "AU18_0964"  "AU18_0965"  "AU18_0966"  "AU18_0967"  "AU18_0970" 
## [109] "AU18_1010"  "AU18_1026"  "AU18_1031"  "AU18_1032"  "AU18_1033"  "AU18_1034"  "AU18_1035"  "AU18_1037"  "AU18_1038"  "AU18_1039"  "AU18_1040"  "AU18_1041" 
## [121] "AU18_1058"  "AU18_1060"  "AU18_1071"  "AU18_1072"  "AU18_1075"  "AU18_1078"  "AU18_1079"  "AU18_1081"  "AU18_1083"  "AU18_1111"  "AU18_1121"  "AU18_1122" 
## [133] "AU18_1123"  "AU18_1124"  "AU18_1125"  "AU18_1126"  "AU18_1128"  "AU18_1129"  "AU18_1138"  "AU18_1143"  "AU18_1172"  "AU18_neg12"

or if you already have a list:

remove_these_seqs <- c("AU18_0062","AU18_0063","AU18_0064","AU18_0065")

sequences <- extract_seqs(folder_path,meta_sim) |>
  filter_seqs(drop_samples=remove_these_seqs)

b) keep samples by sample_name

sequences <- extract_seqs(folder_path, meta_sim) |>
             filter_seqs(keep_samples=c("AU18_0062","AU18_0063","AU18_0064","AU18_0065"))

unique(sequences$sample_name)

## [1] "AU18_0062" "AU18_0063" "AU18_0064" "AU18_0065"

c) drop samples by seq.ID

Note partial matching doesn’t work with seq.ID - e.g. selecting “C3” would remove “C3eq” etc.

sequences <- extract_seqs(folder_path) |>
             filter_seqs(drop_seqs=c("C3xe","C3xe","C1","C3ih"))

unique(sequences$seq.ID)

##   [1] "C3"        "C21"       "2778029_C" "C3.12"     "C42.2"     "C3an"      "40627_C"   "2426_C"    "1398203_C" "333087_C"  "54400_C"   "C39ab"     "2885773_C"
##  [14] "2595310_C" "39266_C"   "24192_C"   "C3ed"      "54364_C"   "6379_C"    "C3.10"     "9945_C"    "C3tq"      "103346_C"  "42703_C"   "66904_C"   "23580_C"  
##  [27] "C3ag"      "C3dw"      "13723_C"   "C3ab"      "C6a"       "2885777_C" "2885797_C" "2885774_C" "1428436_C" "100944_C"  "1401978_C" "C3ub"      "C1b"      
##  [40] "C3xf"      "C1gw"      "107479_C"  "C1ab"      "C1by"      "278761_C"  "766366_C"  "C120b"     "1031429_C" "C42ao"     "C3mk"      "47106_C"   "47105_C"  
##  [53] "44745_C"   "2783442_C" "2783544_C" "147220_C"  "92851_C"   "90920_C"   "150940_C"  "1491900_C" "1150766_C" "392267_C"  "1029753_C" "93539_C"   "255526_C" 
##  [66] "2783481_C" "91281_C"   "2885772_C" "C3tv"      "2885778_C" "2885776_C" "2885775_C" "2885766_C" "2786470_C" "1670177_C" "20466_C"   "2885767_C" "2783601_C"
##  [79] "48654_C"   "17505_C"   "46331_C"   "C1lh"      "2784993_C" "C3vb"      "2885800_C" "38496_C"   "C3od"      "1026568_C" "C8"        "C79b"      "2885740_C"
##  [92] "1163943_C" "C42e"      "1685230_C" "C22"       "C8a"       "2783357_C" "1631480_C" "42216_C"   "C22ad"     "C22ah"     "1611788_C" "1645568_C" "1148411_C"
## [105] "1147954_C" "1630543_C" "1622915_C" "2786619_C" "37253_C"   "1670084_C" "1645678_C" "1611789_C" "C1nu"      "1626916_C" "2174376_C" "21025_C"   "1646063_C"
## [118] "1630588_C" "1645877_C" "C45k"      "C3xd"      "C41z"      "2783359_C" "93790_C"   "1328437_C" "24569_C"   "2780845_C" "94627_C"   "2885768_C" "C115d"    
## [131] "2885769_C" "D1"        "D6"        "D4"        "25757_D"   "D1d"       "D6c"       "172931_D"  "D4ae"      "25566_D"   "C1nt"      "C1ns"      "14636_C"  
## [144] "1055322_C" "262320_C"  "2885788_C" "1612135_C" "C3hi"      "2885804_C" "1669691_C" "1114496_C" "2885732_C" "2885757_C" "965969_C"  "2885735_C" "2885733_C"
## [157] "2885736_C" "2885758_C" "1314109_C" "2885734_C" "C1cu"      "948418_C"  "1962377_C" "135129_C"  "2885759_C" "2286672_C" "2783498_C" "27913_C"   "2885787_C"
## [170] "C3t"       "2885789_C" "1673_D"    "1677_D"    "D6g"       "132798_D"  "D1dm"      "1624473_C" "2885737_C" "2885748_C" "208_C"     "2885749_C" "22178_C"  
## [183] "2783458_C" "91418_C"   "962494_C"  "2885751_C" "2885750_C" "2785003_C" "1648847_C" "C22ai"     "1633580_C" "2885786_C" "C78c"      "103687_C"  "1628745_C"
## [196] "1618671_C" "2786412_C" "C79"       "C3gk"      "722736_C"  "D10"       "D17d"      "D1b"       "2282565_C" "2623441_C" "2885744_C" "2282571_C" "2885745_C"
## [209] "2885746_C" "2282558_C" "1670755_C" "1670426_C" "2885747_C" "C1go"      "1612485_C" "1642017_C" "1670629_C" "1634089_C" "C1id"      "2885798_C" "4063_C"   
## [222] "42222_C"   "75670_C"   "2885799_C" "21810_C"   "2091_C"    "918687_C"  "2783362_C" "2885806_C" "1646070_C" "1670723_C" "1638047_C" "D1ej"      "2184426_D"
## [235] "D1ja"      "909_D"     "214657_D"  "145593_D"  "764365_D"  "D17e"      "D17c"      "2885785_D" "2885784_D" "2282560_C" "D1r"       "C3cl"      "19548_C"  
## [248] "C1dy"      "C3wo"      "C1gy"      "1194550_C" "414389_C"  "159911_C"  "C9a"       "1655376_C" "93186_C"   "C1w"       "1183302_C" "2885753_C" "95341_C"  
## [261] "2885763_C" "15923_C"   "1707858_C" "2885752_C" "C1hj"      "1655375_C" "170670_C"  "2885764_C" "1712853_C" "C1it"      "141464_C"  "93899_C"   "29115_C"  
## [274] "2885754_C" "2885755_C" "C54l"      "2175409_C" "C35.3"     "870372_C"  "C40ap"     "2239314_C" "C42bw"     "2779481_C" "2239310_C" "2239838_C" "2239747_C"
## [287] "502804_C"  "1626763_C" "C35.1"     "C1au"      "2175412_C" "21721_C"   "C42a"      "2239316_C" "C79a"      "1084218_C" "2885765_C" "2239344_C" "2230005_C"
## [300] "1766700_C" "C116bu"    "4695_C"    "G3a"       "C42ae"     "C3eq"      "C1gj"      "2778862_C" "2778250_C" "2778140_C" "2777950_C" "1630513_C" "2885807_C"
## [313] "C50b"      "C3bm"      "C1c"       "C50cg"     "C50f"      "22134_C"   "C3.2"      "C1bh"      "C1br"      "C1cb"      "C1gq"      "C1cv"      "C42ay"    
## [326] "C1al"      "C50v"      "3289_C"    "21067_C"   "22284_C"   "22281_C"   "22136_C"   "772549_D"  "D2.2"      "C1am"      "C72k"      "C1ao"      "C1an"     
## [339] "1645787_C" "1680592_C" "1626827_C" "1698762_C" "1708730_C" "1645716_C" "C50a"      "266584_C"  "268040_C"  "C3wf"      "C50aq"     "C50t"      "C50ar"    
## [352] "C3ju"      "21507_C"   "C1jx"      "C1ge"      "1626350_C" "C54b"      "C1j"       "1637611_C" "47056_C"   "1662001_C" "1692949_C" "2885742_C" "1351_C"   
## [365] "C3n"       "253897_D"  "141145_D"  "17682_D"   "2885760_D" "D2c"       "2885762_C" "222919_D"  "460098_D"  "13356_D"   "10616_D"   "D2j"       "D2a"      
## [378] "2885761_D" "1613357_C" "C3be"      "1714311_C" "C3bf"      "25889_C"   "C3si"      "C3sk"      "2885770_C" "65155_C"   "1630681_C" "1653201_C" "2885790_C"
## [391] "1670421_C" "C42dr"     "D1id"      "D4k"       "D2"        "2175598_D" "223603_D"  "281318_D"  "2885743_D" "53111_D"   "2885795_C" "D17j"      "D17l"     
## [404] "2282561_C" "D1u"       "1669953_C" "1645719_C" "2885794_C" "24193_C"   "2885796_C" "1618884_C" "2785985_C" "2784604_C" "1669316_C" "1630842_C" "2885771_C"
## [417] "D17al"     "7488_D"    "40982_D"   "10179_D"   "1645836_C" "C1n"       "1630350_C" "19245_D"   "1658362_C" "A1"        "1670160_C" "C3co"      "90777_C"  
## [430] "2853_C"    "123623_C"  "92898_C"   "2885739_C" "C3da"      "2885738_C" "2885756_C" "1073761_C" "20946_C"   "1062929_C" "25715_C"   "42219_C"   "281_C"    
## [443] "D1af"      "32528_D"   "1626302_C" "1670248_C" "1170996_D" "2885791_D" "6624_D"    "25651_D"   "C3ew"      "C3sa"      "C42au"     "2885805_C" "2885741_C"
## [456] "69388_C"   "23615_D"   "1242406_D" "28248_D"   "104830_D"  "1675094_C" "7279_D"    "10626_D"   "2885802_D" "102838_D"  "C1hq"      "2850_C"    "2885792_C"
## [469] "2885793_C" "1630270_C" "1645825_C" "D1ih"      "2043_C"    "2885801_D" "57565_C"   "11027_D"   "1163651_C" "181666_D"  "57566_C"   "2885803_C" "1714620_C"
## [482] "1658901_C" "C15"       "C3lk"      "C3ac"      "C3dr"      "2885780_C" "C3ax"      "2175400_C" "90635_C"   "2175451_C" "C78"       "C1ci"      "2779405_C"
## [495] "1770278_C" "33296_C"   "1670517_C" "C75h"      "2239404_C" "C8e"       "C15.8"     "G3ai"      "2885779_G" "C15bb"     "7528_C"    "2885781_C" "2239488_C"
## [508] "73441_G"   "C42ea"     "C1ef"      "C15ed"     "C116t"     "2885782_C" "2239592_C" "2230021_C" "C8n"       "99803_C"   "870396_C"  "756079_C"  "463034_C" 
## [521] "C54m"      "870369_C"  "8017_C"    "791026_C"  "2885783_C" "2315062_C" "2175452_C" "2175407_C" "1714031_C" "1703694_C"

d) keep samples by seq.ID

sequences <- extract_seqs(folder_path) |>
             filter_seqs(keep_seqs=c("C3", "C3xe","C3xe","C1","C3ih"))

e) remove seq.ID below a threshold

sequences <- extract_seqs(folder_path) |>
             filter_seqs(threshold=10000)

f) filter by clade

select genus/clade to keep:

sequences <- extract_seqs(folder_path) |>
             filter_seqs(clade="D")

g) filter by metadata

if you include metadata in extract_seqs, you can use any tidyverse filter() action. for example:

sequences <- extract_seqs(folder_path, meta_sim) %>%
             filter(host_genera=="Pocillopora") %>%
             filter(location=="Den Haag") %>% 
             filter(latitude < -25)

combining filters

sequences <- extract_seqs(folder_path) |>
             filter_seqs(drop_samples=c("AU18_0062","AU18_0063","AU18_0064","AU18_0065"),
                         keep_seqs=c("C3", "C3xe","C1","C3ih"), 
                         threshold=10000)

or:

sequences <- extract_seqs(folder_path, type="absolute") |>
             filter_seqs(drop_samples=c("AU18_0062","AU18_0063","AU18_0064","AU18_0065")) |> 
             filter_seqs(keep_seqs=c("C3", "C3xe","C1","C3ih"))

3. Quality control:

a) compare sample lists

explore which samples are dropped:

all_samples <- extract_seqs(folder_path) 
subset_samples <- extract_seqs(folder_path) |> 
                  filter_seqs(drop_samples=c("AU18_0062","AU18_0063","AU18_0064","AU18_0065"))

compare_seqs(all_samples, subset_samples)

##  [1] "C3ag"      "13723_C"   "C3ab"      "2885777_C" "2885797_C" "100944_C"  "1401978_C" "C3ub"      "392267_C"  "93539_C"   "255526_C"  "2885772_C" "C3tv"     
## [14] "2885778_C" "2885776_C" "2885775_C"

a) compare seq lists

explore which seqs are filtered:

all_seqs <- extract_seqs(folder_path) 
subset_seqs <- extract_seqs(folder_path, type="absolute") |> 
               filter_seqs(threshold=5, type="relative")

compare_seqs(all_seqs, subset_seqs)

##  [1] "2885778_C" "2885776_C" "2885775_C" "1631480_C" "2885737_C" "962494_C"  "2885751_C" "2885750_C" "2785003_C" "2786412_C" "722736_C"  "2885785_D" "2885784_D"
## [14] "C1it"      "141464_C"  "C3eq"      "C1gj"      "2778862_C" "2778250_C" "2778140_C" "2777950_C" "65155_C"   "C42dr"     "40982_D"   "A1"        "25651_D"

Note: extract_seqs() can extract data either absolute abundance or relative abundance. Similarly, filter_seqs() can output the format either in absolute abundance or relative abundance (the default is type=“relative”).

If you pass relative abundance data from extract_seqs() and try to filter by threshold tidyseqs will fail the following error message:

# extract_seqs(folder_path, type="relative") |> 
#   filter_seqs(threshold=5, type="absolute")

4. plot the data:

Plot the raw data (note - this has to be in a pipe as plot_seqs needs the folder from extract_seqs to get the colors)

plot_data <- extract_seqs(folder_path, type="relative") |> 
              plot_seqs(type="ggplot")

plot_data

Add filter_seqs before the plot to select what to view:

plot_data <- extract_seqs(folder_path) |>
             filter_seqs(threshold = 5000) |> 
             filter_seqs(keep_samples = "AU18_02") |> 
             plot_seqs(type="ggplot")

plot_data

cluster samples by either “bray-curtis”, “euclidean”, “jaccard”, “hellinger”, save using ggsave

plot_data <- extract_seqs(folder_path, meta_sim) |>
             filter_seqs(threshold = 5000) |> 
             filter_seqs(keep_samples = "AU18_02") |> 
             plot_seqs(type="ggplot", cluster="hellinger")

plot_data

Add facet_wrap or facet_grid by using standard ggplot code:

plot_data <- extract_seqs(folder_path, meta_sim) |>
             filter_seqs(threshold = 1000) |> 
             plot_seqs(type="ggplot", facet="host_genera") 

plot_data

plot with plotly instead of ggplot for dynamic plots:

plot_data <- extract_seqs(folder_path, meta_sim) |>
             filter_seqs(threshold = 2000) |> 
             plot_seqs(type="plotly", cluster="hellinger", facet="location") 

plot_data

Split a large dataset using nrow (note: can’t be used with facet)

plot_data <- extract_seqs(folder_path, meta_sim) |>
             filter_seqs(threshold = 2000) |> 
             plot_seqs(type="plotly", cluster="hellinger", nrow=3) 

plot_data