Skip to content

Commit

Permalink
updated fst
Browse files Browse the repository at this point in the history
  • Loading branch information
thierrygosselin committed Mar 15, 2023
1 parent 2d5cc95 commit 2eb1c49
Show file tree
Hide file tree
Showing 64 changed files with 2,195 additions and 1,568 deletions.
Binary file modified .DS_Store
Binary file not shown.
12 changes: 6 additions & 6 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: assigner
Type: Package
Title: Assignment Analysis with RADseq Data
Version: 0.5.8
Date: 2020-10-27
Version: 0.5.9
Date: 2023-03-15
Encoding: UTF-8
Authors@R: c(
person("Thierry", "Gosselin", email = "[email protected]", role = c("aut", "cre")),
Expand All @@ -19,9 +19,7 @@ Imports:
dplyr (>= 1.0.0),
furrr (>= 0.2.0),
magrittr,
radiator,
tidyverse,
vroom
radiator
Suggests:
BiocManager,
carrier,
Expand All @@ -42,13 +40,15 @@ Suggests:
tibble,
tidyr,
tidyselect,
tidyverse,
vroom,
utils
Remotes: thierrygosselin/radiator
License: GPL-3
LazyLoad: yes
LazyData: true
VignetteBuilder:
knitr
RoxygenNote: 7.1.1
RoxygenNote: 7.2.3
URL: http://thierrygosselin.github.io/assigner/
BugReports: https://github.com/thierrygosselin/assigner/issues
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# assigner 0.5.9 2023-03-15

* work on *assigner::fst_WC84*


# assigner 0.5.8 2020-10-20

* Version bump because it updates numerous packages: `tidyr`, `readr`, using `future`, `carrier`.
Expand Down
47 changes: 21 additions & 26 deletions R/fst_NEI87.R
Original file line number Diff line number Diff line change
Expand Up @@ -156,13 +156,13 @@
#' \strong{Wide format:}
#' The wide format cannot store metadata info.
#' The wide format starts with these 2 id columns:
#' \code{INDIVIDUALS}, \code{POP_ID} (that refers to any grouping of individuals),
#' \code{INDIVIDUALS}, \code{STRATA} (that refers to any grouping of individuals),
#' the remaining columns are the markers in separate columns storing genotypes.
#'
#' \strong{Long/Tidy format:}
#' The long format is considered to be a tidy data frame and can store metadata info.
#' (e.g. from a VCF see \pkg{radiator} \code{\link{tidy_genomic_data}}). A minimum of 4 columns
#' are required in the long format: \code{INDIVIDUALS}, \code{POP_ID},
#' are required in the long format: \code{INDIVIDUALS}, \code{STRATA},
#' \code{MARKERS or LOCUS} and \code{GENOTYPE or GT}. The rest are considered metata info.
#'
#' \strong{2 genotypes formats are available:}
Expand Down Expand Up @@ -290,21 +290,16 @@ fst_NEI87 <- function(
# message("strata file: yes")
number.columns.strata <- max(utils::count.fields(strata, sep = "\t"))
col.types <- stringi::stri_join(rep("c", number.columns.strata), collapse = "")
suppressMessages(strata.df <- readr::read_tsv(file = strata, col_names = TRUE, col_types = col.types) %>%
dplyr::rename(POP_ID = STRATA))
suppressMessages(
strata.df <- readr::read_tsv(file = strata, col_names = TRUE, col_types = col.types)
)
} else {
# message("strata object: yes")
colnames(strata) <- stringi::stri_replace_all_fixed(
str = colnames(strata),
pattern = "STRATA",
replacement = "POP_ID",
vectorize_all = FALSE
)
strata.df <- strata
}

# Remove potential whitespace in pop_id
strata.df$POP_ID <- stringi::stri_replace_all_fixed(strata.df$POP_ID, pattern = " ", replacement = "_", vectorize_all = FALSE)
strata.df$STRATA <- stringi::stri_replace_all_fixed(strata.df$STRATA, pattern = " ", replacement = "_", vectorize_all = FALSE)

strata.df$INDIVIDUALS <- stringi::stri_replace_all_fixed(
str = strata.df$INDIVIDUALS,
Expand All @@ -314,7 +309,7 @@ fst_NEI87 <- function(
)

input <- input %>%
dplyr::select(-POP_ID) %>%
dplyr::select(-STRATA) %>%
dplyr::left_join(strata.df, by = "INDIVIDUALS")
}

Expand All @@ -323,14 +318,14 @@ fst_NEI87 <- function(

# subsampling data------------------------------------------------------------
# create the subsampling list
ind.pop.df <- dplyr::distinct(.data = input, POP_ID, INDIVIDUALS)
ind.pop.df <- dplyr::distinct(.data = input, STRATA, INDIVIDUALS)

if (is.null(subsample)) {
iteration.subsample <- 1
} else {
if (subsample == "min") {
subsample <- ind.pop.df %>%
dplyr::group_by(POP_ID) %>%
dplyr::group_by(STRATA) %>%
dplyr::tally(.) %>%
dplyr::filter(n == min(n)) %>%
dplyr::ungroup(.) %>%
Expand Down Expand Up @@ -647,11 +642,11 @@ compute_fst_nei <- function(x, ci = FALSE, iteration.ci = 100, quantiles.ci = c(
A2 = stringi::stri_sub(GT, 4,6)
) %>%
dplyr::select(-GT) %>%
assigner::rad_long(x = ., cols = c("MARKERS", "INDIVIDUALS", "POP_ID"), names_to = "ALLELES", values_to = "GT")
assigner::rad_long(x = ., cols = c("MARKERS", "INDIVIDUALS", "STRATA"), names_to = "ALLELES", values_to = "GT")

# frequency per markers, alleles, pop
p <- x %>%
dplyr::group_by(MARKERS, POP_ID) %>%
dplyr::group_by(MARKERS, STRATA) %>%
dplyr::count(GT) %>%
dplyr::mutate(P = n / sum(n)) %>%
dplyr::select(-n) %>%
Expand All @@ -660,7 +655,7 @@ compute_fst_nei <- function(x, ci = FALSE, iteration.ci = 100, quantiles.ci = c(
# mp: mean frequency per markers
# mp2: sum of square mean frequency per markers
mean.p2 <- p %>%
tidyr::complete(data = ., POP_ID, tidyr::nesting(MARKERS, GT), fill = list(P = 0)) %>%
tidyr::complete(data = ., STRATA, tidyr::nesting(MARKERS, GT), fill = list(P = 0)) %>%
dplyr::group_by(MARKERS, GT) %>%
dplyr::summarise(MP = mean(P, na.rm = TRUE)) %>%
dplyr::group_by(MARKERS) %>%
Expand All @@ -669,7 +664,7 @@ compute_fst_nei <- function(x, ci = FALSE, iteration.ci = 100, quantiles.ci = c(

# msp2 mean frequency per markers per pop
mean.frequency.markers <- p %>%
dplyr::group_by(MARKERS, POP_ID) %>%
dplyr::group_by(MARKERS, STRATA) %>%
dplyr::summarise(SP2 = sum(P^2)) %>%
dplyr::group_by(MARKERS) %>%
dplyr::summarise(MSP2 = mean(SP2, na.rm = TRUE)) %>%
Expand All @@ -679,11 +674,11 @@ compute_fst_nei <- function(x, ci = FALSE, iteration.ci = 100, quantiles.ci = c(
# Mean heterozygosity observed per pop and markers
# mean heterozygosity across all markers
mean.het.obs.markers <- x %>%
dplyr::distinct(MARKERS, POP_ID, INDIVIDUALS, GT) %>%
dplyr::group_by(MARKERS, POP_ID, INDIVIDUALS) %>%
dplyr::distinct(MARKERS, STRATA, INDIVIDUALS, GT) %>%
dplyr::group_by(MARKERS, STRATA, INDIVIDUALS) %>%
dplyr::tally(.) %>%
dplyr::mutate(HO = n - 1) %>%
dplyr::group_by(POP_ID, MARKERS) %>%
dplyr::group_by(STRATA, MARKERS) %>%
dplyr::summarise(HO = mean(HO)) %>%
dplyr::group_by(MARKERS) %>%
dplyr::summarise(HO = mean(HO)) %>%
Expand All @@ -692,7 +687,7 @@ compute_fst_nei <- function(x, ci = FALSE, iteration.ci = 100, quantiles.ci = c(
# mn: corrected mean number of individuals per markers
#n: number of individuals, per pop and markers
fst.data <- x %>%
dplyr::group_by(POP_ID, MARKERS) %>%
dplyr::group_by(STRATA, MARKERS) %>%
dplyr::distinct(INDIVIDUALS) %>%
dplyr::tally(.) %>%
dplyr::mutate(N_INV = 1 / n) %>%
Expand Down Expand Up @@ -853,8 +848,8 @@ pairwise_fst_nei <- carrier::crate(function(
# list.pair <- 2
pop.select <- stringi::stri_join(purrr::flatten(pop.pairwise[list.pair]))
data.select <- data.genotyped %>%
dplyr::filter(POP_ID %in% pop.select) %>%
dplyr::mutate(POP_ID = droplevels(x = POP_ID))
dplyr::filter(STRATA %in% pop.select) %>%
dplyr::mutate(STRATA = droplevels(x = STRATA))
fst.select <- assigner::compute_fst_nei(x = data.select, ci = ci, iteration.ci = iteration.ci, quantiles.ci = quantiles.ci, digits = digits)
# if (ci){
df.select <- tibble::tibble(POP1 = pop.select[1], POP2 = pop.select[2])
Expand Down Expand Up @@ -895,7 +890,7 @@ fst_nei_subsample <- function(
}

# Keep only the subsample
input <- dplyr::semi_join(input, x, by = c("POP_ID", "INDIVIDUALS"))
input <- dplyr::semi_join(input, x, by = c("STRATA", "INDIVIDUALS"))
x <- NULL #unused object

# genotyped data and holdout sample -----------------------------------------
Expand All @@ -915,7 +910,7 @@ fst_nei_subsample <- function(
# Compute pairwise Fst -------------------------------------------------------
if (pairwise) {
if (verbose) message("Paiwise fst calculation")
pop.list <- levels(input$POP_ID) # pop list
pop.list <- levels(input$STRATA) # pop list
# all combination of populations
pop.pairwise <- utils::combn(unique(pop.list), 2, simplify = FALSE)
# Fst for all pairwise populations
Expand Down
95 changes: 54 additions & 41 deletions R/fst_WC84.R
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ fst_WC84 <- function(
dplyr::arrange(POP_ID)

# strip the data -------------------------------------------------------------
strata.bk <- markers.meta.bk <- genotypes.meta.bk <- NULL
# increases speed for large datasets
env.arg <- rlang::current_env()
data %<>%
radiator::strip_rad(
Expand Down Expand Up @@ -417,7 +417,8 @@ fst_WC84 <- function(
.x = 1:iteration.subsample,
.f = subsampling_data,
strata = strata.bk,
subsample = subsample
subsample = subsample,
random.seed = NULL
)

# keep track of subsampling individuals and write to directory
Expand Down Expand Up @@ -488,7 +489,7 @@ fst_WC84 <- function(

# test1 <- res$pairwise.fst
# test2 <- res$pairwise.fst.upper.matrix

# test3 <- res$pairwise.fst.full.matrix

# pairwise.fst.upper.matrix
# pairwise.fst.full.matrix
Expand Down Expand Up @@ -1518,11 +1519,6 @@ heatmap_fst <- function(
if (is.null(pop.levels)) {
pop.levels <- colnames(data.fst)
}
# else {
# if (length(pop.levels) != length(union(rownames(data.fst), colnames(data.fst)))) {
# rlang::abort(message = "Contact author, problem with strata levels in heatmap")
# }
# }

data.fst %<>%
radiator::distance2tibble(
Expand All @@ -1536,7 +1532,6 @@ heatmap_fst <- function(
magrittr::set_colnames(x = ., value = c("POP1", "POP2", "FST"))

inv.levels <- rev(levels(data.fst$POP2))
# pop.levels <- levels(data.fst$POP2)
rounder <- function(x, digits) round(as.numeric(x), digits)
if (max(stringi::stri_length(data.fst$FST), na.rm = TRUE) != digits) {
round.num <- TRUE
Expand All @@ -1560,7 +1555,6 @@ heatmap_fst <- function(


data.fst %<>% dplyr::left_join(data.ci, by = c("POP1", "POP2"))
# median.fst <- median(x = data.fst$FST, na.rm = TRUE)
mean.fst <- mean(x = data.fst$FST, na.rm = TRUE)
min.fst <- min(x = data.fst$FST, na.rm = TRUE)
max.fst <- max(x = data.fst$FST, na.rm = TRUE)
Expand Down Expand Up @@ -1606,28 +1600,31 @@ heatmap_fst <- function(
heatmap.fst <- ggplot2::ggplot(
data = data.fst, ggplot2::aes(x = POP1, y = POP2, fill = FST)
) +
ggplot2::geom_tile(color = "white") +
ggplot2::geom_tile(color = "white", alpha = 0.7) +
ggplot2::geom_text(ggplot2::aes(x = POP1, y = POP2, label = CI),
color = "black", size = text.size, na.rm = TRUE) +
ggplot2::scale_fill_gradient2(
low = color.low,
mid = color.mid,
high = color.high,
# midpoint = median.fst,
midpoint = mean.fst,
na.value = "white",
limit = NULL,
# limit = c(min.fst, max.fst),
space = "Lab"
) +
ggplot2::scale_fill_viridis_c(name = "Fst", option = "H") +
# ggplot2::scale_fill_gradient2(
# low = color.low,
# mid = color.mid,
# high = color.high,
# # midpoint = median.fst,
# midpoint = mean.fst,
# na.value = "white",
# limit = NULL,
# # limit = c(min.fst, max.fst),
# space = "Lab"
# ) +
ggplot2::scale_x_discrete(position = "top") +
ggplot2::theme_minimal() +
ggplot2::theme(
axis.title.x = ggplot2::element_blank(),
axis.title.y = ggplot2::element_blank(),
axis.text.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold")
)
) +
ggplot2::coord_equal()

print(heatmap.fst)

if (!is.null(filename)) {
Expand Down Expand Up @@ -1750,6 +1747,7 @@ assigner_fst_stats <- function(
#' @keywords internal

fst_stats <- function(x, l, digits = 9L, m = NULL, s = NULL, subsample = FALSE) {
# x = "pairwise.fst" # test
res <- list()
# message(x)
want <- c("sigma.loc", "fst.markers", "fst.ranked", "fst.overall",
Expand Down Expand Up @@ -1868,26 +1866,41 @@ change_matrix_strata <- function(x, s) {
# x
# class(colnames(x))
# class(rownames(x))

if (length(dim(x)) > 1) {
colnames(x) %<>%
as.character(.) %>%
stringi::stri_replace_all_regex(
str = .,
pattern = paste0("^", as.character(s$STRATA_SEQ)),
replacement = as.character(s$POP_ID),
vectorize_all = FALSE
)
# colnames(x)
rownames(x) %<>%
as.character(.) %>%
stringi::stri_replace_all_regex(
str = .,
pattern = paste0("^", as.character(s$STRATA_SEQ)),
replacement = as.character(s$POP_ID),
vectorize_all = FALSE
)
if (identical(colnames(x), as.character(s$STRATA_SEQ))) {
colnames(x) <- as.character(s$POP_ID)
} else {
rlang::abort(message = "Contact author, problem with strata levels in fst")
}
if (identical(rownames(x), as.character(s$STRATA_SEQ))) {
rownames(x) <- as.character(s$POP_ID)
} else {
rlang::abort(message = "Contact author, problem with strata levels in fst")
}
}
#

# problem with code below 20221026
# if (length(dim(x)) > 1) {
# colnames(x) %<>%
# as.character(.) %>%
# stringi::stri_replace_all_regex(
# str = .,
# pattern = paste0("^", as.character(s$STRATA_SEQ)),
# replacement = as.character(s$POP_ID),
# vectorize_all = FALSE
# )
# # colnames(x)
# rownames(x) %<>%
# as.character(.) %>%
# stringi::stri_replace_all_regex(
# str = .,
# pattern = paste0("^", as.character(s$STRATA_SEQ)),
# replacement = as.character(s$POP_ID),
# vectorize_all = FALSE
# )
# }

return(x)
}#change_matrix_strata

Expand Down
3 changes: 2 additions & 1 deletion R/global_variables.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ if (getRversion() >= "2.15.1") {
"whitelist.markers", "random.seed", "folder", ".data", "MARKERS...1",
"MARKERS...3", "MARKERS...4", "STRATA_SEQ", "ARGUMENTS", "VALUES",
"input", "HET", "calibrate.alleles", "iteration.subsample", "n",
"M_SEQ", "ID_SEQ", "1", "2", "where", "p", "forking"
"M_SEQ", "ID_SEQ", "1", "2", "where", "p", "forking", "strata.bk",
"markers.meta.bk", "genotypes.meta.bk"
)
)
}
Loading

0 comments on commit 2eb1c49

Please sign in to comment.