library(tidyverse)
# Directory layout used by the whole script:
#   start.dir     raw GSEA output, scanned for run directories
#   out.dir       root of the processed results
#   out.dir.norm  destination for runs not tagged ".fdr_sig_neg"
#   out.dir.fsn   destination for runs tagged ".fdr_sig_neg"
start.dir <- "results.gsea"
out.dir <- "results.gsea.processed"
out.dir.norm <- file.path(out.dir, "all")
out.dir.fsn <- file.path(out.dir, "fdr.sign.neg")

# Immediate sub-directories of start.dir (full paths, no recursion).
dirs <- list.dirs(path = start.dir, full.names = TRUE, recursive = FALSE)
# step 1: copy the report files of every GSEA run into the processed tree
#
# For each run directory listed in `dirs`:
#   * the miRNA name is built from the directory name ("_star", "_3p" and
#     "_5p" become "-star", "-3p" and "-5p"; "_st", ".fdr_sig_neg" and the
#     ".GseaPreranked.<digits>" suffix are stripped; "hsa-miR-" is prepended),
#   * runs whose name contains ".fdr_sig_neg" go to out.dir.fsn, all other
#     runs go to out.dir.norm,
#   * the positive / negative reports are copied to <dest>/pos/<miRNA>.tsv
#     and <dest>/neg/<miRNA>.tsv,
#   * every other .tsv file except gene_set_sizes.tsv and the ranked gene
#     list is copied to <dest>/pathway.genes/<miRNA>/<name>.txt.
for (run.dir in dirs) {
  # Work on the directory name only, so that neither the parent path nor its
  # separators can leak into the miRNA name or the ".fdr_sig_neg" test.
  run.name <- basename(run.dir)

  mirna.pre <- stringr::str_replace(
    run.name,
    pattern = "_star",
    replacement = "-star"
  )
  mirna.pre <- stringr::str_replace(
    mirna.pre,
    pattern = "_3p",
    replacement = "-3p"
  )
  mirna.pre <- stringr::str_replace(
    mirna.pre,
    pattern = "_5p",
    replacement = "-5p"
  )
  mirna <- paste0(
    "hsa-miR-",
    stringr::str_remove_all(
      mirna.pre,
      pattern = "_st|\\.fdr_sig_neg|\\.GseaPreranked\\.[0-9]+"
    )
  )

  is.fsn <- stringr::str_detect(run.name, stringr::fixed(".fdr_sig_neg"))
  dest.dir <- if (is.fsn) out.dir.fsn else out.dir.norm

  # list.files() expects a regular expression, not a glob.
  tsv.files <- list.files(run.dir, pattern = "\\.tsv$")
  if (length(tsv.files) == 0) {
    next
  }

  # Classify with logical masks. Negative integer indexing would return an
  # empty vector when there is nothing to exclude.
  is.pos <- startsWith(tsv.files, "gsea_report_for_na_pos")
  is.neg <- startsWith(tsv.files, "gsea_report_for_na_neg")
  is.ranked <- startsWith(tsv.files, "ranked_gene_list_na")
  is.sizes <- tsv.files == "gene_set_sizes.tsv"

  pos.file <- tsv.files[is.pos]
  neg.file <- tsv.files[is.neg]
  pathway.files <- tsv.files[!(is.pos | is.neg | is.ranked | is.sizes)]

  # Each run maps to exactly one pos and one neg destination file.
  if (length(pos.file) > 1 || length(neg.file) > 1) {
    warning(
      "more than one pos/neg report in ", run.dir,
      "; using the first of each",
      call. = FALSE
    )
    pos.file <- utils::head(pos.file, 1)
    neg.file <- utils::head(neg.file, 1)
  }

  # rep(..., length(x)) keeps origin and dest the same length even when a
  # report is missing, so data.frame() cannot fail on differing row counts.
  report.name <- paste0(mirna, ".tsv")
  files <- data.frame(
    origin = file.path(run.dir, c(pos.file, neg.file, pathway.files)),
    dest = c(
      file.path(dest.dir, "pos", rep(report.name, length(pos.file))),
      file.path(dest.dir, "neg", rep(report.name, length(neg.file))),
      file.path(
        dest.dir,
        "pathway.genes",
        mirna,
        stringr::str_replace(
          pathway.files,
          pattern = "\\.tsv$",
          replacement = ".txt"
        )
      )
    ),
    stringsAsFactors = FALSE
  )

  # The target directories are shared between runs; do not warn when they
  # already exist.
  dir.create(
    file.path(dest.dir, "pos"),
    recursive = TRUE, showWarnings = FALSE
  )
  dir.create(
    file.path(dest.dir, "neg"),
    recursive = TRUE, showWarnings = FALSE
  )
  dir.create(
    file.path(dest.dir, "pathway.genes", mirna),
    recursive = TRUE, showWarnings = FALSE
  )

  for (i in seq_len(nrow(files))) {
    if (file.exists(files$origin[i])) {
      # file.copy() does not overwrite by default and returns FALSE then.
      copied <- file.copy(files$origin[i], files$dest[i])
      if (!copied) {
        warning(
          "could not copy ", files$origin[i], " to ", files$dest[i],
          call. = FALSE
        )
      }
    }
  }
}
# step 2: make combined files of all tsv files per directory
#
# For every directory below out.dir that contains .tsv files, all of them are
# read, tagged with their file name and row count, stacked, and written as
#   all.csv       every row, plus name.group.n (rows per NAME) and mirna
#   all.sign.csv  rows with FWER.p.val <= 0.05, name.group.n recomputed
# The source .tsv files are deleted only after both CSV files are written.
dirs <- list.dirs(out.dir, recursive = TRUE)
for (report.dir in dirs) {
  # list.files() expects a regular expression, not a glob.
  tsv.files <- list.files(report.dir, pattern = "\\.tsv$")
  if (length(tsv.files) == 0) {
    next
  }
  tsv.paths <- file.path(report.dir, tsv.files)

  # Zero-row template: defines the result when every input file is empty.
  empty.table <- data.frame(
    NAME = character(),
    SIZE = double(),
    ES = numeric(),
    NES = numeric(),
    NOM.p.val = numeric(),
    FDR.q.val = numeric(),
    FWER.p.val = numeric(),
    RANK.AT.MAX = numeric(),
    LEADING.EDGE = character(),
    file.name = character(),
    mirna.n = numeric()
  )

  tables <- lapply(seq_along(tsv.files), function(i) {
    file.contents <- readr::read_tsv(tsv.paths[i]) %>%
      dplyr::mutate(
        file.name = tsv.files[i],
        mirna.n = dplyr::n()
      )
    # Turn headers such as "NOM p-val" into syntactic names (NOM.p.val).
    colnames(file.contents) <- make.names(colnames(file.contents))
    dplyr::select(
      file.contents,
      NAME,
      SIZE,
      ES,
      NES,
      NOM.p.val,
      FDR.q.val,
      FWER.p.val,
      RANK.AT.MAX,
      LEADING.EDGE,
      file.name,
      mirna.n
    )
  })

  # One rbind() call instead of growing the table inside the loop. The
  # data-frame method ignores zero-row arguments, so the template only
  # matters when all files are empty.
  combined.table <- do.call(rbind, c(list(empty.table), tables))

  all.table <- combined.table %>%
    dplyr::group_by(NAME) %>%
    dplyr::mutate(
      name.group.n = dplyr::n(),
      mirna = stringr::str_remove(file.name, pattern = "\\.tsv$")
    )
  readr::write_csv(all.table, file.path(report.dir, "all.csv"))

  # Still grouped by NAME, so name.group.n counts the remaining rows per NAME.
  sign.table <- all.table %>%
    dplyr::filter(
      FWER.p.val <= 0.05
    ) %>%
    dplyr::mutate(
      name.group.n = dplyr::n()
    )
  readr::write_csv(sign.table, file.path(report.dir, "all.sign.csv"))

  # Remove the inputs last: a failure above leaves them in place.
  unlink(tsv.paths)
}
# step 3: make combined files of all txt files per miRNA-directory
#
# Every directory below out.dir that contains .txt files is treated as one
# miRNA directory. Its files are stacked into a single table, written to
#   <parent>/all/<miRNA>.csv       every row
#   <parent>/enriched/<miRNA>.csv  rows with CORE.ENRICHMENT == "Yes"
# and the miRNA directory is then deleted. The "all" and "enriched"
# directories are collected in gene.dirs for step 4.
gene.dirs <- character()
dirs <- list.dirs(out.dir, recursive = TRUE)
for (mirna.dir in dirs) {
  # list.files() expects a regular expression, not a glob.
  txt.files <- list.files(mirna.dir, pattern = "\\.txt$")
  if (length(txt.files) == 0) {
    next
  }

  # The last path component names the miRNA; basename() handles any
  # character that may occur in the parent path.
  mirna.name <- basename(mirna.dir)
  base.dir <- dirname(mirna.dir)

  # Zero-row template: defines the result when every input file is empty.
  empty.table <- data.frame(
    miRNA = character(),
    pathway = character(),
    pathway.n = numeric(),
    SYMBOL = character(),
    RANK.IN.GENE.LIST = numeric(),
    RANK.METRIC.SCORE = numeric(),
    RUNNING.ES = numeric(),
    CORE.ENRICHMENT = character(),
    dir = character(),
    file.name = character()
  )

  tables <- lapply(txt.files, function(txt.file) {
    file.contents <- readr::read_tsv(file.path(mirna.dir, txt.file)) %>%
      dplyr::mutate(
        miRNA = mirna.name,
        pathway = stringr::str_remove(txt.file, pattern = "\\.txt$"),
        pathway.n = dplyr::n(),
        dir = mirna.dir,
        file.name = txt.file
      )
    # Turn headers such as "RANK IN GENE LIST" into syntactic names.
    colnames(file.contents) <- make.names(colnames(file.contents))
    dplyr::select(
      file.contents,
      miRNA,
      pathway,
      SYMBOL,
      RANK.IN.GENE.LIST,
      RANK.METRIC.SCORE,
      RUNNING.ES,
      CORE.ENRICHMENT,
      pathway.n,
      dir,
      file.name
    )
  })

  # One rbind() call instead of growing the table inside the loop. The
  # data-frame method ignores zero-row arguments, so the template only
  # matters when all files are empty.
  combined.table <- do.call(rbind, c(list(empty.table), tables))

  file.name <- paste0(mirna.name, ".csv")
  all.dir <- file.path(base.dir, "all")
  enriched.dir <- file.path(base.dir, "enriched")
  # Both directories are shared by all miRNAs of one parent directory; do not
  # warn when they already exist.
  dir.create(all.dir, recursive = TRUE, showWarnings = FALSE)
  dir.create(enriched.dir, recursive = TRUE, showWarnings = FALSE)
  gene.dirs <- unique(c(gene.dirs, all.dir, enriched.dir))

  readr::write_csv(combined.table, file.path(all.dir, file.name))
  combined.table %>%
    dplyr::filter(
      CORE.ENRICHMENT == "Yes"
    ) %>%
    readr::write_csv(file.path(enriched.dir, file.name))

  # The per-pathway files are now contained in the two CSV files.
  unlink(mirna.dir, recursive = TRUE)
}
# step 4: make combined files of all pathway genes files per directory
#
# Each directory in gene.dirs holds one CSV file per miRNA. They are stacked
# into a single <directory>.csv next to the directory, which is then deleted.
for (gene.dir in gene.dirs) {
  # list.files() expects a regular expression, not a glob.
  csv.files <- list.files(gene.dir, pattern = "\\.csv$")
  if (length(csv.files) == 0) {
    next
  }

  # Zero-row template: defines the result when every input file is empty.
  empty.table <- data.frame(
    miRNA = character(),
    pathway = character(),
    pathway.n = numeric(),
    SYMBOL = character(),
    RANK.IN.GENE.LIST = numeric(),
    RANK.METRIC.SCORE = numeric(),
    RUNNING.ES = numeric(),
    CORE.ENRICHMENT = character(),
    dir = character(),
    file.name = character()
  )

  tables <- lapply(file.path(gene.dir, csv.files), readr::read_csv)

  # One rbind() call instead of growing the table inside the loop. The
  # data-frame method ignores zero-row arguments, so the template only
  # matters when all files are empty.
  combined.table <- do.call(rbind, c(list(empty.table), tables))

  file.name <- paste0(basename(gene.dir), ".csv")
  base.dir <- dirname(gene.dir)
  readr::write_csv(combined.table, file.path(base.dir, file.name))

  # The per-miRNA files are now contained in the combined CSV file.
  unlink(gene.dir, recursive = TRUE)
}