# Zettelkasten

library(tidyverse)



# Input directory: step 1 scans its immediate sub-directories for .tsv files.
start.dir <- "results.gsea"
# Root of the processed output tree; steps 2 and 3 scan it recursively.
out.dir <- "results.gsea.processed"


# Destination for runs whose directory name does not contain ".fdr_sig_neg".
out.dir.norm <- file.path(out.dir, "all")
# Destination for runs whose directory name contains ".fdr_sig_neg".
out.dir.fsn <-  file.path(out.dir, "fdr.sign.neg")


# step 1: copy the report files of every run directory into the processed
# tree, renamed after the miRNA the run directory belongs to
dirs <- list.dirs(start.dir, recursive = FALSE)
for (dir in dirs) {
  # Rebuild the miRNA name from the run directory path: restore the "-star",
  # "-3p" and "-5p" suffixes, then strip the parent directory, the "_st"
  # marker, the ".fdr_sig_neg" tag and the ".GseaPreranked.<digits>" suffix.
  mirna.pre <- stringr::str_replace(
    dir, pattern = "_star", replacement = "-star"
  )
  mirna.pre <- stringr::str_replace(
    mirna.pre, pattern = "_3p", replacement = "-3p"
  )
  mirna.pre <- stringr::str_replace(
    mirna.pre, pattern = "_5p", replacement = "-5p"
  )
  mirna <- paste0(
    "hsa-miR-",
    stringr::str_remove_all(
      mirna.pre,
      pattern = paste0(
        start.dir,
        "|/|_st|\\.fdr_sig_neg|.GseaPreranked.[0-9]+"
      )
    )
  )

  # Runs tagged ".fdr_sig_neg" go to their own output tree. The dot is
  # escaped so that it only matches a literal dot.
  is.fsn <- stringr::str_detect(dir, pattern = "\\.fdr_sig_neg")
  dest.dir <- if (is.fsn) out.dir.fsn else out.dir.norm

  # list.files() takes a regular expression, not a glob.
  tsv.files <- list.files(dir, pattern = "\\.tsv$")
  if (length(tsv.files) == 0) {
    next
  }

  # Classify the files with logical masks. Negative integer indexing would
  # return an empty vector when none of the special files is present.
  is.pos <- startsWith(tsv.files, "gsea_report_for_na_pos")
  is.neg <- startsWith(tsv.files, "gsea_report_for_na_neg")
  is.ranked <- startsWith(tsv.files, "ranked_gene_list_na")
  is.sizes <- tsv.files == "gene_set_sizes.tsv"
  pathway.files <- tsv.files[!(is.pos | is.neg | is.ranked | is.sizes)]

  # Pair each summary report with its destination only when it exists, so
  # origin and dest always have the same length and stay aligned.
  origin <- character()
  dest <- character()
  for (direction in c("pos", "neg")) {
    report <- tsv.files[if (direction == "pos") is.pos else is.neg]
    if (length(report) != 1) {
      warning(
        "expected one '", direction, "' report in ", dir,
        " but found ", length(report),
        call. = FALSE
      )
    }
    if (length(report) > 0) {
      # Only one report fits the single destination file; use the first.
      origin <- c(origin, file.path(dir, report[1]))
      dest <- c(dest, file.path(dest.dir, direction, paste0(mirna, ".tsv")))
    }
  }

  # Every remaining .tsv file is copied as a .txt file into the miRNA's own
  # folder (the combining steps select files by extension).
  files <- data.frame(
    origin = c(origin, file.path(dir, pathway.files)),
    dest = c(
      dest,
      file.path(
        dest.dir,
        "pathway.genes",
        mirna,
        stringr::str_replace(
          pathway.files,
          pattern = "\\.tsv$",
          replacement = ".txt"
        )
      )
    ),
    stringsAsFactors = FALSE
  )

  # The pos/neg folders are shared between runs; do not warn when they
  # already exist.
  dir.create(
    file.path(dest.dir, "pos"),
    recursive = TRUE, showWarnings = FALSE
  )
  dir.create(
    file.path(dest.dir, "neg"),
    recursive = TRUE, showWarnings = FALSE
  )
  dir.create(
    file.path(dest.dir, "pathway.genes", mirna),
    recursive = TRUE, showWarnings = FALSE
  )

  # seq_len() keeps the loop empty when there is nothing to copy.
  for (i in seq_len(nrow(files))) {
    if (file.exists(files[i, "origin"])) {
      file.copy(files[i, "origin"], files[i, "dest"])
    }
  }
}


# step 2: make combined files of all tsv files per directory
#
# For every directory below out.dir that holds .tsv files, the files are read,
# stacked into one table and written as "all.csv". Rows with
# FWER.p.val <= 0.05 are additionally written as "all.sign.csv". The .tsv
# files are removed only after both outputs have been written.
dirs <- list.dirs(out.dir, recursive = TRUE)
for (dir in dirs) {
  # list.files() takes a regular expression, not a glob.
  tsv.files <- list.files(dir, pattern = "\\.tsv$")
  if (length(tsv.files) == 0) {
    next
  }

  # Empty table with the expected columns; rbind() returns it unchanged when
  # every input file has zero rows.
  combined.table <- data.frame(
    NAME = character(),
    SIZE = double(),
    ES = numeric(),
    NES = numeric(),
    NOM.p.val = numeric(),
    FDR.q.val = numeric(),
    FWER.p.val = numeric(),
    RANK.AT.MAX = numeric(),
    LEADING.EDGE = character(),
    file.name = character(),
    mirna.n = numeric()
  )

  # Read every file first and bind once, instead of growing the table with
  # rbind() inside the loop.
  tables <- lapply(tsv.files, function(tsv.file) {
    file.contents <- readr::read_tsv(file.path(dir, tsv.file)) %>%
      dplyr::mutate(
        file.name = tsv.file,
        # number of rows in this file
        mirna.n = dplyr::n()
      )
    # Turn headers into syntactic names (e.g. "NOM p-val" -> "NOM.p.val").
    colnames(file.contents) <- make.names(colnames(file.contents))

    file.contents %>%
      dplyr::select(
        NAME,
        SIZE,
        ES,
        NES,
        NOM.p.val,
        FDR.q.val,
        FWER.p.val,
        RANK.AT.MAX,
        LEADING.EDGE,
        file.name,
        mirna.n
      )
  })
  combined.table <- do.call(rbind, c(list(combined.table), tables))

  # name.group.n counts how many rows share the same NAME.
  annotated <- combined.table %>%
    dplyr::group_by(NAME) %>%
    dplyr::mutate(
      name.group.n = dplyr::n(),
      # anchored and escaped: strip only the trailing ".tsv" extension
      mirna = stringr::str_remove(file.name, pattern = "\\.tsv$")
    )
  readr::write_csv(annotated, file.path(dir, "all.csv"))

  # Still grouped by NAME, so the count is recomputed among the rows that
  # pass the filter.
  annotated %>%
    dplyr::filter(
      FWER.p.val <= 0.05
    ) %>%
    dplyr::mutate(
      name.group.n = dplyr::n()
    ) %>%
    readr::write_csv(
      file.path(dir, "all.sign.csv")
    )

  # Delete the inputs last: a failure above leaves the originals in place.
  unlink(file.path(dir, tsv.files))
}


# step 3: make combined files of all txt files per miRNA-directory
#
# For every directory below out.dir that holds .txt files, the files are read
# and stacked into one table named after the directory. The full table goes to
# "<parent>/all/<name>.csv", the rows with CORE.ENRICHMENT == "Yes" to
# "<parent>/enriched/<name>.csv", and the directory is removed afterwards.
# gene.dirs collects the "all"/"enriched" folders for step 4.
gene.dirs <- character()
dirs <- list.dirs(out.dir, recursive = TRUE)
for (dir in dirs) {
  # list.files() takes a regular expression, not a glob.
  txt.files <- list.files(dir, pattern = "\\.txt$")
  if (length(txt.files) == 0) {
    next
  }

  # The last path component is the miRNA name. basename() works for any
  # path, unlike a character-class regex over the leading components.
  mirna.name <- basename(dir)
  source.dir <- dir

  # Empty table with the expected columns; rbind() returns it unchanged when
  # every input file has zero rows.
  combined.table <- data.frame(
    miRNA = character(),
    pathway = character(),
    pathway.n = numeric(),
    SYMBOL = character(),
    RANK.IN.GENE.LIST = numeric(),
    RANK.METRIC.SCORE = numeric(),
    RUNNING.ES = numeric(),
    CORE.ENRICHMENT = character(),
    dir = character(),
    file.name = character()
  )

  # Read every file first and bind once, instead of growing the table with
  # rbind() inside the loop.
  tables <- lapply(txt.files, function(txt.file) {
    file.contents <- readr::read_tsv(file.path(source.dir, txt.file)) %>%
      dplyr::mutate(
        miRNA = mirna.name,
        # anchored and escaped: strip only the trailing ".txt" extension
        pathway = stringr::str_remove(txt.file, pattern = "\\.txt$"),
        # number of rows in this file
        pathway.n = dplyr::n(),
        dir = source.dir,
        file.name = txt.file
      )
    # Turn headers into syntactic names (spaces become dots).
    colnames(file.contents) <- make.names(colnames(file.contents))

    file.contents %>%
      dplyr::select(
        miRNA,
        pathway,
        SYMBOL,
        RANK.IN.GENE.LIST,
        RANK.METRIC.SCORE,
        RUNNING.ES,
        CORE.ENRICHMENT,
        pathway.n,
        dir,
        file.name
      )
  })
  combined.table <- do.call(rbind, c(list(combined.table), tables))

  file.name <- paste0(basename(dir), ".csv")
  base.dir <- dirname(dir)
  all.file <- file.path(base.dir, "all", file.name)
  enriched.file <- file.path(base.dir, "enriched", file.name)

  # These folders are shared by all miRNA directories with the same parent;
  # do not warn when they already exist.
  dir.create(
    file.path(base.dir, "all"),
    recursive = TRUE, showWarnings = FALSE
  )
  dir.create(
    file.path(base.dir, "enriched"),
    recursive = TRUE, showWarnings = FALSE
  )

  gene.dirs <- unique(c(
    gene.dirs,
    file.path(base.dir, "all"),
    file.path(base.dir, "enriched")
  ))

  readr::write_csv(combined.table, all.file)
  combined.table %>%
    dplyr::filter(
      CORE.ENRICHMENT == "Yes"
    ) %>%
    readr::write_csv(enriched.file)

  # The per-miRNA folder is removed only after both outputs were written.
  unlink(dir, recursive = TRUE)
}

# step 4: make combined files of all pathway genes files per directory
#
# Every folder collected in gene.dirs is collapsed into a single csv file
# that sits next to it and carries its name ("<folder>.csv"); the folder is
# removed afterwards.
for (dir in gene.dirs) {
  # list.files() takes a regular expression, not a glob.
  csv.files <- list.files(dir, pattern = "\\.csv$")
  if (length(csv.files) == 0) {
    next
  }

  # Empty table with the expected columns; rbind() returns it unchanged when
  # every input file has zero rows.
  combined.table <- data.frame(
    miRNA = character(),
    pathway = character(),
    pathway.n = numeric(),
    SYMBOL = character(),
    RANK.IN.GENE.LIST = numeric(),
    RANK.METRIC.SCORE = numeric(),
    RUNNING.ES = numeric(),
    CORE.ENRICHMENT = character(),
    dir = character(),
    file.name = character()
  )

  # Read every file first and bind once, instead of growing the table with
  # rbind() inside the loop.
  tables <- lapply(file.path(dir, csv.files), readr::read_csv)
  combined.table <- do.call(rbind, c(list(combined.table), tables))

  file.name <- basename(paste0(dir, ".csv"))
  base.dir <- dirname(paste0(dir, ".csv"))

  readr::write_csv(combined.table, file.path(base.dir, file.name))

  # The folder is removed only after the merged file was written.
  unlink(dir, recursive = TRUE)
}