#' Process MEGA CSV Files
#'
#' Reads every MEGA-generated CSV file in a directory, extracts the codon
#' usage table from each file, and returns all tables stacked into a single
#' data frame.
#'
#' @param folder_path Path to a directory containing MEGA CSV files. Only
#'   files whose name ends in \code{.csv} are read. Each file must contain a
#'   header row starting with the columns Codon, Count, and RSCU.
#'
#' @return A data frame containing:
#' \describe{
#'   \item{AA}{Amino acid label taken from the parentheses of the codon cell}
#'   \item{Codon}{Codon sequence (the part of the cell before the parentheses)}
#'   \item{Fill}{Position index of the codon within its amino acid group}
#'   \item{Species}{Species identifier: the file name without its extension}
#'   \item{RSCU}{Relative Synonymous Codon Usage value as read from the file}
#' }
#' Rows whose amino acid label is \code{"*"} or \code{"Stop"} are not
#' included.
#'
#' @details
#' Processes files through these steps:
#' \enumerate{
#'   \item Identifies the header line starting with "Codon", "Count", "RSCU"
#'   \item Extracts the codon/amino acid data below the header
#'   \item Calculates position indices (Fill) for synonymous codons
#'   \item Combines all species data into a single data frame
#' }
#' Files that cannot be processed are skipped with a warning. An error is
#' raised when the directory holds no CSV files or when no file yields data.
#'
#' @examples
#' # Using example data
#' example_dir <- system.file("extdata", "mega", package = "ggmRSCU")
#' result <- read_mega(example_dir)
#' head(result)
#'
#' @export
read_mega <- function(folder_path) {
  # purrr::imap_dfr() binds rows through dplyr, so both packages are needed.
  # Fail up front with a clear message instead of a late, unrelated error.
  for (pkg in c("dplyr", "purrr")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("Package '", pkg, "' is required by read_mega() but is not installed",
           call. = FALSE)
    }
  }

  file_list <- list.files(folder_path,
                          pattern = "\\.csv$",
                          full.names = TRUE)

  if (length(file_list) == 0) {
    stop("No CSV files found in: ", folder_path)
  }

  df_list <- process_files(file_list)

  output_cols <- c("AA", "Codon", "Fill", "Species", "RSCU")

  # Column work is done with base subsetting so the function depends neither
  # on a pipe operator being in scope nor on non-standard evaluation of bare
  # column names.
  purrr::imap_dfr(df_list, function(data, species) {
    # rep() keeps the assignment valid for a table that ended up with no rows.
    data$Species <- rep(species, nrow(data))
    data[, output_cols, drop = FALSE]
  })
}

#' Parse a set of MEGA CSV files into per-species data frames
#'
#' Each file is read, its header row located, the table below the header
#' extracted, and the table reshaped. A file that fails at any step is
#' skipped with a warning so that one bad file does not abort the batch.
#'
#' @param file_list Character vector of file paths.
#'
#' @return A named list of data frames, one per successfully processed file.
#'   Names are the file names without directory or extension. Raises an error
#'   when no file could be processed.
#'
#' @noRd
process_files <- function(file_list) {
  df_list <- list()

  for (file_path in file_list) {
    # warn = FALSE silences the "incomplete final line" notice, which is noise
    # here; a genuine read failure becomes a skip like every other step.
    raw_lines <- tryCatch(
      readLines(file_path, warn = FALSE),
      error = function(e) {
        warning("Skipping ", basename(file_path), ": ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )
    if (is.null(raw_lines)) next

    header_line <- find_header(raw_lines, file_path)
    if (is.null(header_line)) next

    data_chunk <- extract_data(raw_lines, header_line, file_path)
    if (is.null(data_chunk)) next

    processed <- process_data(data_chunk, file_path)
    if (!is.null(processed)) {
      species_id <- tools::file_path_sans_ext(basename(file_path))
      df_list[[species_id]] <- processed
    }
  }

  if (length(df_list) == 0) {
    stop("No processable data found in any files")
  }

  df_list
}

#' Locate the header row of a MEGA CSV file
#'
#' Scans the lines in order and reports the first one whose first three
#' comma-separated fields, after trimming surrounding whitespace, are exactly
#' "Codon", "Count", "RSCU".
#'
#' @param raw_lines Character vector holding the lines of the file.
#' @param file_path Path of the file; only its base name is used, in the
#'   warning message.
#'
#' @return The index of the header line, or \code{NULL} (with a warning) when
#'   no line qualifies.
#'
#' @noRd
find_header <- function(raw_lines, file_path) {
  expected <- c("Codon", "Count", "RSCU")

  # TRUE when the line at position `idx` opens with the expected three fields.
  starts_with_expected <- function(idx) {
    fields <- trimws(unlist(strsplit(raw_lines[idx], ",")))
    if (length(fields) < 3) {
      return(FALSE)
    }
    identical(fields[1:3], expected)
  }

  # Position() stops at the first hit and yields NA when there is none.
  hit <- Position(starts_with_expected, seq_along(raw_lines))

  if (is.na(hit)) {
    warning("Skipping ", basename(file_path), ": Required header not found")
    return(NULL)
  }

  hit
}

#' Read the table that follows the header row
#'
#' The table runs from the line after the header up to, but not including,
#' the first blank line (or to the end of the file when there is none). Rows
#' containing any \code{NA} are dropped.
#'
#' @param raw_lines Character vector holding the lines of the file.
#' @param header_line Index of the header row within \code{raw_lines}.
#' @param file_path Path of the file; only its base name is used, in warning
#'   messages.
#'
#' @return A data frame whose column names come from the header row (made
#'   unique by \code{read.csv}, e.g. \code{Codon}, \code{Codon.1}), or
#'   \code{NULL} with a warning when there are no data lines or the lines
#'   cannot be parsed.
#'
#' @noRd
extract_data <- function(raw_lines, header_line, file_path) {
  data_start <- header_line + 1
  if (data_start > length(raw_lines)) {
    warning("Skipping ", basename(file_path), ": Missing data after header")
    return(NULL)
  }

  candidate <- raw_lines[data_start:length(raw_lines)]
  blank_at <- which(trimws(candidate) == "")
  n_data <- if (length(blank_at) > 0) blank_at[1] - 1 else length(candidate)

  # A blank line directly after the header means there is no table. Without
  # this guard the range data_start:(data_start - 1) would count downwards
  # and feed the header row itself to the parser as data.
  if (n_data < 1) {
    warning("Skipping ", basename(file_path), ": Missing data after header")
    return(NULL)
  }

  # Trim the header cells so that a header accepted despite padding around
  # its fields still yields clean column names.
  header_names <- trimws(strsplit(raw_lines[header_line], ",")[[1]])

  data_chunk <- tryCatch(
    utils::read.csv(
      text = candidate[seq_len(n_data)],
      header = FALSE,
      col.names = header_names,
      stringsAsFactors = FALSE
    ),
    error = function(e) {
      # A malformed table should skip this file, not abort the whole batch.
      warning("Skipping ", basename(file_path), ": ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
  if (is.null(data_chunk)) {
    return(NULL)
  }

  stats::na.omit(data_chunk)
}

#' Reshape a wide MEGA codon table into one row per codon
#'
#' The input holds repeated column groups named \code{Codon}, \code{RSCU},
#' \code{Codon.1}, \code{RSCU.1}, and so on. Codon cells are split into the
#' codon (text before the opening parenthesis) and the amino acid label (text
#' inside the parentheses); the n-th Codon column is paired with the n-th
#' RSCU column.
#'
#' @param data_chunk Data frame as described above.
#' @param file_path Path of the source file; only its base name is used, in
#'   the warning message.
#'
#' @return A data frame with columns \code{AA}, \code{Codon}, \code{RSCU} and
#'   \code{Fill} (running index of the codon within its amino acid group),
#'   excluding rows whose \code{AA} is \code{"*"} or \code{"Stop"}. Returns
#'   \code{NULL} when the input has no rows, and \code{NULL} with a warning
#'   when the Codon and RSCU columns are absent or differ in number.
#'
#' @noRd
process_data <- function(data_chunk, file_path) {
  codon_cols <- grep("^Codon(\\.\\d+)?$", names(data_chunk), value = TRUE)
  rscu_cols <- grep("^RSCU(\\.\\d+)?$", names(data_chunk), value = TRUE)

  if (!identical(length(codon_cols), length(rscu_cols)) ||
      length(codon_cols) == 0) {
    warning("Skipping ", basename(file_path), ": Column structure mismatch")
    return(NULL)
  }

  if (nrow(data_chunk) == 0) {
    return(NULL)
  }

  # Stack the column groups end to end (all rows of the first group, then all
  # rows of the second, ...) in one pass rather than growing a data frame one
  # cell at a time with rbind(), which is quadratic in the number of cells.
  codon_cells <- unlist(
    lapply(codon_cols, function(nm) as.character(data_chunk[[nm]])),
    use.names = FALSE
  )
  rscu_values <- unlist(
    lapply(rscu_cols, function(nm) data_chunk[[nm]]),
    use.names = FALSE
  )

  result <- data.frame(
    AA = sub(".*\\((.*)\\)", "\\1", codon_cells),
    Codon = sub("\\(.*", "", codon_cells),
    RSCU = rscu_values,
    stringsAsFactors = FALSE
  )

  # Number the codons within each amino acid group in order of appearance.
  result$Fill <- ave(seq_len(nrow(result)), result$AA, FUN = seq_along)

  # Drop stop codons only after indexing; indices are per group, so the
  # remaining groups are unaffected.
  result[!result$AA %in% c("*", "Stop"), , drop = FALSE]
}
