# In tests/testthat/test-data_preprocessing.R

test_that("detrending and retrending works as expected", {
  # Model settings handed to prepare_data_for_modelling().
  params <- list(
    target = "NO2",
    meteo_variables = c("TMP"),
    lightgbm = list(
      nrounds = 5,
      eta = 0.1,
      num_leaves = 8
    )
  )
  # Both mock series share the same five timestamps. Only 2022-01-01 lies
  # between application_start and application_end defined further down.
  timestamps <- as.POSIXct(c(
    "2021-01-01 00:00:00", "2021-03-01 00:00:00",
    "2021-06-01 00:00:00", "2022-01-01 01:00:00",
    "2023-01-01 02:00:00"
  ))
  # Mock meteorological series (temperature).
  env_data1 <- data.table::data.table(
    date = timestamps,
    Station = "TEST001",
    part = "test",
    Komponente = "TMP",
    Komponente_txt = "Temperature",
    Wert = c(10, 15, 20, 20, 30)
  )
  # Mock target series (NO2).
  env_data2 <- data.table::data.table(
    date = timestamps,
    Station = "TEST001",
    part = "test",
    Komponente = "NO2",
    Komponente_txt = "Stickstoffdioxid",
    Wert = c(10, 20, 30, 40, 50)
  )
  env_data <- rbind(env_data1, env_data2)
  # Smoke check: clean_data() with daily aggregation has to run on the mock
  # data. Its return value is not used by the steps below.
  clean_data(env_data, "TEST001",
    aggregate_daily = TRUE
  )
  application_start <- lubridate::ymd("20211201")
  application_end <- lubridate::ymd("20220901")
  dt_prepared <- prepare_data_for_modelling(env_data, params)
  split_data <- split_data_counterfactual(
    dt_prepared,
    application_start,
    application_end
  )
  detrended_data <- detrend(split_data)

  # inherits() yields a single TRUE/FALSE even if the object carries several
  # classes, whereas `class(x) == "list"` would yield a logical vector.
  expect_true(inherits(split_data, "list"))
  expect_equal(nrow(split_data$apply), 1)
  expect_s3_class(detrended_data$train, "data.table")
  expect_s3_class(detrended_data$apply, "data.table")
  # The detrended training values have to sum to (numerically) zero. abs() is
  # required: without it any large negative sum would satisfy expect_lt().
  # NOTE(review): presumably detrend() subtracts a fitted trend, which is why
  # the residual sum is expected to vanish -- confirm against detrend().
  expect_lt(abs(sum(detrended_data$train$value)), 1e-8)
})

# Shared fixtures for the scaling tests below: two numeric columns and one
# character column in each data frame.
train_data <- data.frame(
  var1 = as.numeric(1:5),
  var2 = as.numeric(5:9),
  var3 = LETTERS[1:5] # Non-numeric column
)

apply_data <- data.frame(
  var1 = as.numeric(6:8),
  var2 = as.numeric(10:12),
  var3 = LETTERS[6:8] # Non-numeric column
)

test_that("scale_data works with scaling", {
  scaled <- scale_data(train_data, apply_data)

  # Reference statistics, computed by hand from the numeric training columns
  numeric_train <- train_data %>% select(where(is.numeric))
  means <- colMeans(numeric_train)
  sds <- vapply(numeric_train, sd, numeric(1))

  # Standardise a vector with the training mean and sd of the given column
  standardise <- function(values, column) {
    (values - means[column]) / sds[column]
  }
  numeric_columns <- c("var1", "var2")

  # Train data must be scaled with its own means and sds
  for (column in numeric_columns) {
    expect_equal(
      round(scaled$train[[column]], 2),
      round(standardise(train_data[[column]], column), 2)
    )
  }

  # Apply data must be scaled with the means and sds of the train data
  for (column in numeric_columns) {
    expect_equal(
      round(scaled$apply[[column]], 2),
      round(standardise(apply_data[[column]], column), 2)
    )
  }

  # The non-numeric column has to pass through untouched
  expect_equal(scaled$train$var3, train_data$var3)
  expect_equal(scaled$apply$var3, apply_data$var3)

  # The scaling parameters themselves are part of the result
  expect_equal(scaled$means, means)
  expect_equal(scaled$sds, sds)
})

# Fixtures for the rescale_predictions() test.
# The test below rescales with the "value" entry of the scaling parameters,
# so the data passed to scale_data() needs a numeric `value` column. Without
# it, means["value"] and sds["value"] are NA and the test would only compare
# NA with NA.
rescale_train_data <- data.frame(value = c(10, 20, 30, 40, 50), train_data)
rescale_apply_data <- data.frame(value = c(60, 70, 80), apply_data)
scale_result <- scale_data(rescale_train_data, rescale_apply_data)

# Standardized predictions used as input for rescale_predictions()
dt_predictions <- data.frame(
  prediction = c(-1.2649111, 0, 1.2649111), # Standardized predictions
  prediction_lower = c(-1.5, 0, 1.5), # Standardized min predictions
  prediction_upper = c(-1.0, 0, 1.0), # Standardized max predictions
  var1 = scale_result$apply$var1, # Standardized var1 from apply_data
  var2 = scale_result$apply$var2 # Standardized var2 from apply_data
)

test_that("rescale_predictions rescales correctly using scaling parameters", {
  rescaled <- rescale_predictions(scale_result, dt_predictions)

  # `[[` raises an error when "value" is missing, whereas `[` would silently
  # return NA and make every expectation below vacuous.
  value_mean <- scale_result$means[["value"]]
  value_sd <- scale_result$sds[["value"]]
  expect_false(anyNA(c(value_mean, value_sd)))

  # Each prediction column must equal the manual back-transformation
  # standardized * sd + mean, and must hold real (non-NA) numbers.
  prediction_columns <- c("prediction", "prediction_lower", "prediction_upper")
  for (column in prediction_columns) {
    expected <- dt_predictions[[column]] * value_sd + value_mean
    expect_false(anyNA(rescaled[[column]]))
    expect_equal(round(rescaled[[column]], 6), round(expected, 6))
  }
})
