% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/medic.R
\name{medic}
\alias{medic}
\alias{print.medic}
\title{Medication clustering (based on ATC and timing)}
\usage{
medic(
  data,
  k = 5,
  id,
  atc,
  timing,
  base_clustering,
  linkage = "complete",
  summation_method = "sum_of_minima",
  alpha = 1,
  beta = 1,
  gamma = 1,
  p = 1,
  theta = (5:0)/5,
  parallel = FALSE,
  return_distance_matrix = FALSE,
  set_seed = FALSE,
  ...
)

\method{print}{medic}(x, ...)
}
\arguments{
\item{data}{A data frame containing all the variables for the clustering.}

\item{k}{a vector specifying the number of clusters to identify.}

\item{id}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> An unquoted
expression naming the variable in \code{data} describing person id.}

\item{atc}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> An unquoted
expression naming the variable in \code{data} containing ATC codes.}

\item{timing}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> An unquoted
expression naming the variable or variables in \code{data} describing
medication timing. Variable names can be used as if they were positions in
the data frame, so expressions like x:y can be used to select a range of
variables. Moreover, pattern matching selection helpers such as
\code{\link[tidyselect:starts_with]{starts_with}} or
\code{\link[tidyselect:starts_with]{num_range}} may also be used to select timing
variables.}

\item{base_clustering}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> An
unquoted expression naming the variable in \code{data} that gives an initial
clustering to start the \code{medic} from or \code{NULL}.}

\item{linkage}{The agglomeration method to be used in the clustering. This
should be (an unambiguous abbreviation of) one of "ward.D", "ward.D2",
"single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median"
(= WPGMC) or "centroid" (= UPGMC). See \link[stats:hclust]{stats::hclust} for more
information. For a discussion of linkage criterion choice see
\emph{details} below.}

\item{summation_method}{The summation method used in the distance measure.
This should be either "double_sum" or "sum_of_minima". See
\emph{details} below for more information.}

\item{alpha}{A number giving the tuning of the normalization. See
\emph{details} below for more information.}

\item{beta}{A number giving the power of the individual medication
combinations. See \emph{details} below for more information.}

\item{gamma}{A number giving the weight of the timing terms. See
\emph{details} below for more information.}

\item{p}{The power of the Minkowski distance used in the timing-specific
distance. See \emph{details} below for more information.}

\item{theta}{A vector of length 6 specifying the tuning of the ATC measure.
See \emph{details} below for more information.}

\item{parallel}{A logical or an integer. If \code{FALSE}, the default, no
parallelization is done.

If \code{TRUE} or an integer larger than 2L, parallelization is implemented via
\link[parallel:clusterApply]{parLapply} from the \strong{parallel} package. When
\code{parallel} is \code{TRUE} the number of \link[parallel:makeCluster]{clusters}
is set to \link[parallel:detectCores]{detectCores} - 1, and when \code{parallel} is
an integer then the number of \link[parallel:makeCluster]{clusters} is set to
\code{parallel}. For more details on the parallelization method see
\link[parallel:clusterApply]{parallel::parLapply}.}

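\item{return_distance_matrix}{A logical. If \code{TRUE}, the distance matrices
for each method are included in the returned object; otherwise the
\code{distance_matrix} component is \code{NULL}.}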

\item{set_seed}{A logical or an integer.}

\item{...}{Additional arguments not currently in use.}

\item{x}{A \code{medic} object for printing.}
}
\value{
An object of class \emph{medic} which describes the clusters produced by
the hierarchical clustering process. The object is a list with components:
\describe{
\item{data}{the inputted data frame \code{data} with the cluster
assignments appended at the end.}
\item{clustering}{a data frame with the person id as given by \code{id},
the \code{.analysis_order} and the clusters found.}
\item{variables}{a list of the variables used in the clustering.}
\item{parameters}{a data frame with all the inputted clustering
parameters and the corresponding method names. These method names
correspond to the column names for each cluster in the \code{clustering}
data frame described right above.}
\item{key}{a list of keys used internally in the function to keep track of
simplified versions of the data.}
\item{distance_matrix}{the distance matrices for each method if
\code{return_distance_matrix} is \code{TRUE} otherwise \code{NULL}.}
\item{call}{the matched call.}
}
}
\description{
The \code{medic} method uses agglomerative hierarchical clustering with a
bespoke distance measure based on medication ATC code similarities,
medication timing and medication amount or dosage.
}
\details{
The \code{medic} method uses agglomerative hierarchical
clustering with a bespoke distance measure based on medication ATC codes and
timing similarities to assign medication pattern clusters to people.

Two versions of the distance measure are available:

The \emph{double sum}:

\deqn{%
  d(p_i, p_j) = N_{\alpha}(M_i \times M_j) \sum_{m\in M_i}\sum_{n \in M_j}%
  ((1 + D_{\theta}(m,n)) (1 + \gamma T_p(t_{im},t_{jn})) - 1)^{\beta}.%
}{%
  d(p_i, p_j) = N_\alpha(M_i x M_j) \sum{m in M_i}\sum{n in M_j}%
  ((1 + D_\theta(m,n)) (1+ \gamma T_p(t_{im},t_{jn})) - 1)^\beta.%
}

and the \emph{sum of minima}:
\deqn{%
  d(p_i, p_j) = \frac{1}{2}(N_{\alpha}(M_i)\sum_{m\in M_i}\min_{n \in M_j}%
  ((1 + D_{\theta}(m,n)) (1 + \gamma T_p(t_{im},t_{jn})) - 1)^{\beta} +
  N_{\alpha}(M_j) \sum_{n\in M_j}\min_{m \in M_i}%
  ((1 + D_{\theta}(m,n)) (1 + \gamma T_p(t_{im},t_{jn})) - 1)^{\beta}).%
}{%
  d(p_i, p_j) = (1/2) *( N_\alpha(M_i)\sum{m in M_i}\min{n in M_j}%
  ((1 + D_\theta(m,n)) (1+ \gamma T_p(t_{im},t_{jn})) - 1)^\beta +
   N_\alpha(M_j)\sum{n in M_j}\min{m in M_i}%
  ((1 + D_\theta(m,n)) (1+ \gamma T_p(t_{im},t_{jn})) - 1)^\beta).%
}
\subsection{Normalization}{

\deqn{%
  N_{\alpha}(x) = |x|^{-\alpha}%
}{%
  N_\alpha(x) = |x|^(-\alpha) %
}

If the normalization tuning, \code{alpha}, is 0, then no normalization is
performed and the distance measure becomes highly dependent on the number of
distinct medications given. That is, people using more medication will have
larger distances to others. If the normalization tuning, \code{alpha}, is 1 -
the default - then the summation is normalized with the number of terms in
the sum, in other words, the average is calculated.
}

\subsection{ATC distance}{

The central idea of this method, namely the ATC distance, is given as
\deqn{%
  D_{\theta}(x, y) = \sum_{i=1,\ldots,5}1\{x \mbox{ and } y \mbox{ match on level } i \mbox{, but not level } i + 1\}\theta_i%
}{%
  D_\theta(x, y) = \sum_{i=1,...,5}1\{x and y match on level i, but not level i + 1\}\theta_i%
}
The ATC distance is tuned using the vector \code{theta}.

Note that two ATC codes are said to match at level i when they are identical
at level i. E.g. the two codes N06AB01 and N06AA01 match on level 1, 2, and 3
as they are both "N" at level 1, "N06" at level 2, and "N06A" at level 3,
but at level 4 they differ ("N06AB" and "N06AA" are not the same).
}

\subsection{Timing distance}{

The timing distance is a simple Minkowski distance:
\deqn{%
  T_p(x,y) =(\sum_{t \in T} |x_t - y_t|^p)^{1/p}.%
}{%
  T_p(x,y) =(\sum_{t in T} |x_t - y_t|^p)^{1/p}.%
}
When \code{p} is 1, the default, the Manhattan distance is used.
}
}
\section{Methods (by generic)}{
\itemize{
\item \code{print(medic)}: Print method for medic-objects

}}
\examples{
# A simple clustering based only on ATC
clust <- medic(complications, id = id, atc = atc, k = 3)

# A simple clustering with both ATC and timing
clust <- medic(
  complications,
  id = id,
  atc = atc,
  timing = first_trimester:third_trimester,
  k = 3
)


}
\seealso{
\link{summary.medic} for summaries and plots.

\link{employ} for employing an existing clustering to new data.

\link{enrich} for enriching the meta data in the \code{medic} object with additional
data.
}
