% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/PlackettLuce.R
\name{PlackettLuce}
\alias{PlackettLuce}
\title{Fit a Plackett-Luce Model}
\usage{
PlackettLuce(
  rankings,
  npseudo = 0.5,
  normal = NULL,
  gamma = NULL,
  adherence = NULL,
  weights = freq(rankings),
  na.action = getOption("na.action"),
  start = NULL,
  method = c("iterative scaling", "BFGS", "L-BFGS"),
  epsilon = 1e-07,
  steffensen = 0.1,
  maxit = c(500, 10),
  trace = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{rankings}{a \code{"\link{rankings}"} object, or an object that can be
coerced by \code{as.rankings}.  An \code{\link[=aggregate]{"aggregated_rankings"}}
object can be used to specify rankings and weights simultaneously.
A \code{"\link{grouped_rankings}"} object should be used when estimating
adherence for rankers with multiple rankings per ranker.}

\item{npseudo}{when using pseudodata: the number of wins and losses to add
between each object and a hypothetical reference object.}

\item{normal}{an optional list with elements named \code{mu} and \code{Sigma}
specifying the mean and covariance matrix of a multivariate normal prior on
the \emph{log} worths.}

\item{gamma}{an optional list with elements named \code{shape} and \code{rate}
specifying parameters of a gamma prior on adherence parameters for each
ranker (use \code{grouped_rankings} to group multiple rankings by ranker).
The short-cut \code{TRUE} may be used to specify a Gamma(10, 10) prior. If
\code{NULL} (or \code{FALSE}), adherence is fixed to \code{adherence} for
all rankers.}

\item{adherence}{an optional vector of adherence values for each ranker. If
missing, adherence is fixed to 1 for all rankers. If \code{gamma} is not
\code{NULL}, this specifies the starting values for the adherence.}

\item{weights}{an optional vector of weights for each ranking.}

\item{na.action}{a function to handle any missing rankings, see
\code{\link[=na.omit]{na.omit()}}.}

\item{start}{starting values for the worth parameters and the tie parameters
on the raw scale (worth parameters need not be scaled to sum to 1). If
\code{normal} is specified, \code{exp(normal$mu)} is used as starting values
for the worth parameters. Coefficients from a previous fit can be passed as
the result of a call to  \code{coef.PlackettLuce}, or the \code{coefficients}
element of a \code{"PlackettLuce"} object.}

\item{method}{the method to be used for fitting: \code{"iterative scaling"}
(iterative scaling to sequentially update the parameter values),
\code{"BFGS"} (the BFGS optimisation algorithm through the
\code{\link{optim}} interface), \code{"L-BFGS"} (the limited-memory BFGS
optimisation algorithm as implemented in the \code{\link[lbfgs]{lbfgs}}
package). Iterative scaling is used by default, unless a prior is specified
by \code{normal} or \code{gamma}, in which case the default is \code{"BFGS"}.}

\item{epsilon}{the maximum absolute difference between the observed and
expected sufficient statistics for the worth parameters at convergence.}

\item{steffensen}{a threshold defined as for \code{epsilon} after which to
apply Steffensen acceleration to the iterative scaling updates.}

\item{maxit}{a vector specifying the maximum number of iterations. If
\code{gamma} is \code{NULL}, only the first element is used and specifies the
maximum number of iterations of the algorithm specified by \code{method}. If
\code{gamma} is not \code{NULL}, a second element may be supplied to specify
the maximum number of iterations of an alternating algorithm, where
the adherence parameters are updated alternately with the other parameters.
The default is to use 10 outer iterations.}

\item{trace}{logical, if \code{TRUE} show trace of iterations.}

\item{verbose}{logical, if \code{TRUE} show messages from validity checks on
the rankings.}

\item{...}{additional arguments passed to \code{optim} or \code{lbfgs}.
In particular the convergence tolerance may be adjusted using e.g.
\code{control = list(reltol = 1e-10)}.}
}
\value{
An object of class \code{"PlackettLuce"}, which is a list containing
the following elements:
\item{call}{ The matched call. }
\item{coefficients}{ The model coefficients. }
\item{loglik}{ The maximized log-likelihood. }
\item{null.loglik}{ The maximized log-likelihood for the null model (all
alternatives including ties have equal probability). }
\item{df.residual}{ The residual degrees of freedom. }
\item{df.null}{ The residual degrees of freedom for the null model. }
\item{rank}{ The rank of the model. }
\item{logposterior}{ If a prior was specified, the maximised log posterior.}
\item{gamma}{ If a gamma prior was specified, the list of parameters. }
\item{normal}{ If a normal prior was specified, the list of parameters. }
\item{iter}{ The number of iterations run. }
\item{rankings}{ The rankings passed to \code{rankings}, converted to a
\code{"rankings"} object if necessary. }
\item{weights}{ The weights applied to each ranking in the fitting. }
\item{adherence}{ The fixed or estimated adherence per ranker. }
\item{ranker}{ The ranker index mapping rankings to rankers (the
\code{"index"} attribute of \code{rankings} if specified as a
\code{"grouped_rankings"} object.)}
\item{ties}{ The observed tie orders corresponding to the estimated tie
parameters. }
\item{conv}{ The convergence code: 0 for successful convergence; 1 if reached
\code{maxit} (outer) iterations without convergence; 2 if Steffensen
acceleration causes the log-likelihood to decrease; negative number if the L-BFGS
algorithm failed for another reason.}
}
\description{
Fit a Plackett-Luce model to a set of rankings. The rankings may be partial
(each ranking completely ranks a subset of the items) and include ties of
arbitrary order.
}
\note{
As the maximum tie order increases, the number of possible choices for
each rank increases rapidly, particularly when the total number of items is
high. This means that the model will be slower to fit with higher \eqn{D}.
In addition, due to the current implementation of the \code{vcov()} method,
computation of the standard errors (as by \code{summary()}) can take almost as
long as the model fit and may even become infeasible due to memory limits.
As a rule of thumb, for > 10 items and > 1000 rankings, we recommend
\code{PlackettLuce()} for ties up to order 4. For higher order ties, a
rank-ordered logit model, see \code{\link[ROlogit:rologit]{ROlogit::rologit()}}, or a
generalized Mallows model, as in \code{\link[BayesMallows:compute_mallows]{BayesMallows::compute_mallows()}}, may be
more suitable, as they do not model tied events explicitly.
}
\section{Model definition}{


A single ranking is given by
\deqn{R = \{C_1, C_2, \ldots, C_J\}}{R = {C_1, C_2, \ldots, C_J}}
where the items in set \eqn{C_1} are ranked higher than (better than) the
items in \eqn{C_2}, and so on. If there are multiple objects in set \eqn{C_j}
these items are tied in the ranking.

For a set of items \eqn{S}, let
\deqn{f(S) = \delta_{|S|}
      \left(\prod_{i \in S} \alpha_i \right)^\frac{1}{|S|}}{
      f(S) = delta_{|S|} * (prod_{i in S} alpha_i)^(1/|S|)}
where \eqn{|S|} is the cardinality (size) of the set, \eqn{\delta_n}{delta_n}
is a parameter related to the prevalence of ties of order \eqn{n}
(with \eqn{\delta_1 \equiv 1}), and \eqn{\alpha_i}{alpha_i} is a
parameter representing the worth of item \eqn{i}.
Then under an extension of the Plackett-Luce model allowing ties up to order
\eqn{D}, the probability of the ranking \eqn{R} is given by
\deqn{\prod_{j = 1}^J \frac{f(C_j)}{
      \sum_{k = 1}^{\min(D_j, D)} \sum_{S \in {A_j \choose k}} f(S)}}{
      prod_{j = 1}^J f(C_j)/
      (sum_{k = 1}^{min(D_j, D)} sum_{S in choose(A_j, k)} f(S))}
where \eqn{D_j} is the cardinality of \eqn{A_j}, the set of
alternatives from which \eqn{C_j} is chosen, and
\eqn{A_j \choose k}{choose(A_j, k)} is all the possible choices of \eqn{k}
items from \eqn{A_j}. The value of \eqn{D} can be set to the maximum number
of tied items observed in the data, so that \eqn{\delta_n = 0}{delta_n = 0}
for \eqn{n > D}.

When the worth parameters are constrained to sum to one, they represent the
probability that the corresponding item comes first in a ranking of all
items, given that first place is not tied.

The 2-way tie prevalence parameter \eqn{\delta_2}{delta_2} is related to
the probability that two items \emph{of equal worth} tie for
first place, given that the first place is not a 3-way or higher tie.
Specifically, that probability is
\eqn{\delta_2/(2 + \delta_2)}{delta_2/(2 + delta_2)}.

The 3-way and higher tie-prevalence parameters are similarly interpretable,
in terms of tie probabilities among equal-worth items.

When intermediate tie orders are not observed (e.g. ties of order 2
and order 4 are observed, but no ties of order 3), the maximum
likelihood estimate of the corresponding tie prevalence parameters
is zero, so these parameters are excluded from the model.
}

\section{Pseudo-rankings}{


In order for the maximum likelihood estimate of an object's worth to be
defined, the network of rankings must be strongly connected. This means that
in every possible partition of the objects into two nonempty subsets, some
object in the second set is ranked higher than some object in the first set
at least once.

If the network of rankings is not strongly connected then pseudo-rankings
may be used to connect the network. This approach posits a hypothetical
object with log-worth 0 and adds \code{npseudo} wins and \code{npseudo}
losses to the set of rankings.

The parameter \code{npseudo} is the prior strength.  With \code{npseudo = 0}
the MLE is the posterior mode.  As \code{npseudo} approaches
infinity the log-worth estimates all shrink towards 0. The default,
\code{npseudo = 0.5}, is sufficient to connect the network and has a weak
shrinkage effect. Even for networks that are already connected, adding
pseudo-rankings typically reduces both the bias and variance of the
estimators of the worth parameters.
}

\section{Incorporating prior information on log-worths}{


Prior information can be incorporated by using \code{normal} to specify a
multivariate normal prior on the \emph{log}-worths. The log-worths are then
estimated by maximum a posteriori (MAP) estimation. Model summaries
(deviance, AIC, standard errors) are based on the log-likelihood evaluated
at the MAP estimates, resulting in a finite sample bias that should
disappear as the number of rankings increases. Inference based on these
model summaries is valid as long as the prior is considered fixed and not
tuned as part of the model.

Incorporating a prior is an alternative method of penalization, therefore
\code{npseudo} is set to zero when a prior is specified.
}

\section{Incorporating ranker adherence parameters}{


When rankings come from different rankers, the model can be extended to
allow for varying reliability of the rankers, as proposed by Raman and
Joachims (2014). In particular, \eqn{f(S)} is replaced by
\deqn{h(S) = \delta_{|S|}
      \left(\prod_{i \in S} \alpha_i \right)^\frac{\eta_g}{|S|}}{
      h(S) = delta_{|S|} * (prod_{i in S} alpha_i)^(eta_g/|S|)}
where \eqn{\eta_g > 0}{eta_g > 0} is the adherence parameter for ranker
\eqn{g}. In the standard model, all rankers are assumed to have equal
reliability, so \eqn{\eta_g = 1}{eta_g = 1} for all rankers.
Higher \eqn{\eta_g}{eta_g} increases the distance between item
worths, giving greater weight to the ranker's choice. Conversely, lower
\eqn{\eta_g}{eta_g} shrinks the item worths towards equality so the
ranker's choice is less relevant.

The adherence parameters are not estimable by maximum likelihood, since
for given item worths the maximum likelihood estimate of adherence would be
infinity for rankers that give rankings consistent with the items ordered by
worth and zero for all other rankers. Therefore it is essential to include a
prior on the adherence parameters when these are estimated rather than fixed.
Setting \code{gamma = TRUE} specifies the default
\eqn{\Gamma(10,10)}{Gamma(10,10)} prior, which has a mean of
1 and a probability of 0.99 that the adherence is between 0.37 and 2.
Alternative parameters can be specified by a list with elements \code{shape}
and \code{rate}. Setting shape and rate to a common value \eqn{\theta}{theta}
specifies a mean of 1; \eqn{\theta \ge}{theta >=} 2 will give low prior
probability to near-zero adherence; as \eqn{\theta}{theta} increases the
density becomes more concentrated (and more symmetrical) about 1.

Since the number of adherence parameters will typically be large and it is
assumed the worth and tie parameters are of primary interest, the adherence
parameters are not included in model summaries, but are included in the
returned object.
}

\section{Controlling the fit}{


For models without priors, using \code{npseudo = 0} will use standard
maximum likelihood, if the network is connected (and throw an error
otherwise).

The fitting algorithm is set by the \code{method} argument. The default
method \code{"iterative scaling"} is a slow but reliable approach. In
addition, this has the most control on the accuracy of the final fit, since
convergence is determined by direct comparison of the observed and expected
values of the sufficient statistics for the worth parameters, rather than a
tolerance on change in the log-likelihood.

The \code{"iterative scaling"} algorithm is slow because it is a first order
method (does not use derivatives of the likelihood). From a set of starting
values that are 'close enough' to the final solution, the algorithm can be
accelerated using
\href{https://en.wikipedia.org/wiki/Steffensen's_method}{Steffensen's method}.
\code{PlackettLuce} attempts to apply Steffensen's acceleration when all
differences between the observed and expected values of the sufficient
statistics are less than \code{steffensen}. This is an ad-hoc rule defining
'close enough' and in some cases the acceleration may produce negative
worth parameters or decrease the log-likelihood. \code{PlackettLuce} will
only apply the update when it makes an improvement.

The \code{"BFGS"} and \code{"L-BFGS"} algorithms are second order methods,
therefore can be quicker than the default method. Control parameters can be
passed on to \code{\link[stats]{optim}} or \code{\link[lbfgs]{lbfgs}}.

For models with priors, the iterative scaling method cannot be used, so BFGS
is used by default.
}

\examples{
# Six partial rankings of four objects, 1 is top rank, e.g
# first ranking: item 1, item 2
# second ranking: item 2, item 3, item 4, item 1
# third ranking: items 2, 3, 4 tie for first place, item 1 second
R <- matrix(c(1, 2, 0, 0,
              4, 1, 2, 3,
              2, 1, 1, 1,
              1, 2, 3, 0,
              2, 1, 1, 0,
              1, 0, 3, 2), nrow = 6, byrow = TRUE)
colnames(R) <- c("apple", "banana", "orange", "pear")

# create rankings object
R <- as.rankings(R)

# Standard maximum likelihood estimates
mod_mle <- PlackettLuce(R, npseudo = 0)
coef(mod_mle)

# Fit with default settings
mod <- PlackettLuce(R)
# log-worths are shrunk towards zero
coef(mod)

# independent N(0, 9) priors on log-worths, as in Raman and Joachims
prior <- list(mu = rep(0, ncol(R)),
              Sigma = diag(rep(9, ncol(R))))
mod_normal <- PlackettLuce(rankings = R, normal = prior)
# slightly weaker shrinkage effect vs pseudo-rankings,
# with less effect on tie parameters (but note small number of rankings here)
coef(mod_normal)

# estimate adherence assuming every ranking is from a separate ranker
mod_separate <- PlackettLuce(rankings = R, normal = prior, gamma = TRUE)
coef(mod_separate)
# gives more weight to rankers 4 & 6 which rank apple first,
# so worth of apple increased relative to banana
mod_separate$adherence

# estimate adherence based on grouped rankings
#  - assume two rankings from each ranker
G <- group(R, rep(1:3, each = 2))
mod_grouped <- PlackettLuce(rankings = G, normal = prior, gamma = TRUE)
coef(mod_grouped)
# first ranker is least consistent so down-weighted
mod_grouped$adherence

}
\references{
Raman, K. and Joachims, T. (2014)  Methods for Ordinal Peer Grading.
\href{https://arxiv.org/abs/1404.3656}{arXiv:1404.3656}.
}
\seealso{
Handling rankings: \code{\link{rankings}}, \code{\link{aggregate}},
\code{\link{group}}, \code{\link{choices}},
\code{\link{adjacency}}, \code{\link{connectivity}}.

Inspect fitted Plackett-Luce models: \code{\link{coef}}, \code{deviance},
\code{\link{fitted}}, \code{\link{itempar}}, \code{logLik}, \code{print},
\code{\link{qvcalc}}, \code{\link{summary}}, \code{\link{vcov}}.

Fit Plackett-Luce tree: \code{pltree}.

Example data sets: \code{\link{beans}}, \code{\link{nascar}},
\code{\link{pudding}}, \code{\link{preflib}}.

Vignette: \code{vignette("Overview", package = "PlackettLuce")}.
}
