% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/explicitRepresentation.R
\name{getExRep}
\alias{getExRep}
\alias{getExRepQuadratic}
\title{Explicit Representation}
\usage{
getExRep(x, kernel = spectrumKernel(), sparse = TRUE,
  zeroFeatures = FALSE, features = NULL, useRowNames = TRUE,
  useColNames = TRUE, selx = NULL)

getExRepQuadratic(exRepLin, useRowNames = TRUE, useColNames = TRUE,
  zeroFeatures = FALSE)
}
\arguments{
\item{x}{one or multiple biological sequences in the form of a
\code{\linkS4class{DNAStringSet}}, \code{\linkS4class{RNAStringSet}},
\code{\linkS4class{AAStringSet}} (or as \code{\linkS4class{BioVector}})}

\item{kernel}{a sequence kernel object. The feature map of this kernel
object is used to generate the explicit representation.}

\item{sparse}{boolean that indicates whether a sparse or dense explicit
representation should be generated. Default=TRUE}

\item{zeroFeatures}{indicates whether columns with zero feature counts
across all samples should be included in the explicit representation.
(see below) Default=FALSE}

\item{features}{feature subset of the specified kernel in the form of a
character vector. When a feature subset is passed to the function all other
features in the feature space are not considered for the explicit
representation. (see below)}

\item{useRowNames}{if this parameter is set the sample names will be set
as row names if available in the provided sequence set.
Default=TRUE}

\item{useColNames}{if this parameter is set the features will be set
as column names in the explicit representation.
Default=TRUE}

\item{selx}{subset of indices into \code{x}. When this parameter is present
the explicit representation is generated for the specified subset of samples
only. default=\code{NULL}}

\item{exRepLin}{a linear explicit representation}
}
\value{
getExRep: upon successful completion, dependent on the flag \code{sparse}
the function returns either a dense explicit representation of class
\code{\linkS4class{ExplicitRepresentationDense}} or a sparse explicit
representation of class \code{\linkS4class{ExplicitRepresentationSparse}}.

getExRepQuadratic: upon successful completion, the function returns a
quadratic explicit representation
}
\description{
Create an explicit representation
}
\details{
Creation of an explicit representation\cr\cr
The function 'getExRep' creates an explicit representation of the given
sequence set using the feature map of the specified kernel. It contains
the feature counts in a matrix format. The rows of the matrix represent
the samples, the columns the features. For a dense explicit representation
of class \code{\linkS4class{ExplicitRepresentationDense}} the count data
is stored in a dense matrix. To allow efficient storage all features that
do not occur in the sequence set are removed from the explicit
representation by default. When the parameter \code{zeroFeatures} is set
to \code{TRUE} these features are also included resulting in an explicit
representation which contains the full feature space. For feature spaces
larger than one million features the inclusion of zero features is not
possible. \cr\cr
In case of large feature spaces a sparse explicit representation of class
\code{\linkS4class{ExplicitRepresentationSparse}} is much more efficient
by storing the count data as \code{dgRMatrix} (from package \bold{Matrix}).
The class \code{\linkS4class{ExplicitRepresentationSparse}}
is derived from \code{dgRMatrix}. As zero features are not stored in a
sparse matrix the flag \code{zeroFeatures} only controls whether the
column names of features not occurring in the sequences are included or
not.\cr\cr
Both the dense and the sparse explicit representation also contain the
kernel object which was used for its creation. For an explicit
representation without zero features column names are mandatory.
An explicit representation can be created for position independent and
annotation specific kernel variants (for details see
\link{annotationMetadata}). In annotation specific kernels the
annotation characters are included as postfix in the features. For kernels
with normalization the explicit representation is normalized resulting in
row vectors normalized to the unit sphere. For feature subsets used with
normalized kernels all features of the feature space are used in the
normalization.\cr\cr
Usage of explicit representations\cr\cr
Learning with linear SVMs (e.g. \code{\link[kernlab]{ksvm}} in package
\bold{kernlab} or \code{\link[e1071]{svm}} in package \bold{e1071}) can be
performed either through passing a kernel matrix of similarity values or
an explicit representation and a linear kernel to the SVM. The SVMs in
package \bold{kernlab} support a dense explicit representation or kernel
matrix as data representations. The SVMs in packages \bold{e1071} and
\bold{LiblineaR} support dense or sparse explicit representations.
In many cases there can be considerable performance differences between
the two variants of passing data to the SVM. And especially for larger
feature spaces the sparse explicit representation not only brings higher
memory efficiency but also leads to drastically improved runtimes during
training and prediction. Starting with kebabs version 1.2.0 kernel matrix
support is also available for package \code{e1071} via the dense LIBSVM
implementation integrated in package \code{kebabs}.\cr\cr
In general all of the complexity of converting the sequences with a specific
kernel to an explicit representation or a kernel matrix and adapting the
formats and parameters to the specific SVM is hidden within the KeBABS
training and predict methods (see \code{\link{kbsvm}},
\code{\link{predict}}) and the user can concentrate on the actual data
analysis task. During training via \code{\link{kbsvm}} the parameter
\code{explicit} controls the training via kernel matrix or explicit
representation and the parameter \code{explicitType} determines whether a
dense or sparse explicit representation is used. Manual generation of
explicit representations is only necessary for usage with other learners
or analysis methods not supported by KeBABS.\cr\cr
Quadratic explicit representation\cr\cr
The package \bold{LiblineaR} only provides linear SVMs which are tuned for
efficient processing of larger feature spaces and sample numbers. To allow
the use of a quadratic kernel on these SVMs a quadratic explicit
representation can be generated from the linear explicit representation.
It contains counts for feature pairs and the features combined to one pair
are separated by '_' in the column names of the quadratic explicit
representation. Please be aware that the dimensionality for a quadratic
explicit representation increases considerably compared to the linear one.
In the other SVMs a linear explicit representation together with a quadratic
kernel is used instead. In training via \code{\link{kbsvm}} the use
of a linear representation with a quadratic kernel or a quadratic explicit
representation instead is indicated through setting the parameter
\code{featureType} to the value \code{"quadratic"}.
}
\examples{
## instead of user provided sequences in XStringSet format
## for this example a set of DNA sequences is created
## RNA- or AA-sequences can be used as well with the spectrum kernel
dnaseqs <- DNAStringSet(c("AGACTTAAGGGACCTGGACACCACACTCAGCTAGGGGGACTGGGAGC",
                          "ATAAAGGGAGCAGACATCATGACCTTTTTGACCCTAATTATTTCAGC",
                          "CAGGAATCAGCACAGGCAGGGGCACTGCATCCCAAGACATCTGGGCC",
                          "GGACATATACCCACCCTTACCTGCCATACAGGATAGGGCCACTGCCC",
                          "ATAAAGGATGCAGACATCATGGCCTTTTTGACCCTAATTATTTCAGC"))
names(dnaseqs) <- paste("S", 1:length(dnaseqs), sep="")

## create the kernel object for dimers with normalization
speck <- spectrumKernel(k=2)
## show details of kernel object
speck

## generate the dense explicit representation for the kernel
erd <- getExRep(dnaseqs, speck, sparse=FALSE)
dim(erd)
erd[1:5,]

## generate the dense explicit representation with zero features
erd <- getExRep(dnaseqs, speck, sparse=FALSE, zeroFeatures=TRUE)
dim(erd)
erd[1:5,]

## generate the sparse explicit representation for the kernel
ers <- getExRep(dnaseqs, speck)
dim(ers)
ers[1:5,]

## generate the sparse explicit representation with zero features
ers <- getExRep(dnaseqs, speck, zeroFeatures=TRUE)
dim(ers)
ers[1:5,]

## generate the quadratic explicit representation
erdq <- getExRepQuadratic(erd)
dim(erdq)
erdq[1:5,1:15]

\dontrun{
## run training and prediction with dense linear explicit representation
data(TFBS)
enhancerFB
train <- sample(1:length(enhancerFB), length(enhancerFB) * 0.7)
test <- c(1:length(enhancerFB))[-train]
model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=speck,
               pkg="LiblineaR", svm="C-svc", cost=10, explicit="yes",
               explicitType="dense")
pred <- predict(model, x=enhancerFB[test])
evaluatePrediction(pred, yFB[test], allLabels=unique(yFB))

## run training and prediction with sparse linear explicit representation
model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=speck,
               pkg="LiblineaR", svm="C-svc", cost=10, explicit="yes",
               explicitType="sparse")
pred <- predict(model, x=enhancerFB[test])
evaluatePrediction(pred, yFB[test], allLabels=unique(yFB))
}
}
\author{
Johannes Palme <kebabs@bioinf.jku.at>
}
\references{
\url{http://www.bioinf.jku.at/software/kebabs/}\cr\cr
J. Palme, S. Hochreiter, and U. Bodenhofer (2015) KeBABS: an R package
for kernel-based analysis of biological sequences.
\emph{Bioinformatics}, 31(15):2574-2576, 2015.
DOI: \doi{10.1093/bioinformatics/btv176}.
}
\seealso{
\code{\linkS4class{ExplicitRepresentationDense}},
\code{\linkS4class{ExplicitRepresentationSparse}},
\code{\link{getKernelMatrix}}, \code{\link{kernelParameters-method}},
\code{\linkS4class{SpectrumKernel}}, \code{\link{mismatchKernel}},
\code{\link{gappyPairKernel}}, \code{\link{motifKernel}}
}
\keyword{explicit}
\keyword{methods}
\keyword{representation}

