\documentclass[11pt]{article}
\usepackage{../../apacite,amsmath,latexsym,epsfig,float,afterpage,../../numinsec,alltt,theorem}
% \usepackage{amsmath,latexsym,epsfig}
\newfloat{Algorithm}{thp}{}[section]
\floatname{Algorithm}{Algorithm}
\setlength{\textwidth}{6.0in}
\setlength{\oddsidemargin}{23pt}
\setlength{\evensidemargin}{23pt}
\setlength{\topmargin}{-0.5in}
\setlength{\textheight}{8.8in}
\newcommand{\Proof}{\noindent{\bf Proof.}~}
% \newcommand{\qed}{$ \blacksquare $ \medskip}
\newcommand{\qed}{$ \Box $ \medskip}
\newcommand{\libsvm}{$\mbox{{\sf LIBSVM}}$}
\newcommand{\bsvm}{$\mbox{{\sf BSVM}}$}
\newcommand{\svmlight}{$SVM^{light}$}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{algorithm}[theorem]{Algorithm}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\renewcommand{\thetheorem}{\thesection.\arabic{theorem}}
%\renewcommand{\thefigure}{\thesection.\arabic{figure}}
\renewcommand{\thetable}{\thesection.\arabic{table}}
\renewcommand{\thefootnote}{\fnsymbol{footnote}}    
\newcommand{\tv}{\mbox{$\tilde{v}$}}
\newcommand{\tf}{\mbox{$\tilde{\nabla}$}}

\theoremstyle{break}
\newtheorem{algorithm1}[theorem]{Algorithm}

\begin{document}
\setlength{\baselineskip}{18pt}
\renewcommand{\thefootnote}{\fnsymbol{footnote}}    

\begin{center}
{\Large\bf LIBSVM: a Library for Support 
Vector Machines (Version 2.2)}

\bigskip

{\bf Chih-Chung Chang and
 Chih-Jen Lin\footnotemark[1]}

\end{center}

\begin{abstract}
\libsvm\ is a library
for support 
vector machines (SVM).
Its goal is to help users can
easily use SVM as a tool.
In this document, we present all 
its implementation details.
\end{abstract}

\footnotetext [1]
{
Department of Computer Science and
Information Engineering,
National Taiwan University, 
Taipei 106, Taiwan ({\tt cjlin@csie.ntu.edu.tw}).
}

\section{Introduction}

\libsvm\ is a library
for support 
vector classification (SVM) and regression.
Its goal is to let users can
easily use SVM as a tool.
In this document, we present all 
its implementation details.

In Section \ref{formulation},
we show formulations used in 
\libsvm:
$C$-support vector classification
($C$-SVC),
$\nu$-support vector classification
 ($\nu$-SVC),
distribution estimation (one-class SVM),
$\epsilon$-support vector regression ($\epsilon$-SVR),
and
$\nu$-support vector regression ($\nu$-SVR).
We discuss the implementation 
of solving quadratic problems in Section 
\ref{qp}. 
Section \ref{shrinking}
describes two implementation techniques:
shrinking and caching.
Then in Section \ref{multi}
we discuss the implementation of
multi-class classification.

\section{Formulations}
\label{formulation}

\subsection{$C$-Support Vector Classification (Binary
Case)}
Given training vectors $x_i \in R^n, i = 1, 
\ldots,l$, 
in two classes, 
and a vector
$y\in R^l$ such that
$
y_i \in \{1, -1 \}$,
$C$-SVC 
\shortcite{CC95a,VV98a}
solves the following
primal problem:
\begin{eqnarray}
 \min_{w,b,\xi} && \frac{1}{2} w^T w
+ C \sum_{i=1}^l \xi_i \label{primal} \\
&& y_i (w^T \phi(x_i) + b) \geq 1 - \xi_i,
\nonumber \\
&& \xi_i \geq 0, i = 1, \ldots, l. \nonumber 
\end{eqnarray}
Its dual is
\begin{eqnarray}
\min_{\alpha} && \frac{1}{2}\alpha^TQ\alpha - e^T\alpha \nonumber \\
&& 0 \leq \alpha_i \leq C, \qquad i = 1, \ldots, l,   \label{svmqp}\\
&& y^T \alpha = 0, \nonumber
\end{eqnarray}
where $e$ is the vector of all ones,
$C$ is the upper bound,
$Q$ is an 
$l$ by $l$ positive semidefinite matrix, 
$Q_{ij} \equiv y_i y_jK(x_i,x_j)$,
and $K(x_i,x_j) \equiv \phi(x_i)^T
\phi(x_j)$ is the kernel.
Here training vectors $x_i$ are mapped into a
higher (maybe infinite) dimensional space 
by the function $\phi$.


The 
decision function is
\begin{equation*}
f(x) = 
sign(\sum_{i=1}^l 
y_i \alpha_i K(x_i, x) + b).  
\end{equation*}

\subsection{$\nu$-Support Vector Classification
(Binary Case)}
The $\nu$-support vector 
classification
\shortcite{BS00a} 
uses a new parameter $\nu$
which let one control the number
of support vectors and errors. 
The parameter $\nu \in (0,1]$ 
is an upper bound on the fraction
of training errors and
a lower bound of the fraction of
support vectors.

Details of the algorithm implemented
in \libsvm\ can be found in \shortcite{CC00a}.
Given training vectors $x_i \in R^n, i = 1, 
\ldots,l$, 
in two classes, 
and a vector
$y\in R^l$ such that
$
y_i \in \{1, -1 \}$,
the primal form considered is:
\begin{eqnarray*}
\min_{w,b,\xi,\rho} && \frac{1}{2} w^T w 
 - \nu \rho
+ \frac{1}{l} \sum_{i=1}^l \xi_i  \\
&& y_i (w^T \phi(x_i) + b) \geq \rho - \xi_i, 
\nonumber \\
&& \xi_i \geq 0, i = 1, \ldots, l, \rho \geq 0. 
\nonumber 
\end{eqnarray*}

The dual is:
\begin{eqnarray}
 \min_{\alpha} && \frac{1}{2}\alpha^T
Q\alpha  \nonumber  \\
 && 0 \leq \alpha_i \leq 1/l, \qquad i = 1, \ldots, l,   \label{newsvmqp1} \\
&& e^T \alpha \geq \nu,  \nonumber\\
&& y^T \alpha = 0.  \nonumber 
\end{eqnarray}
where 
$Q_{ij} \equiv y_i y_jK(x_i,x_j)$.

The decision function is:
\begin{equation*}
f(x) = 
sign(\sum_{i=1}^l 
y_i \alpha_i (K(x_i, x) + b)).
\end{equation*}

In \shortcite{DJC99a,CC00a}, it has been 
shown that $e^T \alpha \geq \nu$
can be replaced by
$e^T \alpha = \nu$.
With this property,
in \libsvm, we solve a scaled version of 
(\ref{newsvmqp1}):
\begin{eqnarray}
\min_{\alpha} && \frac{1}{2}\alpha^T
Q\alpha  \nonumber  \\
 && 0 \leq \alpha_i \leq 1, \qquad i = 1, \ldots, l,    \nonumber \\
&& e^T \alpha = \nu l,  \nonumber \\
&& y^T \alpha = 0.  \nonumber
\end{eqnarray}

We output 
$\alpha/\rho$ so the
computed decision function is:
\begin{equation*}
f(x) = 
sign(\sum_{i=1}^l 
y_i (\alpha_i/\rho) (K(x_i, x) + b))
\end{equation*}
and then two margins are 
\begin{equation*}
 y_i (w^T \phi(x_i) + b) = \pm 1
\end{equation*}
which are the same as those of $C$-SVC.

\subsection{Distribution Estimation 
(One-class SVM)}

One-class SVM 
was  proposed 
by Sch\"{o}lkopf et al.~\citeyear{BS99b}
for 
estimating the support of 
a high-dimensional 
distribution. 
Given training vectors $x_i \in R^n, i = 1, 
\ldots,l$ without any class information, 
the primal form 
in \shortcite{BS99b} is:
\begin{eqnarray*}
\min_{w,b,\xi,\rho} && \frac{1}{2} w^T w - \rho
+ \frac{1}{\nu l} \sum_{i=1}^l \xi_i  \\
&& w^T \phi(x_i) \geq \rho - \xi_i, 
\nonumber \\
&& \xi_i \geq 0, i = 1, \ldots, l. 
 \nonumber 
\end{eqnarray*}
The dual is:
\begin{eqnarray}
 \min_{\alpha} && \frac{1}{2}\alpha^TQ\alpha  \nonumber  \\
&&0 \leq \alpha_i \leq 1/(\nu l), 
i = 1, \ldots, l,    \label{newsvmqp}\\
&&  e^T \alpha =1,  \nonumber
\end{eqnarray}
where 
$Q_{ij} =
K(x_i,x_j) \equiv \phi(x_i)^T
\phi(x_j)$.
 
In \libsvm\ we solve a scaled 
version of (\ref{newsvmqp}):

\begin{eqnarray}
\min && \frac{1}{2}\alpha^T
Q\alpha  \nonumber  \\
 && 0 \leq \alpha_i \leq 1, \qquad i = 1, \ldots, l,    \nonumber \\
&& e^T \alpha = \nu l.  \nonumber
\end{eqnarray}

The decision function is
\begin{equation*}
f(x) = 
sign(\sum_{i=1}^l 
\alpha_i K(x_i, x) - \rho).  
\end{equation*}

\subsection{$\epsilon$-Support Vector Regression
($\epsilon$-SVR)}
Given a set of data points,
$\{(x_1, z_1), \ldots, 
(x_l, z_l)\}$, such that
$x_i \in R^n$ is
an input and $z_i \in R^1$
is a target output, 
the standard form of support vector 
regression 
\shortcite{VV98a}
is:
\begin{eqnarray*}
 \min_{w,b,\xi,\xi^*} &&\frac{1}{2} 
w^Tw 
+ C \sum_{i=1}^l
\xi_i
+ C \sum_{i=1}^l
\xi_i^* 
 \\
&& z_i - w^T \phi(x_i) - b \leq 
\epsilon + \xi_i ,  \\
&& w^T \phi(x_i) + b -z_i\leq 
\epsilon + \xi_i^* ,  \\
&& \xi_i, \xi_i^* \geq 0, 
i = 1, \ldots, l. 
\end{eqnarray*}

The dual is:
\begin{eqnarray}
 \min_{\alpha, \alpha^*} && \frac{1}{2}
(\alpha - \alpha^*)^T
Q(\alpha - \alpha^*) 
+\epsilon 
\sum_{i=1}^l (\alpha_i
+ \alpha_i^*)
+ \sum_{i=1}^l 
z_i (\alpha_i - \alpha^*_i) 
\nonumber  \\
&& 
\sum_{i=1}^l (\alpha_i 
- \alpha_i^*) = 0,0 \leq \alpha_i,
\alpha_i^* \leq C, i = 1, \ldots,l,    \label{svr}
\end{eqnarray}
where $Q_{ij} =
K(x_i,x_j) \equiv \phi(x_i)^T
\phi(x_j)$.

The decision function is:
\begin{equation*}
f(x) 
= \sum_{i=1}^l 
(-\alpha_i + \alpha_i^*) K(x_i,x)
+ b.
\end{equation*}

\subsection{$\nu$-Support Vector Regression
($\nu$-SVR)}
Similar to $\nu$-SVC,
for regression,
\shortcite{BS00a} 
use a parameter $\nu$ to control
the number of support vectors.
However, unlike $\nu$-SVC where 
$C$ is replaced by $\nu$
here $\nu$ replaces the parameter
$\epsilon$ of $\epsilon$-SVR.
The primal form is 
\begin{eqnarray}
 \min_{w,b,\xi,\xi^*,\epsilon} && \frac{1}{2} 
w^Tw 
+ C (\nu \epsilon 
+ \frac{1}{l} 
\sum_{i=1}^l (\xi_i + \xi^*_i))\label{primal1}  \\
&& (w^T \phi({x_i}) + b) - z_i 
\leq \epsilon  + \xi_i, 
\nonumber \\
&& z_i - (w^T \phi({x_i}) + b)
\leq \epsilon  + \xi^*_i,
\nonumber \\
&& \xi_i , \xi_i^* \geq 0, i = 1, \ldots, l, 
\; \epsilon \geq 0. \nonumber 
\end{eqnarray}
and the dual is
\begin{eqnarray}
\min_{\alpha,\alpha^*} && \frac{1}{2}
(\alpha-\alpha^*)^T 
Q (\alpha -\alpha^*) 
+ z^T (\alpha - \alpha^*) \nonumber  \\
&&  e^T(\alpha - \alpha^*) = 0, \;
 e^T (\alpha+\alpha^*) \leq C\nu,  \nonumber \\
&&0 \leq \alpha_i, \alpha^*_i \leq C/l, \qquad i = 1, \ldots, l,    \label{newsvmqp2}
\end{eqnarray}

Similarly the inequality 
$
 e^T (\alpha+\alpha^*) \leq C\nu$
can be replaced by an equality.
In \libsvm, we consider
$C \leftarrow C/l$ so the dual problem solved
is:
\begin{eqnarray}
 \min_{\alpha, \alpha^*} && \frac{1}{2}
(\alpha-\alpha^*)^T 
Q (\alpha -\alpha^*) 
+ z^T (\alpha - \alpha^*) \nonumber  \\
&&  e^T(\alpha - \alpha^*) = 0, \;
 e^T (\alpha+\alpha^*) = Cl\nu,  \nonumber \\
&&0 \leq \alpha_i, \alpha^*_i \leq C, \qquad i = 1, \ldots, l.    \label{nusvr}
\end{eqnarray}
Then the decision function is
\begin{equation*}
f(x) 
= \sum_{i=1}^l 
(-\alpha_i + \alpha_i^*) K(x_i,x)
+ b,
\end{equation*}
the same as that of $\epsilon$-SVR.

\section{Solving the Quadratic Problems}
\label{qp}

\subsection{The Decomposition Method
for $C$-SVC, $\epsilon$-SVR, and 
One-class SVM}

We consider the following general form
of $C$-SVC, $\epsilon$-SVR, and one-class
SVM:
\begin{eqnarray}
\min_{\alpha} && \frac{1}{2}
\alpha^T Q \alpha + p^T \alpha 
\nonumber \\
&& y^T \alpha = \Delta,
  \label{general} \\
&& 0 \leq \alpha_t \leq C, t 
= 1, \ldots, l, \nonumber 
\end{eqnarray}
where $y_t = 
\pm 1, t = 1, \ldots, l$.
It can be clearly seen that
$C$-SVC and one-class SVM are
already in the form of (\ref{general}).
For $\epsilon$-SVR, we consider the following 
reformulation of 
(\ref{svr}):
\begin{eqnarray}
 \min_{\alpha, \alpha^*} && \frac{1}{2}
\begin{bmatrix}
\alpha^T, (\alpha^*)^T
\end{bmatrix}
\begin{bmatrix}
Q & -Q \nonumber \\
-Q & Q 
\end{bmatrix}
\begin{bmatrix}
\alpha \\ 
\alpha^*
\end{bmatrix}
+
\begin{bmatrix}
\epsilon e^T + z^T,
\epsilon e^T - z^T
\end{bmatrix}
\begin{bmatrix}
\alpha \\ \alpha^*
\end{bmatrix}
 \\
&& 
y^T 
\begin{bmatrix}
\alpha \\ \alpha^*
\end{bmatrix}
= 0,0 \leq \alpha_t,
\alpha_t^* \leq C, t = 1, \ldots,l,   
\label{svr1}
\end{eqnarray}
where
$y$ is a $2l$ by 1 vector with
$y_t = 1, t = 1, 
\ldots, l$ and $y_t = -1,
t = l+1, \ldots, 2l$.

The difficulty of 
solving (\ref{general}) is the density of
$Q$ because $Q_{ij}$ is in general not zero.
In \libsvm, we consider the decomposition 
method to conquer this difficulty.
Some work on this method are, for example, 
Osuna et al.
\citeyear{EO97a},
Joachims \citeyear{TJ98a},
Platt \citeyear{JP98a},
and Saunders et al. \citeyear{CS98a}.

\begin{algorithm1}[Decomposition method]
\label{decomp}
  \begin{enumerate}
  \item Given a number $q \leq l$ as
the size of the working set. 
Find $\alpha^1$ as the initial solution. 
Set $k = 1$.
\item 
If $\alpha^k$ is an optimal solution
of (\ref{svmqp}), stop. Otherwise,
find a working set $B 
\subset \{1, \ldots, l\}$ whose size
is $q$.
Define 
$N \equiv \{1, \ldots, l\} \backslash B$
and $\alpha^k_B$ and $\alpha^k_N$
to be sub-vectors of $\alpha^k$
corresponding to $B$ and $N$,
respectively.
\item Solve the following sub-problem
with the variable $\alpha_B$:
\begin{eqnarray}
 \min_{\alpha_B} && \frac{1}{2} \alpha_B^T Q_{BB} \alpha_B 
+ (p_B + Q_{BN} \alpha^k_N)^T \alpha_B \nonumber \\
&& 0 \leq (\alpha_B)_t \leq C,  
t = 1, \ldots, q, \label{subqp} \\
&& y_B^T \alpha_B = \Delta -y_N^T \alpha^k_N,   \nonumber 
\end{eqnarray}
where 
$\left[
\begin{smallmatrix}
 Q_{BB} & Q_{BN} \\
Q_{NB} & Q_{NN}
\end{smallmatrix}
\right]
$
is a permutation of the matrix $Q$.

\item 
Set $\alpha^{k+1}_B$
to be the optimal solution of (\ref{subqp}) 
and $\alpha^{k+1}_N
\equiv \alpha^k_N$.
Set $k 
\leftarrow k + 1$ and goto 
Step 2.
  \end{enumerate}
\end{algorithm1}
The basic idea of the
decomposition method 
is that in each iteration,
the indices $\{1, \ldots, l \}$ of the 
training set are separated to two
sets $B$ and $N$, where
$B$ is the working set
and $N 
= \{1, \ldots, l\} \backslash B$.
The vector $\alpha_N$ is fixed 
so the objective value becomes
$\frac{1}{2} \alpha_B^T Q_{BB} \alpha_B 
- (p_B - Q_{BN} \alpha_N)^T \alpha_B 
+ \frac{1}{2} \alpha_N^T Q_{NN} \alpha_N - 
p_N^T \alpha_N$. 
Then a sub-problem 
with the variable $\alpha_B$,
i.e. (\ref{subqp}),
is solved.
Note that $B$ is updated in each
iteration. To simplify the notation,
we simply use $B$ instead of $B^k$.
The strict decrease of the objective
function holds and the
theoretical convergence was studied
in Chang et al. \citeyear{CC00b},
Keerthi \citeyear{SSK00a},
and Lin \citeyear{CJL00b}.



\subsection{Working Set Selection
and
Stopping Criteria 
for $C$-SVC, 
$\epsilon$-SVR, and One-class SVM} 

An important issue of the decomposition
method is the selection of the working 
set $B$.
The KKT condition of (\ref{general})
shows that there is a scalar $b$
such that
\begin{equation}
\label{kkt}
\begin{array}{lllll}
y_t = 1 , \alpha_t < C \qquad
& \Rightarrow &
(Q \alpha + p)_t + b \geq 0 
& \Rightarrow &
b \geq - (Q \alpha +p)_t 
= -\nabla f(\alpha_k)_t,
\\
y_t = -1 , \alpha_t > 0 \qquad
&  \Rightarrow &
(Q \alpha + p)_t - b \leq 0
&  \Rightarrow &
b \geq (Q \alpha + p)_t 
= \nabla f(\alpha_k)_t,
\\
y_t = -1 , \alpha_t < C \qquad
&   \Rightarrow &
(Q \alpha + p)_t - b \geq 0 
&  \Rightarrow &
b \leq (Q \alpha + p)_t
= \nabla f(\alpha_k)_t,
\\
y_t = 1, \alpha_t > 0 \qquad
&  \Rightarrow &
(Q \alpha + p)_t + b \leq 0 
&  \Rightarrow &
b \leq  - (Q \alpha + p)_t
= -\nabla f(\alpha_k)_t,
\end{array}
\end{equation}
where 
$f(\alpha)
\equiv \frac{1}{2} \alpha^T Q \alpha
- e^T \alpha$
and
$\nabla f(\alpha_k)$ is the gradient
of 
$f(\alpha)$ at $\alpha_k$.
We consider 
\begin{eqnarray}
&& i \equiv 
\mbox{argmax}
(\{-\nabla f(\alpha_k)_t 
\mid
y_t  = 1, \alpha_t < C\},
\{\nabla f(\alpha_k)_t 
\mid
y_t  = -1, \alpha_t > 0\}), 
\label{eq1}
\\
&& j \equiv
\mbox{argmin}
(\{\nabla f(\alpha_k)_t 
\mid
y_t  = -1, \alpha_t < C\},
\{-\nabla f(\alpha_k)_t 
\mid
y_t  = 1, \alpha_t > 0\}).
\label{eq2}
\end{eqnarray}
We then use 
$B = \{i, j\}$ as the working
set for the sub-problem
(\ref{subqp}) of the decomposition method.
Here $i$ and $j$ are the two elements which
violate the KKT condition the most.
The idea of using only two 
elements for the working set 
are from the 
Sequential 
Minimal Optimization (SMO) by
Platt \citeyear{JP98a}.
The main advantage is that an analytic solution
of (\ref{subqp}) can be obtained so there
is no need to use an optimization software.
Note that 
(\ref{eq1}) and (\ref{eq2}) 
are a special case of the
working set selection of the 
software \svmlight\ by
 Joachims \citeyear{TJ98a}.
To be more precise,
in \svmlight, 
the following problem is solved:
\begin{eqnarray}
\min_d && \nabla f(\alpha_k)^T d \nonumber \\
&& y^Td = 0, \;  -1 \leq d \leq 1, \label{svmlight1} \\
&& d_t \geq 0, \mbox{ if } (\alpha_k)_t = 0, \; 
d_t \leq 0, \mbox{ if } (\alpha_k)_t = C, \nonumber \\
&& |\{d_t \mid d_t \neq 0\}| = q. \label{svmlightq}
\end{eqnarray}
Note that $| \{ d_t \mid d_t \neq 0 \}|$
means the number of components of $d$ which are not zero.
The constraint (\ref{svmlightq}) implies that a 
descent direction involving only $q$ variables is
obtained. Then  components of $\alpha_k$ with 
non-zero $d_t$ are included in the working set
$B$ which is used
to construct the sub-problem (\ref{subqp}). 
Note that $d$ is only used for identifying 
$B$ but not as a search direction.

It can clearly seen that if $q=2$,
the solution of
(\ref{svmlight1}) is
\begin{eqnarray*}
&& i = 
\mbox{argmin}
\{\nabla f(\alpha_k)_t d_t
\mid
y_t d_t = 1; 
 d_t \geq 0, \mbox{ if }
\alpha_t = 0;
 d_t \leq 0, \mbox{ if }
\alpha_t = C.
\}, \\
&& j = \mbox{argmin}
\{\nabla f(\alpha_k)_t d_t
\mid
y_t d_t = -1; 
 d_t \geq 0, \mbox{ if }
\alpha_t = 0;
 d_t \leq 0, \mbox{ if }
\alpha_t = C.
\},
\end{eqnarray*}
which is the same as (\ref{eq1}) and
(\ref{eq2}).
We also notice that
this is also 
the modification 2 of 
the algorithms in \cite{SSK99a}.

We then define
\begin{equation}
\label{O1}
g_{i} 
\equiv \begin{cases}
-\nabla f(\alpha_k)_{i}
& \mbox{if }
y_{i} = 1, \alpha_{i} < C,\\
\nabla f(\alpha_k)_{i}
& \mbox{if }
y_{i} = -1, \alpha_{i} > 0,
\end{cases}
\end{equation}
and 
\begin{equation}
\label{O2}
g_{j} 
\equiv \begin{cases}
-\nabla f(\alpha_k)_{j}
& \mbox{if }
y_{j} = -1, \alpha_{j} < C,\\
\nabla f(\alpha_k)_{j}
& \mbox{if }
y_{j} = 1, \alpha_{j} > 0.
\end{cases}
\end{equation}
From (\ref{kkt}),
we know
\begin{equation}
\label{stop_noepsilon}
g_{i}  \leq - g_{j}
\end{equation}
implies 
that 
$\alpha_k$ is an optimal solution of 
(\ref{svmqp}).
Practically the stopping
criteria can be written and
implemented as: 
\begin{equation}
\label{stop}
g_{i}  \leq - g_{j} + \epsilon,
\end{equation}
where $\epsilon$ is a small
positive 
number.


\subsection{The Decomposition Method
for $\nu$-SVC and $\nu$-SVR}

Both $\nu$-SVC and $\nu$-SVR can be 
considered as the following general 
form:
\begin{eqnarray}
\min_{\alpha} && \frac{1}{2}
\alpha^T Q \alpha + p^T \alpha 
\nonumber \\
&& y^T \alpha = \Delta_1,
  \label{general1} \\
&& e^T \alpha = \Delta_2,
\nonumber \\
&& 0 \leq \alpha_t \leq C, t 
= 1, \ldots, l. \nonumber 
\end{eqnarray}
The decomposition method is the
same as Algorithm
\ref{decomp}
but the sub-problem is 
different:
\begin{eqnarray}
 \min_{\alpha_B} && \frac{1}{2} \alpha_B^T Q_{BB} \alpha_B +
(p_B + Q_{BN} \alpha_N^k)^T \alpha_B \nonumber \\
&& y_B^T \alpha_B = \Delta_1 -y_N^T \alpha_N,  \label{subqp1}  \\
&& e_B^T \alpha_B = \Delta_2 -e_N^T \alpha_N,   \nonumber\\
&& 0 \leq (\alpha_B)_t \leq C,  t = 1, \ldots, q.   \nonumber
\end{eqnarray}

Now if only two elements
$i$ and $j$ are selected but
$y_{i} \neq y_{j}$, then
$y_B^T \alpha_B = \Delta_1 -y_N^T \alpha_N$
and 
$e_B^T \alpha_B = \Delta_2 -e_N^T \alpha_N$
imply that there
are two equations with two variables
so
(\ref{subqp1}) has only one feasible point.
Therefore, from 
$\alpha_k$, the solution cannot
be moved any more.
On the other hand, if 
$y_{i} = y_{j}$,
$y_B^T \alpha_B = \Delta_1 -y_N^T \alpha_N$
and 
$e_B^T \alpha_B = \Delta_2 -e_N^T \alpha_N$
become the same equality so there are 
multiple feasible solutions.
Therefore,
we have to keep $y_{i} = 
y_{j}$ while selecting the working set.

The KKT condition
of (\ref{general1}) shows
\begin{eqnarray*}
\nabla f(\alpha)_i - \rho + by_i
& = & 0
\mbox{ if } 0 < \alpha_i < C, \\
& \geq & 0
\mbox{ if }  \alpha_i = 0, \\
& \leq & 0
\mbox{ if }  \alpha_i = C.
\end{eqnarray*}
Define
\begin{equation*}
r_1 \equiv \rho - b, \; 
r_2 \equiv \rho + b.
\end{equation*}
If $y_i = 1$ the KKT condition becomes
\begin{eqnarray}
\nabla f(\alpha)_i - r_1
& \geq & 0 
\mbox{ if }  \alpha_i < C, \label{y1free} \\
& \leq & 0 
\mbox{ if }  \alpha_i > 0. \nonumber
\end{eqnarray}
On the other hand, 
if $y_i = -1$, it is
\begin{eqnarray}
\nabla f(\alpha)_i - r_2
& \geq & 0 
\mbox{ if }  \alpha_i < C, \label{y2free} \\
& \leq & 0 
\mbox{ if }  \alpha_i > 0. \nonumber
\end{eqnarray}

Hence, indices
$i$ and $j$ are selected from 
either
\begin{equation}
\label{selyp}
\begin{array}{l}
 i = \mbox{argmin}_t \{ \nabla f(\alpha_k)_t |
y_t = 1, (\alpha_k)_t < C \},  \\
 j = \mbox{argmax}_t \{ \nabla f(\alpha_k)_t | 
y_t = 1, (\alpha_k)_t > 0\}, 
\end{array}
\end{equation}
or
\begin{equation}
\label{selyn}
\begin{array}{l}
i = \mbox{argmin}_t \{ \nabla f(\alpha_k)_t | 
y_t = -1, (\alpha_k)_t < C \},  \\
j = \mbox{argmax}_t \{ \nabla f(\alpha_k)_t | 
y_t = -1, (\alpha_k)_t > 0\}, 
\end{array}
\end{equation}
depending on which one gives a smaller
$\nabla f(\alpha_k)_i - \nabla f(\alpha_k)_j$
(i.e. larger KKT violations).

This was first proposed in 
\shortcite{SSK00a}. 
Some details can be seen in 
\shortcite[Section 4]{CC00a}.

The stopping criterion 
will be described in 
Section \ref{bandrho}.

\subsection{Analytical Solutions}
Now (\ref{subqp}) is a simple problem
with only two variables:
\begin{eqnarray}
 \min_{\alpha_i, \alpha_j} & & \frac{1}{2}
\begin{bmatrix}
\alpha_{i} & 
\alpha_{j}
\end{bmatrix}
\begin{bmatrix}
Q_{ii} & Q_{ij} \\
Q_{ji} & Q_{jj} 
\end{bmatrix}
\begin{bmatrix}
\alpha_i \\
\alpha_j 
\end{bmatrix} 
+ 
(Q_{i,N}\alpha_N -1)
\alpha_i
+
(Q_{j,N}\alpha_N -1)
\alpha_j
\nonumber \\
&& 
y_i \alpha_i
+ 
y_j \alpha_j  
= \Delta_1 - y_N^T \alpha_N^k, 
\label{2varqp} \\
&& 0 \leq 
\alpha_i, 
\alpha_j 
 \leq C. \nonumber 
\end{eqnarray}
Platt  
\citeyear{JP98a} substituted 
$\alpha_{i}
= y_{i} 
(\Delta_1 -y_N^T \alpha_N 
- y_{j} \alpha_{j})$
into the objective function
of (\ref{subqp}) and solved
an unconstrained 
minimization on 
$\alpha_{j}$.
The following solution is 
obtained:
\begin{equation}
\label{alphanew}
\alpha_{j}^{new}
= 
\begin{cases}
\alpha_{j} + 
\frac{- G_i
-G_j}
{Q_{i i}+Q_{j j}+
2Q_{i j}}
& \mbox{ if } y_{i} \neq
y_{j}, \\
\alpha_{j} + 
\frac{
G_i - G_j
}
{Q_{i i}+Q_{j j}-
2Q_{i j}}
& \mbox{ if } y_{i} =
y_{j},
\end{cases}
\end{equation}
where 
\begin{equation*}
  G_i \equiv \nabla f(\alpha)_i 
\mbox{ and }
  G_j \equiv \nabla f(\alpha)_j.
\end{equation*}
If this value is outside
the possible region
of $\alpha_{j}$
(that is, exceeds the feasible
region of (\ref{subqp})), 
the value of (\ref{alphanew})
is clipped into the feasible
region and is assigned as the new 
$\alpha_{j}$.
For example, if 
$y_i \neq y_j$
and 
$C \leq \alpha_i + 
\alpha_j \leq 2C$,
$\alpha_j^{new}$ must satisfy
\begin{equation*}
L 
\equiv
\alpha_i + \alpha_j
- C \leq
\alpha_j^{new} 
\leq C \equiv H  
\end{equation*}
as the largest value 
$\alpha_i^{new}$
and
$\alpha_j^{new}$
can be 
is $C$.
Hence if
\begin{equation*}
\alpha_{j} + 
\frac{-G_i- 
G_j}
{Q_{i i}+Q_{j j}+
2Q_{i j}}
\leq L,  
\end{equation*}
we define
$\alpha_j^{new} \equiv L$
and then 
\begin{equation}
  \label{error}
\alpha_i^{new} 
= \alpha_i
+ \alpha_j - \alpha_j^{new}= C.
\end{equation}
However, numerically the last equality of
(\ref{error}) may not hold.
The floating-point operation will cause 
that 
\begin{eqnarray*}
&&\alpha_i
+ \alpha_j - \alpha_j^{new}\\
&= & 
\alpha_i
+ \alpha_j - 
(\alpha_i
+ \alpha_j - C) \\
& \neq & C.
\end{eqnarray*}
Therefore, in most SVM software, 
a small tolerance $\epsilon_a$
is specified and 
all 
$\alpha_i \geq C - \epsilon_a$ are 
considered to be at the upper bound and
all
$\alpha_i \leq \epsilon_a$ are considered 
to be zero.
This is necessary as otherwise some data
will be wrongly considered as support
vectors. In addition, the calculation 
of the bias term $b$ also need
correct identification of those
$\alpha_i$ which are free
(i.e. $0 < \alpha_i < C$).

In \shortcite{CWH99a}, it has been 
pointed out that if 
all bounded $\alpha_i$ obtain their
values using direct assignments,
there is no need of using an $\epsilon_a$.
To be more precise, for 
floating-point computation, if $\alpha_i
\leftarrow C$ is assigned somewhere,
a future 
floating-point comparison between
$C$ and $C$ returns true as they 
both have the same internal 
representation.
Therefore, we use the following 
segment of code (if 
$y_i \neq y_j$) where all numbers at bounds
are specifically assigned:

\addtolength{\baselineskip}{-6pt}
\begin{verbatim}
        if(y[i]!=y[j])
        {
                double delta = (-G[i]-G[j])/(Q_i[i]+Q_j[j]+2*Q_i[j]);
                double diff = alpha[i] - alpha[j];
                alpha[i] += delta;
                alpha[j] += delta;
                if(diff > 0)
                {
                        if(alpha[i] > C)
                        {
                                alpha[i] = C;
                                alpha[j] = C - diff;
                        }
                        else if(alpha[j] < 0)
                        {
                                alpha[j] = 0;
                                alpha[i] = diff;
                        }
                }
                {
                        if(alpha[j] > C)
                        {
                                alpha[j] = C;
                                alpha[i] = C + diff;
                        }
                        else if(alpha[i] < 0)
                        {
                                alpha[i] = 0;
                                alpha[j] = -diff;
                        }
                }
        }
\end{verbatim}
\addtolength{\baselineskip}{+6pt}

Though this involves a little more
operations,
as solving the analytic solution
of (\ref{2varqp})
takes only a small portion
of the total computational
time, the difference is negligible.

Another minor problem 
is that the denominator
in (\ref{alphanew}) 
is sometime zero.
When this happens,
\begin{equation*}
Q_{ij}  = \pm (Q_{ii} + Q_{ij})/2 
\end{equation*}
so
\begin{eqnarray*}
&& Q_{ii} Q_{jj} - Q_{ij}^2 \\
&=&   Q_{ii} Q_{jj} - (Q_{ii} + Q_{jj})^2/4 \\
&= & -(Q_{ii} - Q_{ij})^2/4 \leq 0.
\end{eqnarray*}
Therefore, we know if $Q_{BB}$ is positive 
definite, 
the zero denominator
in (\ref{alphanew}) 
never happens. 
Hence this problem happens only if 
$Q_{BB}$ is a 2 by 2 singular
matrix.
We discuss some situations where
$Q_{BB}$ may be singular.
\begin{enumerate}
\item The function 
$\phi$ does not map
data to independent vectors
in a higher-dimensional space
so $Q$ is only positive 
semidefinite. For example,
using the linear
or low-degree polynomial
kernels.
Then it is possible that 
a singular $Q_{BB}$ 
is picked.
\item Some kernels have 
a nice property that 
$\phi(x_i), i = 1, \ldots,
l$ are independent if 
$x_i \neq x_j$. 
Thus $Q$ as well as 
all possible 
$Q_{BB}$
are positive definite.
An example is the 
RBF kernel
\cite{CAM86a}.
However, for many practical
data we have encountered,
some of $x_i, i = 1,
\ldots, l$ are the same.
Therefore, several rows
(columns) of $Q$ are 
exactly the same so 
$Q_{BB}$ may be singular.
\end{enumerate}

However, 
even if
the denominator
of (\ref{alphanew}) 
is zero, there are no
numerical problems:
From (\ref{stop}), we note
that
\begin{equation*}
g_{i} + g_{j} \geq \epsilon
\end{equation*}
during the iterative process.
Since
\begin{eqnarray*}
g_i + g_j
& = & \pm(-G_i - G_j) \mbox{ if } y_i \neq y_j,
\mbox{ and } \\
g_i + g_j
& = & \pm(G_i - G_j)
\mbox{ if } y_i = y_j, 
\end{eqnarray*}
the situation of 
$0/0$ which is defined as 
NaN by IEEE standard does not 
appear. Therefore,
(\ref{alphanew}) returns
$\pm \infty$ 
if
the denominator is zero which 
can be 
detected as special quantity 
of 
IEEE standard
and clipped to
regular
      floating point number.

\subsection{The Calculation of 
$b$ or $\rho$}
\label{bandrho}

After the solution $\alpha$ of
the dual optimization problem is obtained,
the variables $b$ or $\rho$
must be calculated as they are used
in the decision function.
Here we simply describe the
case of $\nu$-SVC and
$\nu$-SVR where 
$b$ and $\rho$ both appear.
Other formulations are simplified
cases of them.

The KKT condition
of (\ref{general1}) has been shown in 
(\ref{y1free})
and 
(\ref{y2free}). Now we consider the case
of $y_i = 1$.
If there are $\alpha_i$ 
which  satisfy 
$0 < \alpha_i < C$,
then
$r_1 = \nabla f(\alpha)_i$.
Practically to avoid numerical errors,
we average them:
\begin{equation*}
  r_1 = 
\frac{\sum_{0 < \alpha_i < C, y_i = 1}
\nabla f(\alpha)_i }
{\sum_{0 < \alpha_i < C, y_i = 1} 1}.
\end{equation*}
On the other hand, if there is no such
$\alpha_i$,
as $r_1$ must satisfy
\begin{equation*}
\max_{\alpha_i = C, y_i = 1} \nabla f(\alpha)_i
\leq 
r_1 \leq \min_{\alpha_i = 0, y_i = 1} 
\nabla f(\alpha)_i ,
\end{equation*}
we take $r_1$ the midpoint of the range.

For $y_i = -1$, we can calculate $r_2$
in a similar way.

After $r_1$ and $r_2$ are obtained, 
\begin{equation*}
  \rho = \frac{r_1 + r_2}{2}
\mbox{ and }
-b = \frac{r_1 - r_2}{2}.
\end{equation*}

Note that the KKT condition can be written as
\begin{equation*}
\max_{\alpha_i >0, y_i = 1} \nabla f(\alpha)_i
\leq 
\min_{\alpha_i <C, y_i = 1} 
\nabla f(\alpha)_i 
  \end{equation*}
and
\begin{equation*}
\max_{\alpha_i >0, y_i = -1} 
\nabla f(\alpha)_i
\leq 
\min_{\alpha_i <C, y_i = -1} 
\nabla f(\alpha)_i.
\end{equation*}
Hence practically we can use
the following stopping criterion:
The decomposition method
stops if the iterate $\alpha$
satisfies the following condition:
\begin{eqnarray}
& \max( &
-\min_{\alpha_i <C, y_i = 1} 
\nabla f(\alpha)_i 
+
\max_{\alpha_i >0, y_i = 1} \nabla f(\alpha)_i, \\
\label{nustop}
&& -\min_{\alpha_i <C, y_i = -1} 
\nabla f(\alpha)_i
+
\max_{\alpha_i >0, y_i = -1} 
\nabla f(\alpha)_i
)
< \epsilon,
\end{eqnarray}
where $\epsilon>0$ is a chosen
stopping tolerance.

\section{Shrinking and Caching}
\label{shrinking}

Since for many problems
the number of free support 
vectors
(i.e. $0 < \alpha_i < C$) is small,
the shrinking technique  reduces the 
size of the working problem
without considering some bounded variables
\shortcite{TJ98a}.
Near the end of 
the iterative process, the 
decomposition method
identifies a possible set $A$
where all final free $\alpha_i$
may reside in.
Then instead of solving the whole
problem (\ref{svmqp}), 
the decomposition method
works on a smaller
 problem:
\begin{eqnarray}
\min_{\alpha_A} && \frac{1}{2} \alpha_A^T Q_{AA} \alpha_A 
- (p_A - Q_{AN} \alpha^k_N)^T \alpha_A \nonumber \\
&& 0 \leq (\alpha_A)_t \leq C,  
t = 1, \ldots, q, \label{shrink} \\
&& y_A^T \alpha_A = \Delta -y_N^T \alpha^k_N,   \nonumber 
\end{eqnarray}
where $N = \{1, \ldots, l\} 
\backslash A$.

Of course this heuristic may fail if the
optimal solution of (\ref{shrink}) is not
the corresponding part of 
that of (\ref{svmqp}).
When that happens, the whole 
problem (\ref{svmqp}) is reoptimized starting
from a point
$\alpha$ where 
$\alpha_B$ is an optimal solution
of (\ref{shrink}) and
$\alpha_N$ are bounded variables 
identified before the shrinking process. 
Note that while
solving the shrinked problem 
(\ref{shrink}),
 we only know the
gradient 
$Q_{AA} \alpha_A + Q_{AN} \alpha_N
+ p_A$ of (\ref{shrink}).
Hence 
when problem (\ref{svmqp}) is reoptimized
we also have
to reconstruct the whole gradient
$\nabla f(\alpha_k)$, which is quite 
expensive.

Many implementations began the shrinking 
procedure near the end of the
iterative process,
in \libsvm however, we start the shrinking 
process from the 
beginning.
The procedure is as follows:
\begin{enumerate}
\item 
\label{do_shrink}
After every $\min(l, 1000)$
iterations, we try to shrink 
some variables.
Note that during the iterative
process
\begin{eqnarray}
&& \min
(\{\nabla f(\alpha_k)_t 
\mid
y_t  = -1, \alpha_t < C\},
\{-\nabla f(\alpha_k)_t 
\mid
y_t  = 1, \alpha_t > 0\})
= -g_j \nonumber \\
& < & g_i = 
\max
(\{-\nabla f(\alpha_k)_t 
\mid
y_t  = 1, \alpha_t < C\},
\{\nabla f(\alpha_k)_t 
\mid
y_t  = -1, \alpha_t > 0\}), 
\label{gji}
\end{eqnarray}
as (\ref{stop_noepsilon})
is not satisfied yet.

We conjecture that for those
\begin{equation}
\label{shrink1}
g_{t} 
= \begin{cases}
-\nabla f(\alpha_k)_{t}
& \mbox{if }
y_{t} = 1, \alpha_{t} < C,\\
\nabla f(\alpha_k)_{t}
& \mbox{if }
y_{t} = -1, \alpha_{t} > 0,
\end{cases}
\end{equation}
if 
\begin{equation}
g_t \leq -g_j,
\label{gtj}
\end{equation}
and $\alpha_t$
resides at a bound, then
the value of 
$\alpha_t$ may not change any more.
Hence we inactivate this variable.
Similarly, for those
\begin{equation}
\label{shrink2}
g_{t} 
\equiv \begin{cases}
-\nabla f(\alpha_k)_{t}
& \mbox{if }
y_{t} = -1, \alpha_{t} < C,\\
\nabla f(\alpha_k)_{t}
& \mbox{if }
y_{t} = 1, \alpha_{t} > 0,
\end{cases}
\end{equation}
if 
\begin{equation}
-g_t \geq g_i,  
\label{gti}
\end{equation}
and $\alpha_t$ is at a bound, it
is inactivated.
Thus the set $A$ of activated variables 
is dynamically reduced in 
every $\min(l,1000)$ iterations.

\item Of course the above shrinking
strategy may be too aggressive.
Since the decomposition method has 
a very slow convergence and
a large portion of 
iterations are spent for achieving 
the final digit of the required
accuracy, we 
would not like those iterations are wasted 
because of 
a wrongly shrinked problem
(\ref{shrink}). Hence when the
decomposition method first achieves 
the tolerance
\begin{equation*}
g_{i}  \leq - g_{j} + 10\epsilon,
\end{equation*}
where $\epsilon$ is the specified
stopping criteria, we reconstruct
the whole gradient.  Then based on the
correct information, we use criteria
like (\ref{shrink1}) and (\ref{shrink2})
to inactivate some variables and
the
decomposition method continues.
\end{enumerate}

Therefore, in \libsvm, the size of the
set $A$ of
(\ref{shrink}) is dynamically reduced.
To decrease the cost of reconstructing
the gradient 
$\nabla f(\alpha_k)$, during the iterations
we always keep 
\begin{equation*}
\bar{G}_i 
= \sum_{\alpha_j = C} Q_{ij}, i  = 1, \ldots, l. 
\end{equation*}
Then for the gradient 
$\nabla f(\alpha)_i, i \notin A$, we have
\begin{equation*}
\nabla f(\alpha)_i
= \sum_{j=1}^l 
Q_{ij} \alpha_j 
= C\bar{G}_i  + \sum_{0 < \alpha_j < C}
Q_{ij} \alpha_j.
\end{equation*}

For $\nu$-SVC and $\nu$-SVR, as the
stopping condition
(\ref{nustop})
is different from 
(\ref{stop}),
the shrinking strategies 
(\ref{gtj}) and (\ref{gti})
must be modified.
From (\ref{gji}),  now we have to 
separate two cases:
$y_t = 1$ and $y_t = -1$.
For $y_y = 1$,
(\ref{gji}) becomes 
\begin{eqnarray*}
&& \min
\{-\nabla f(\alpha_k)_t 
\mid
y_t  = 1, \alpha_t > 0\}
= -g_j \nonumber \\
& < & g_i = 
\max
\{-\nabla f(\alpha_k)_t 
\mid
y_t  = 1, \alpha_t < C\}
\end{eqnarray*}
so we inactivate those 
$\alpha_t$ with 
\begin{equation*}
-\nabla f(\alpha_k)
\leq -g_j 
\mbox{ if }
y_t = 1 \mbox{ and }
\alpha_t < C,  
\end{equation*}
and
\begin{equation*}
-\nabla f(\alpha_k)
\geq g_i 
\mbox{ if }
y_t = 1 \mbox{ and }
\alpha_t > 0.
\end{equation*}
The case for $y_t = -1$ is similar.

Another technique for reducing the
computational time 
is caching.
Since $Q$ is fully dense and may not be 
stored in the computer memory,
elements $Q_{ij}$ are calculated
as needed.
Then
usually a special storage using the idea of a cache
is used to store recently used $Q_{ij}$
\shortcite{TJ98a}.
Hence the computational cost of later iterations
can be reduced. 

In \libsvm, we implement a simple
least-recent-use strategy for the cache.
We dynamically cache only recently used
columns of $Q_{AA}$ of 
(\ref{shrink}).

\section{Multi-class classification}
\label{multi}

We use the ``one-against-one'' approach
\shortcite{SK90a} in which 
$k(k-1)/2$ classifiers 
are constructed and
each one trains data from
two different classes.
The first use of this strategy
on SVM was in
\shortcite{JF96a,Kreel99}.
For training data from the $i$th
 and the $j$th classes, we solve
the following binary classification
problem:
  \begin{eqnarray*}
\min_{w^{ij}, b^{ij}, \xi^{ij}} && 
\frac{1}{2}(w^{ij})^T w^{ij} + 
C(\sum_{t} (\xi^{ij})_t) 
\nonumber \\
&& ((w^{ij})^T \phi(x_t)) + b^{ij}) 
\geq 1 - \xi^{ij}_t,  
\mbox{ if $x_t$ in the $i$th class,} \nonumber \\
&& ((w^{ij})^T \phi(x_t)) + b^{ij}) 
\leq -1 + \xi^{ij}_t,  
\mbox{ if $x_t$ in the $j$th class,} \nonumber \\
&& \xi^{ij}_t \geq 0.
\nonumber
  \end{eqnarray*}
In classification we use a voting
strategy:
each binary classification
is considered to be a voting where votes can be cast for all data
points $x$ - in the the end point is designated to be in a class
with maximum number of votes.

Another method for multi-class classification
is the ``one-against-all''
approach in which $k$
SVM models are constructed and
the $i$th SVM is trained with
all of the examples
in the $i$th class with
positive labels,
and all other examples with negative labels.
We did not consider it
as some research work (e.g. \shortcite{JW98a,JP00a})
have shown that it 
does not perform as good as ``one-against-one''

In addition, though we have to train 
as many as $k(k-1)/2$
classifiers,
as each problem is smaller
(only data from two classes), the total
training time may not be more than
the ``one-against-all'' method.
This is one of the reasons why we choose
the ``one-against-one'' method.


\section*{Acknowledgments}
This work was supported in part by
the National Science Council of Taiwan via the grants
NSC 89-2213-E-002-013
and NSC 89-2213-E-002-106.
The authors thank Chih-Wei Hsu and
Jen-Hao Lee
for many helpful discussions and comments.
We also thank Ryszard Czerminski for some
useful comments.

%---------------------bibliography

\bibliographystyle{../../apacite}
% \bibliographystyle{/usr/faculty/professor/cjlin/latex/chicago}
\bibliography{../../sdp}
%\bibliography {bibtex,bibtex2,cp}

\end{document}
