% NumDL-CourseNotes: 07-NumDNN-IntroNonlinearModel.tex
\documentclass[12pt,fleqn,handout]{beamer}

\input{beamerStyle.tex}
\input{abbrv.tex}


\title{Introduction to Nonlinear Models}
\subtitle{Numerical Methods for Deep Learning}
\date{}

\begin{document}

\makebeamertitle

\input{slides/introNonlinearModels.tex}

\begin{frame}\frametitle{Example: Linear Fitting}


Assume $\bfC\in \R^{n_c\times n}$, $\bfY \in \R^{n_f \times n}$ and $n \gg n_f$.
Goal: Find $\bfW \in \R^{n_c \times n_f}$ such that

$$ \bfC = \bfW \bfY $$

\bigskip
\pause

If ${\rm rank}(\bfY)<n$, it may not be possible to fit the data.

\bigskip
\pause

Two options:
\begin{enumerate}
	\item Regression: Solve $\min_\bfW \| \bfW \bfY - \bfC \|_F^2$ $\leadsto$ always has solutions, but residual might be large
	\item Nonlinear Model: Replace $\bfY$ by $\sigma(\bfK\bfY)$, where $\sigma$ is an element-wise function (aka activation) and $\bfK \in \R^{m \times n_f}$ where $m \gg n_f$
\end{enumerate}

\end{frame}


\begin{frame}\frametitle{Illustrating Nonlinear Models}

\begin{center}
	\begin{tabular}{cc}
		\rotatebox{90}{original} & \includegraphics[width=.9\textwidth]{elmSmall}\\
		 \invisible<beamer|1>{\rotatebox{90}{transformed}} &
		\invisible<beamer|1>{\includegraphics[width=.9\textwidth]{elmBig}}\\
	\end{tabular}
\end{center}

\bigskip

\invisible<beamer|1>{
Remarks
\begin{itemize}
	\item instead of $\bfW \bfY = \bfC$ solve $\hat{\bfW} \sigma(\bfK \bfY)  = \bfC$
	\item solve bigger problem $\leadsto$ memory, computation, \ldots
	\item what happens to ${\rm rank}(\sigma(\bfK\bfY))$ when $\sigma(x)=x$?
\end{itemize}}

\only<beamer|2>{}
\end{frame}


\begin{frame}[fragile]\frametitle{Universal Approximation Theorem}

Given the data $\bfY \in \R^{n_f \times n}$ and $\bfC \in \R^{n_c \times n}$
with $n\gg n_f$,
there is a nonlinear function $\sigma:\R \to \R$, a matrix $\bfK \in \R^{m \times n_f}$, and a bias $b \in \R$ such that

$$
 {\rm rank}(\sigma(\bfK \bfY + b)) = n.
$$

\bigskip
\pause
Therefore, it is possible~\cite{Cybenko1989,HornikEtAl1989} to find ${\bfW}\in\R^{n_c\times m}$ such that

$$\bfW \sigma( \bfK \bfY +b) = \bfC $$

\end{frame}


\begin{frame}[fragile]\frametitle{Choosing Nonlinear Model}

$$ \bfW  \sigma(\bfK \bfY+b)= \bfC $$
\begin{itemize}
\item how to choose $\sigma$?
\pause
\begin{itemize}
	\item early days: motivated by neurons
	\item popular choice: $\sigma(x) = \tanh(x)$ (smooth, bounded, \ldots)
	\item nowadays: $\sigma(x) = \max(x,0)$ (aka ReLU, rectified linear unit, non-differentiable, not bounded, simple)
\end{itemize}
\pause
\item how to choose $\bfK$ and $b$?
\pause
\begin{itemize}
	\item pick randomly $\leadsto$ branded as \emph{extreme learning machines}~\cite{HuangEtAl2006}
	\item train (optimize) $\leadsto$ deep learning (when we have multiple layers)
\end{itemize}
\end{itemize}


\end{frame}

\begin{frame}[fragile]\frametitle{First Experiment: Random Transformation}

Select an activation function, choose $\bfK$ and $b$ randomly, and solve the least-squares/classification problem

\bigskip

The Pros:
\begin{itemize}
\item universal approximation theorem: can interpolate any function
\item very easy to program
\item can serve as a benchmark for more sophisticated methods
\end{itemize}

\bigskip

Some concerns:
\begin{itemize}
\item may require very large $\bfK$ (size of the data)
\item may not generalize well
\item large dense linear algebra
\end{itemize}

\end{frame}


\begin{frame}[allowframebreaks]
	\frametitle{References}
\bibliographystyle{abbrv}
\bibliography{NumDNN}
\end{frame}

\end{document}