## Saturday, November 8, 2014

### Machine Learning: Closed form of Linear Regression

A follow-up to the previous Machine Learning post: Machine Learning: Linear Regression

(Credit: Thanks to Stanford University for their online course and Prof. Andrew Ng for great materials and discussions. All credit goes to them. I am merely posting my thoughts for future references)

In linear regression, we want to find $$\theta$$ such that the least squares cost function $$J(\theta) = \frac{1}{2} \sum_i (h_{\theta}(x_i) - y_i)^2$$ is minimized. One of the ways to go is by implementing the gradient descent algorithm:
for each j:
update $$\theta_j := \theta_j - \alpha \frac{\partial}{\partial \theta_j} J(\theta)$$ (with all $$\theta_j$$ updated simultaneously)

#include <iostream>
#include <cstdio>
#include <algorithm>
using namespace std;

#define NMAX 20
#define MMAX 50000

double EPS = 1e-12;

/*
Linear Regression
Given set of features X, and some values Y,
find the hypothesis that minimizes J(theta)
where h_theta(X) = theta_0 x_0 + theta_1 x_1 + ... (linear)

*/
int N; // dimension of x and y
int M; // number of data
double theta[NMAX], temp[NMAX];
double data[MMAX][NMAX], Y[MMAX];
double ALPHA = 0.07;
int ITER = 1500;

int main(){
int dump;
scanf("%d %d", &N, &M);
for(int i=0;i<M;++i){
data[i][0] = 1;
//scanf("%d", &dump);
for(int j=1;j<=N;++j){
scanf("%lf", &data[i][j]);
}
}
for(int i=0;i<M;++i){
scanf("%lf", &Y[i]);
}
// initialize theta
for(int i=0;i<=N;++i){
theta[i] = 0;
}
for (int iteration = 0; iteration <= ITER; ++iteration) {
for(int j=0;j<=N;++j){
double sum = 0;
for(int i=0;i<M;++i){
double tmp = 0;
for(int k=0;k<=N;++k){
tmp += theta[k] * data[i][k];
}
tmp -= Y[i];
tmp *= data[i][j];
sum += tmp;
}
sum *= ALPHA/(double)M;
temp[j] = theta[j] - sum;
}
for(int j=0;j<=N;++j){
theta[j] = temp[j];
}
}
for(int j=0;j<=N;++j){
printf("%lf\n", theta[j]);
}
return 0;
}


Since gradient descent is an iterative algorithm, the number of iteration and the learning rate $$\alpha$$ are both important parameters which will determine the stability and the rate of convergence of the algorithm.

However, for linear regression, we are entitled to one extra approach which is radically different from that of the iterative approach, by resorting to linear algebra and deriving the closed form of $$\theta$$. Given X the $$m \times (n+1)$$ matrix of our data on our features, and Y the $$m \times 1$$ matrix of actual results, $$\theta$$ that minimizes the least square regression is the solution to the following matrix equation:
$$X^T X \theta = X^T Y$$
There are several proofs of this result: one uses the notion of projections and orthogonality, and another uses the matrix of partial derivatives and manipulation of traces of matrices to obtain the result. Both approaches are manageable and understandable.

From here, we can apply LUP decomposition to solve for $$\theta$$ in $$O(N^3)$$ running time, which I think can be way faster than the iterative approach for small values of $$N$$. However, for big $$N$$, as in when the data from the training set is huge, it might be better off using the iterative approach, especially since having big $$O(N^2)$$ matrices can be very memory consuming.

/*
Closed Form of Linear Regression
Helper Classes: Matrix and LUP Decomposition
*/
#include <iostream>
#include <cstdio>
#include <algorithm>
#include <vector>
#include <cassert>
#include <cmath>
using namespace std;

double EPS = 1e-12;  // magnitude below which a pivot is treated as zero

/*
 * Dense matrix of doubles, with just the operations needed by the
 * normal-equation solver below.
 */
struct Matrix {
    /* STATE */

    std::vector<std::vector<double> > entry;  // row-major entries
    int row;                                  // number of rows
    int col;                                  // number of columns

    /* CONSTRUCTORS */

    // Empty 0x0 matrix. (The original left row/col uninitialized,
    // which is undefined behavior if read before replace_with.)
    Matrix() : row(0), col(0) {}

    // r x c matrix filled with zeros.
    Matrix(int r, int c) {
        row = r;
        col = c;
        entry = std::vector<std::vector<double> >(row, std::vector<double>(col, 0));
    }

    /* STATE MODIFIER METHODS */

    // entry[i][j] = val (no bounds checking).
    void set(int i, int j, double val) {
        entry[i][j] = val;
    }

    // Become a copy of `another`.
    void replace_with(const Matrix& another) {
        entry = another.entry;
        row = another.row;
        col = another.col;
    }

    // *this = *this * another; requires col == another.row.
    void multiply_with(const Matrix& another) {
        assert(col == another.row);
        Matrix temp(row, another.col);
        for (int i = 0; i < row; ++i) {
            for (int j = 0; j < another.col; ++j) {
                temp.entry[i][j] = 0;
                for (int k = 0; k < col; ++k) {
                    temp.entry[i][j] += entry[i][k] * another.entry[k][j];
                }
            }
        }
        // replace_with already copies row/col; the original's extra
        // `col = another.col;` afterwards was redundant.
        replace_with(temp);
    }

    /* STATE NON-MODIFIER METHODS */

    double get(int i, int j) const {
        return entry[i][j];
    }

    // Print all entries, one row per line, to stdout.
    void print() const {
        for (int i = 0; i < row; ++i) {
            for (int j = 0; j < col; ++j) {
                printf("%lf ", entry[i][j]);
            }
            printf("\n");
        }
    }

    // dest = transpose(*this). Builds into a temporary first, so it is
    // safe even if dest aliases *this (the original read from entry
    // after overwriting dest.entry, breaking the aliased case).
    void get_transpose(Matrix& dest) const {
        Matrix t(col, row);
        for (int i = 0; i < row; ++i) {
            for (int j = 0; j < col; ++j) {
                t.entry[j][i] = entry[i][j];
            }
        }
        dest.replace_with(t);
    }
};

/*
 * Solves the square linear system X theta = Y in O(n^3) via LUP
 * decomposition (Gaussian elimination with row pivoting).
 */
class LUP_Algorithm {
private:
    Matrix X;           // holds L and U in-place after decomposition
    Matrix Y;           // right-hand side; permuted, then overwritten
    std::vector<int> P; // P[k] = original index of the row now at position k

    bool equals_to_zero(double val) {
        return (fabs(val) < EPS);
    }

    void X_swap_row(int i, int j) {
        std::swap(X.entry[i], X.entry[j]);
    }

    // In-place factorization: afterwards the strict lower triangle of X
    // holds L (unit diagonal implied) and the upper triangle holds U,
    // with the row permutation recorded in P and applied to Y.
    void LUP_Decomposition() {
        for (int i = 0; i < X.row; ++i) {
            P.push_back(i);
        }
        for (int k = 0; k < X.col; ++k) {
            // First usable (non-zero) pivot in column k.
            // NOTE: picking the first rather than the largest-magnitude
            // pivot is kept from the original; it is numerically weaker
            // but behaviorally identical for non-singular input.
            int index = -1;
            for (int i = k; i < X.row; ++i) {
                if (!equals_to_zero(X.get(i, k))) {
                    index = i;
                    break;
                }
            }
            assert(index != -1 && "matrix is singular");
            std::swap(P[k], P[index]);
            X_swap_row(k, index);
            // Eliminate column k below the pivot, storing multipliers in L.
            for (int i = k + 1; i < X.row; ++i) {
                X.entry[i][k] /= X.entry[k][k];
                for (int j = k + 1; j < X.col; ++j) {
                    X.entry[i][j] -= X.entry[i][k] * X.entry[k][j];
                }
            }
        }
        // Apply the permutation to Y: row i of the permuted system is
        // original row P[i], so (PY)[i] = Y[P[i]].
        // BUG FIX: the original wrote pY[P[i]] = Y[i], which applies the
        // INVERSE permutation and yields wrong solutions whenever the
        // pivot swaps do not form an involution (e.g. a 3-cycle).
        Matrix pY(Y.row, 1);
        for (int i = 0; i < Y.row; ++i) {
            pY.entry[i][0] = Y.entry[P[i]][0];
        }
        Y.replace_with(pY);
    }

public:

    // Takes copies of the square coefficient matrix x and the column
    // vector y; asserts the dimensions are consistent.
    LUP_Algorithm(Matrix x, Matrix y) {
        assert(x.row == y.row);
        assert(x.row == x.col);
        assert(y.col == 1);
        X.replace_with(x);
        Y.replace_with(y);
    }

    // Returns theta solving X theta = Y. Consumes internal state; call once.
    Matrix solve() {
        LUP_Decomposition();
        // Forward substitution: L z = P Y (L has unit diagonal).
        Matrix theta(X.row, 1);
        for (int i = 0; i < X.row; ++i) {
            theta.entry[i][0] = Y.entry[i][0];
            for (int j = i - 1; j >= 0; --j) {
                theta.entry[i][0] -= theta.entry[j][0] * X.entry[i][j];
            }
        }
        Y.replace_with(theta);
        // Back substitution: U theta = z.
        for (int i = X.row - 1; i >= 0; --i) {
            theta.entry[i][0] = Y.entry[i][0];
            for (int j = i + 1; j < X.row; ++j) {
                theta.entry[i][0] -= theta.entry[j][0] * X.entry[i][j];
            }
            theta.entry[i][0] /= X.entry[i][i];
        }
        return theta;
    }

};

/*
Given X and Y,
solution to
transpose(X) X theta = transpose(X) Y
minimizes the least squares regression.
*/

int main(){
int N, M;
scanf("%d %d", &N, &M);
Matrix X(M, N+1);
Matrix Y(M, 1);
double val;
for (int i = 0; i < M; ++i) {
X.set(i, 0, 1);
for (int j = 1; j <= N; ++j){
scanf("%lf", &val);
X.set(i, j, val);
}
}
for (int j = 0; j < M; ++j) {
scanf("%lf", &val);
Y.set(j, 0, val);
}
Matrix XT;
Matrix YT;
X.get_transpose(XT);
YT.replace_with(XT);
XT.multiply_with(X);
YT.multiply_with(Y);
LUP_Algorithm LUP(XT, YT);
Matrix res = LUP.solve();
res.print();
return 0;
}