// Released under the GNU General Public License (see license.txt for details).
//
// Copyright (c) 2011 Chuan-Sheng Foo.
// All Rights Reserved.
//
#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include <cassert>
#include <climits>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <ctime>

/* Random number generator seed (fixed, so the training routines that use it
   draw a reproducible sequence of examples) */
#define SEED 947 

using namespace std;

// One non-zero entry of a sparse vector: (zero-based index, value).
typedef pair<int,double> VecPair;

// Training algorithms selectable from the command line. UNKNOWN is the value
// main() starts from before a method name has been recognised.
enum solver_type {UNKNOWN, OPTIMISTIC_ADAPTIVE, OPTIMISTIC_PROXIMAL, ADAPTIVE, PEGASOS, PROXIMAL};

// Sparse feature vector kept as a list of (zero-based index, value) pairs.
// Adapted from Pegasos.
class SparseVector {
public:
    // Builds a vector with no entries.
    SparseVector() {}

    // Builds a vector from n "index value" pairs read from one input line.
    SparseVector(istringstream& is, int n);

    // Multiplies every stored value by s.
    void scale(double s);

    // Squared Euclidean norm of the stored values.
    double normsq() const;

    // Inner product with the dense vector y (y is indexed by the stored indices).
    double dot_product(const vector<double>& y) const;

    // Index reported as this vector's maximum; 0 for a vector with no entries.
    int max_index() const;

    // Writes the entries as "(index,value) " followed by a newline.
    void print(ostream& os = cerr) const;

private:
    // WeightVector walks elems directly for its sparse updates.
    friend class WeightVector;

    vector<VecPair> elems;
};

// Builds a sparse vector from one tokenised input line.
//
// is: stream positioned just after the class label; it is expected to hold
//     whitespace-separated "index value" pairs (read_file turns every ':' into
//     a space before calling this).
// n:  upper bound on the number of pairs to read.
//
// Indices in the input are 1-based and are stored 0-based. Parsing stops at the
// first pair that cannot be read, and entries whose index is not positive are
// dropped: both would otherwise produce a negative or garbage slot that the
// dense-vector code indexes without bounds checking.
SparseVector::SparseVector(istringstream& is, int n) {
    if (n > 0)
        elems.reserve(n);

    for (int i = 0; i < n; i++) {
        int index = 0;
        double feature = 0;
        if (!(is >> index >> feature)) {
            cerr << "Warning: malformed feature entry, ignoring the rest of the line" << endl;
            break;
        }
        if (index < 1) {
            cerr << "Warning: ignoring feature with non-positive index " << index << endl;
            continue;
        }
        elems.push_back(make_pair(index - 1, feature));
    }
}

// Multiplies every stored value by s; the indices are left untouched.
void SparseVector::scale(double s) {
    const size_t count = elems.size();
    for (size_t k = 0; k < count; ++k) {
        elems[k].second *= s;
    }
}

// Returns the sum of squares of the stored values (0 for an empty vector).
double SparseVector::normsq() const {
    double total = 0;
    for (size_t k = 0; k < elems.size(); ++k) {
        const double value = elems[k].second;
        total += value * value;
    }
    return total;
}

double SparseVector::dot_product(const vector<double>& y) const {
    double ans = 0;
    for (vector<VecPair>::const_iterator it = elems.begin(); it != elems.end(); ++it) {
        ans += it->second * y[it->first];
    }
    return ans;
}

// Returns the largest (zero-based) index stored in this vector, or 0 when the
// vector has no entries.
//
// All entries are scanned rather than trusting the last one, so the result is
// correct even when the input line did not list its features in ascending
// index order. The result is used to size the dense weight vector, so an
// underestimate would lead to out-of-range accesses.
int SparseVector::max_index() const {
    int largest = 0;
    for (size_t k = 0; k < elems.size(); ++k) {
        if (elems[k].first > largest)
            largest = elems[k].first;
    }
    return largest;
}

void SparseVector::print(ostream& os) const {
    for (vector<VecPair>::const_iterator it = elems.begin(); it != elems.end(); ++it) {
        os << "(" << it->first << "," << it->second << ") ";
    }
    os << endl;
}

/*
   Fast implementation of SVM weight vector to allow updates in time
   proportional to number of non zero components in training example.
   Adapted from Pegasos, with the addition of rescaling to avoid
   underflow.

   The represented vector is w = a * v: scaling w only touches the scalar a,
   and adding a sparse vector only touches the components that vector names.
   normsq caches ||w||^2 and is updated incrementally by scale()/add_scaled().
*/
class WeightVector {
public:
    // Creates the zero vector with dim components.
    WeightVector(int dim) : a(1), v(dim, 0.0), normsq(0) {}

    // Euclidean norm of w. The cached squared norm is maintained incrementally
    // and rounding can leave it a hair below zero when w is (almost) cancelled
    // out; that case is reported as 0 instead of sqrt(negative) = NaN.
    double norm() const {
        return normsq < 0 ? 0.0 : sqrt(normsq);
    }

    // w <- alpha * w
    void scale(const double alpha);

    // w <- w + alpha * u
    void add_scaled(const double alpha, const SparseVector& u);

    // <w, u>
    double dot_product(const SparseVector& u) const;

    // Component idx of w (no bounds check).
    double operator[](int idx) const {
        return a * v[idx];
    }

    // Number of components.
    int size() const {
        return static_cast<int>(v.size());
    }

private:
    double a;           // common scale factor
    vector<double> v;   // unscaled components
    double normsq;      // cached ||a*v||^2

    // Folds a into v and resets a to 1.
    void normalize();
};

// Folds the scale factor into the stored components and resets it to 1;
// the represented vector a*v is unchanged.
void WeightVector::normalize() {
    for (vector<double>::iterator it = v.begin(); it != v.end(); ++it) {
        *it *= a;
    }
    a = 1;
}

/* Rescaling tolerance level: once the scale factor falls below this value it
   is folded back into the components (see WeightVector::scale) */
const double SCALE_TOL = 1e-200;

// w <- alpha * w, done by updating the scalar factor and the cached squared
// norm only.
//
// If the factor becomes exactly zero the vector is reset to an explicit zero
// vector with factor 1. Otherwise, if the factor is below SCALE_TOL (the
// comparison is signed, so any negative factor qualifies as well), it is
// multiplied into the components and reset to 1.
void WeightVector::scale(const double alpha) {
    a *= alpha;
    normsq *= alpha*alpha;

    if (a == 0) {
        // Represent the zero vector explicitly so later divisions by a are safe.
        fill(v.begin(), v.end(), 0);
        a = 1;
        return;
    }

    if (a < SCALE_TOL) {
        cout << "rescaling weight vector..." << endl;
        normalize();
    }
}

/*
  normsq is updated using the identity:
    ||w + alpha*u||^2
    = <w + alpha*u, w + alpha*u>
    = ||w||^2 + alpha^2*||u||^2 + 2*alpha*<w, u>
 */
void WeightVector::add_scaled(const double alpha, const SparseVector& u) {
    double wTu = 0;
    double unormsq = 0;

    if (a != 0) {
        for (vector<VecPair>::const_iterator it = u.elems.begin(); it != u.elems.end(); ++it) {
            wTu += it->second * v[it->first];
            v[it->first] += alpha*it->second / a;
            unormsq += it->second * it->second;
        }
        wTu *= a;
    } else {
        a = 1;
        fill(v.begin(), v.end(), 0);
        for (vector<VecPair>::const_iterator it = u.elems.begin(); it != u.elems.end(); ++it) {
            v[it->first] = alpha*it->second;
            unormsq += it->second * it->second;
        }
    }

    normsq += alpha*alpha*unormsq + 2*alpha*wTu;
}

// <w, u>: the sparse vector is dotted with the unscaled components and the
// common scale factor is applied once at the end.
double WeightVector::dot_product(const SparseVector& u) const {
    const double unscaled = u.dot_product(v);
    return a * unscaled;
}

// Reads a training file in "label index:value index:value ..." format.
// Adapted from Pegasos' file input.
//
// filename: path of the file to read; the program exits with -1 if it cannot
//           be opened.
// data:     receives one SparseVector per example (cleared first).
// labels:   receives the matching class labels, each +1 or -1 (cleared first);
//           any other label terminates the program with -1.
// dim:      receives the largest zero-based feature index seen, plus one.
//
// Everything from a '#' to the end of a line is a comment. Lines that are
// empty or contain only whitespace once the comment is removed are skipped,
// so blank lines, indented comment lines and a trailing newline at the end of
// the file are not mistaken for examples with an invalid label.
void read_file(string& filename, vector<SparseVector>& data, vector<int>& labels,
               int& dim) {
    dim = 0;
    data.clear();
    labels.clear();

    ifstream in_file(filename.c_str());
    if (!in_file.good()) {
        cerr << "Error opening " << filename << endl;
        exit(-1);
    }

    int num_examples = 0;
    string buf;

    while (getline(in_file, buf)) {
        // Drop the comment part; a line starting with '#' becomes empty.
        const size_t comment_pos = buf.find('#');
        if (comment_pos != string::npos) buf.erase(comment_pos);

        // Nothing but whitespace (including a stray '\r') left: not an example.
        if (buf.find_first_not_of(" \t\r\n\v\f") == string::npos) continue;

        // Turn "index:value" into "index value" and count the pairs.
        int num_nonzero = 0;
        for (size_t i = 0; i < buf.size(); i++) {
            if (buf[i] == ':') {
                buf[i] = ' ';
                num_nonzero++;
            }
        }

        istringstream iss(buf);
        int label = 0;
        iss >> label;
        if (label != 1 && label != -1) {
            cerr << "Invalid Class Label: Class label must be +1 or -1" << endl;
            exit(-1);
        }
        labels.push_back(label);

        SparseVector example(iss, num_nonzero);
        dim = max(dim, example.max_index());
        data.push_back(example);
        num_examples++;
    }

    // max_index() is zero-based, so the dimension is one more than the largest index.
    dim++;

    cerr << num_examples << " examples read, dim = " << dim << endl;

    in_file.close();
}

// Compute bound on subgradient norm
// G = max_i ||x^{(i)}|| + sqrt(lambda)
double compute_G(const vector<SparseVector>& data, double lambda) {
    double maxnormsq = -1;

    for (int i = 0; i < data.size(); i++) {
        double normsq = data[i].normsq();
        maxnormsq = max(maxnormsq, normsq);
    }
    return sqrt(maxnormsq) + sqrt(lambda);
}

// Number of inner iterations for one outer round of the optimistic solver.
// To Be Implemented: none of the arguments is consulted yet and the result is
// always INT_MAX, since delta = 0 implies T = infinity.
int compute_T(double G, double deltahat, double lambda, double R, double epsilon) {
    (void)G;
    (void)deltahat;
    (void)lambda;
    (void)R;
    (void)epsilon;

    const int unbounded = INT_MAX;
    return unbounded;
}

// Regularized SVM objective:
//   0.5*lambda*||w||^2 + (1/m) * sum_i max(0, 1 - y_i * <w, x_i>)
// where m is the number of examples in data.
double compute_svm_loss(const WeightVector& w, const vector<SparseVector>& data, const vector<int>& labels, double lambda) {
    const int m = data.size();
    const double reg = 0.5*lambda*w.norm()*w.norm();

    double loss = 0;
    for (int i = 0; i < m; i++) {
        const double hinge = 1 - labels[i]*w.dot_product(data[i]);
        // Each term is divided by m as it is added, as in the averaged sum above.
        loss += (0.0 < hinge ? hinge : 0.0) / m;
    }

    return reg + loss;
}

// Writes the trained model as text: lambda on the first line, the number of
// weights on the second, then one weight per line.
//
// output_filename: path of the model file (created or truncated).
// w:               weight vector to save.
// lambda:          regularization weight the model was trained with.
//
// If the file cannot be opened or written, an error is printed and the program
// exits with -1 (the same convention read_file uses for its input file)
// instead of silently producing no model.
void write_model(const string& output_filename, const WeightVector& w, double lambda) {
    ofstream outf(output_filename.c_str());
    if (!outf) {
        cerr << "Error opening " << output_filename << endl;
        exit(-1);
    }

    const int dim = w.size();
    outf << lambda << '\n';
    outf << dim << '\n';
    for (int i = 0; i < dim; i++) {
        // '\n' instead of endl: same bytes, without a flush per weight.
        outf << w[i] << '\n';
    }

    outf.close();
    if (!outf) {
        cerr << "Error writing " << output_filename << endl;
        exit(-1);
    }
}

// Debug helper: prints every non-zero component of w as "w[i]=value", one per
// line, followed by the sum of squares of those components.
// T needs size() and operator[](int).
template <class T>
void print_vector(const T& w) {
    double sum_of_squares = 0;
    int idx = 0;
    while (idx < w.size()) {
        if (w[idx] == 0) {
            ++idx;
            continue;
        }
        cout << "w[" << idx << "]=" << w[idx] << endl;
        sum_of_squares += w[idx]*w[idx];
        ++idx;
    }
    cout << "||w||^2 = " << sum_of_squares << endl;
}

void train_generic_fast(const string& output_filename, const vector<SparseVector>& data, const vector<int>& labels, int dim, double lambda, int steps, enum solver_type solver) {
    const double lambdainv = 1.0 / lambda;
	
	WeightVector w(dim);
	
    srand(SEED);


	if (solver == PEGASOS) {
		for (int t = 1; t <= steps; t++) {

			double tinv = 1.0 / t;
			//double eta = 1.0 / (lambda*t);
			double eta = tinv * lambdainv;

			/* sample the training example */
			int r = random() % data.size(); 
			
			/* Regularization gradient */
			//w.scale(1 - eta*lambda);
			w.scale(1 - tinv);

			/* Hinge-loss subgradient */
			if (labels[r] * w.dot_product(data[r]) < 1) {
				w.add_scaled(eta*labels[r], data[r]);
			}

			/* Projection */
			if (w.norm() > 1.0/sqrt(lambda)) {
				w.scale(1.0/(w.norm() * sqrt(lambda)));
			}
		}	
			
	} else {

		const double G = compute_G(data, lambda);
		const double R = 1.0/sqrt(lambda);
		
		const double zero = 0;
		double tau = 0;
		
		double GR_mult = 0;
		const double *reg_add;

		switch (solver) {
		case ADAPTIVE:
			GR_mult = (double)8/3;
			reg_add = &tau;
			break;
		case PROXIMAL:
			GR_mult = 1;
			reg_add = &zero;
			break;
		default:
			cerr << "Unsupported solver type in train_generic_fast." << endl;
			return;
		}
		
		double tausum = 0;

		for (int t = 1; t <= steps; t++) {

			int r = random() % data.size(); // sample the training example

            tau = 0.5 * (-lambda*t - tausum + sqrt((lambda*t + tausum)*(lambda*t + tausum) + GR_mult*(G*G)/(R*R)));
            tausum += tau;
            double eta = 1.0 / (lambda*t + tausum);

			/* Regularization gradient */
			w.scale(1 - eta*(lambda + (*reg_add)));

			/* Hinge-loss subgradient */
			if (labels[r] * w.dot_product(data[r]) < 1) {
				w.add_scaled(eta*labels[r], data[r]);
			}

			/* Projection */
			if (w.norm() > 1.0/sqrt(lambda)) {
				w.scale(1.0/(w.norm() * sqrt(lambda)));
			}
		}
	}
	cout << "After " << steps << " iterations (" << (double)steps/ data.size() << " effective iterations):" << endl;
	cout << "  Regularized loss=" << compute_svm_loss(w, data, labels, lambda) << endl;
	cout << "  Norm of weight vector=" << w.norm() << endl;
	write_model(output_filename, w, lambda);
}

void train_optimistic_generic_fast(const string& output_filename, const vector<SparseVector>& data, const vector<int>& labels, int dim, double lambda, double epsilon, double delta, int steps, enum solver_type solver) {

    const double zero = 0;
    double tau = 0;

    double GR_mult = 0;
    const double *reg_add;

    switch (solver) {
    case OPTIMISTIC_ADAPTIVE:
        GR_mult = (double)8/3;
        reg_add = &tau;
        break;
    case OPTIMISTIC_PROXIMAL:
        GR_mult = 1;
        reg_add = &zero;
        break;
    default:
        cerr << "Unsupported solver type for optimistic strategy." << endl;
        return;
    }

    const double G = compute_G(data, lambda);
    const double deltahat = delta / (3 - log(lambda)/log((double)2));
    double R = 1;

    WeightVector w(dim);

    srand(SEED);

    cout << "G = " << G << endl;
    cout << "deltahat = " << deltahat << endl;

    int numprocessed = 0;

    bool converged = true;

    for (int outerits = 0; outerits < INT_MAX; outerits++) {
        double tausum = 0;

        cout << "[ Starting outer iteration " << outerits << "]" << endl;
        int T = compute_T(G, deltahat, lambda, R, epsilon);
        for (int t = 1; t <= T; t++) {
            numprocessed++;

			/* Sample training example */
            int r = random() % data.size(); 

            /* Compute SVM subgradient and take gradient step*/
            tau = 0.5 * (-lambda*t - tausum + sqrt((lambda*t + tausum)*(lambda*t + tausum) + GR_mult*(G*G)/(R*R)));
            tausum += tau;
            double eta = 1.0 / (lambda*t + tausum);

            /* Regularization gradient */
            w.scale(1 - eta*(lambda + (*reg_add)));

            /* Hinge-loss subgradient */
            if (labels[r] * w.dot_product(data[r]) < 1) {
                w.add_scaled(eta*labels[r], data[r]);
            }

            /* Projection */
            if (w.norm() > 1/sqrt(lambda)) {
                w.scale(1/(w.norm() * sqrt(lambda)));
            }
			
			if (numprocessed >= steps) goto done;

            if (w.norm() >= R - sqrt(2*epsilon / lambda)) {
                //cout << "np = "<< numprocessed << "  R=" << R << "  eta=" << eta << "  tausum=" << tausum << endl;
                R *= sqrt(2.0);
                converged = false;
                break;
            }

        }
	}

done:
    ;
	cout << "After " << steps << " iterations (" << (double)numprocessed / data.size() << " effective iterations):" << endl;
	cout << "  Regularized loss=" << compute_svm_loss(w, data, labels, lambda) << endl;
	cout << "  Norm of weight vector=" << w.norm() << endl;
	write_model(output_filename, w, lambda);
}

// Prints the command-line help text to standard error, one line at a time.
void print_usage() {
    static const char* const lines[] = {
        "Optimistic Online Proximal SVM-solver",
        "Written by Chuan-Sheng Foo",
        "-------------------------------------",
        "Usage:",
        "  oops <method> <input_file> <model_file> <l2-weight> <iters>",
        "",   // blank line after the synopsis
        "where:",
        "  method is one of [ pegasos | proximal | adaptive | opt-proximal | opt-adaptive ]",
        "  l2-weight is the regularization parameter lambda",
        "  iters is the number of training examples to process before stopping"
    };
    const size_t count = sizeof(lines) / sizeof(lines[0]);

    for (size_t k = 0; k < count; ++k) {
        cerr << lines[k] << endl;
    }
}

int main(int argc, char** argv) {
    if (argc < 6) {
        print_usage();
        return -1;
    }

    solver_type solver = UNKNOWN;

    if (strcmp(argv[1], "proximal") == 0) {
        solver = PROXIMAL;
    } else if (strcmp(argv[1], "adaptive") == 0) {
        solver = ADAPTIVE;
    } else if (strcmp(argv[1], "pegasos") == 0) {
        solver = PEGASOS;
    } else if (strcmp(argv[1], "opt-adaptive") == 0) {
        solver = OPTIMISTIC_ADAPTIVE;
    } else if (strcmp(argv[1], "opt-proximal") == 0) {
        solver = OPTIMISTIC_PROXIMAL;
    } else {
        cerr << "Unknown solver type requested: " << argv[1] << endl;
        cerr << "Valid options are [ pegasos | proximal | adaptive | opt-proximal | opt-adaptive ]" << endl;
        return -1;
    }

    // parse params and print
    string input_filename = argv[2];
    string output_filename = argv[3];
    double lambda = atof(argv[4]);
    int iters = atoi(argv[5]);
    const double epsilon = 0;
    const double delta = 1;

    cout << "oops called with parameters: " << endl;
    cout << "  solver:      " << argv[1] << endl;
    cout << "  input file:  " << input_filename << endl;
    cout << "  model file: " << output_filename << endl;
    cout << "  lambda:      " << lambda << endl;
    cout << "  iters:      " << iters << endl;
	
    vector<SparseVector> data;
	vector<int> labels;
	int dim;
    
    cout << "Reading input...";
    read_file(input_filename, data, labels, dim);
    
    switch (solver) {
    case OPTIMISTIC_ADAPTIVE:
    case OPTIMISTIC_PROXIMAL:
        train_optimistic_generic_fast(output_filename, data, labels, dim, lambda, epsilon, delta, iters, solver);
        break;
    case ADAPTIVE:
    case PEGASOS:
	case PROXIMAL:
        train_generic_fast(output_filename, data, labels, dim, lambda, iters, solver);
        break;
	default:
		cout << "Unknown solver type! (probably a bug in the code)" << endl;
		break;
    }
    return 0;
}
