/*################################################################################
  ##
  ##   Copyright (C) 2016-2020 Keith O'Hara
  ##   Modified work Copyright (c) 2021-2022 Fabio Sigrist. All rights reserved.
  ##
  ##   This file is part of the OptimLib C++ library.
  ##
  ##   Licensed under the Apache License, Version 2.0 (the "License");
  ##   you may not use this file except in compliance with the License.
  ##   You may obtain a copy of the License at
  ##
  ##       http://www.apache.org/licenses/LICENSE-2.0
  ##
  ##   Unless required by applicable law or agreed to in writing, software
  ##   distributed under the License is distributed on an "AS IS" BASIS,
  ##   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ##   See the License for the specific language governing permissions and
  ##   limitations under the License.
  ##
  ################################################################################*/

/*
 * BFGS method for quasi-Newton-based non-linear optimization
 */

#ifndef _optim_bfgs_HPP
#define _optim_bfgs_HPP

/**
 * @brief The Broyden–Fletcher–Goldfarb–Shanno (BFGS) Quasi-Newton Optimization Algorithm
 *
 * @param init_out_vals a column vector of initial values, which will be replaced by the solution upon successful completion of the optimization algorithm.
 * @param opt_objfn the function to be minimized, taking three arguments:
 *   - \c vals_inp a vector of inputs;
 *   - \c grad_out a vector to store the gradient; and
 *   - \c opt_data additional data passed to the user-provided function.
 * @param opt_data additional data passed to the user-provided function.
 *
 * @return a boolean value indicating successful completion of the optimization algorithm.
 */

bool 
bfgs(Vec_t& init_out_vals, 
     std::function<double (const Vec_t& vals_inp, Vec_t* grad_out, void* opt_data)> opt_objfn, 
     void* opt_data);

/**
 * @brief The Broyden–Fletcher–Goldfarb–Shanno (BFGS) Quasi-Newton Optimization Algorithm
 *
 * @param init_out_vals a column vector of initial values, which will be replaced by the solution upon successful completion of the optimization algorithm.
 * @param opt_objfn the function to be minimized, taking three arguments:
 *   - \c vals_inp a vector of inputs;
 *   - \c grad_out a vector to store the gradient; and
 *   - \c opt_data additional data passed to the user-provided function.
 * @param opt_data additional data passed to the user-provided function.
 * @param settings parameters controlling the optimization routine.
 *
 * @return a boolean value indicating successful completion of the optimization algorithm.
 */

bool 
bfgs(Vec_t& init_out_vals, 
     std::function<double (const Vec_t& vals_inp, Vec_t* grad_out, void* opt_data)> opt_objfn, 
     void* opt_data, 
     algo_settings_t& settings);

//
// internal

namespace internal
{

bool 
bfgs_impl(Vec_t& init_out_vals, 
         std::function<double (const Vec_t& vals_inp, Vec_t* grad_out, void* opt_data)> opt_objfn, 
         void* opt_data, 
         algo_settings_t* settings_inp);

}

//

inline
bool
internal::bfgs_impl(
    Vec_t& init_out_vals, 
    std::function<double (const Vec_t& vals_inp, Vec_t* grad_out, void* opt_data)> opt_objfn, 
    void* opt_data, 
    algo_settings_t* settings_inp)
{
    // notation: 'p' stands for '+1'.

    bool success = false;

    const size_t n_vals = OPTIM_MATOPS_SIZE(init_out_vals);

    //
    // BFGS settings

    algo_settings_t settings;

    if (settings_inp) {
        settings = *settings_inp;
    }

    //ChangedForGPBoost
    //const int print_level = settings.print_level;
    
    const uint_t conv_failure_switch = settings.conv_failure_switch;

    const size_t iter_max = settings.iter_max;
    const double grad_err_tol = settings.grad_err_tol;
    const double rel_sol_change_tol = settings.rel_sol_change_tol;

    const double wolfe_cons_1 = settings.bfgs_settings.wolfe_cons_1; // line search tuning parameter
    const double wolfe_cons_2 = settings.bfgs_settings.wolfe_cons_2; // line search tuning parameter

    const bool vals_bound = settings.vals_bound;
    
    const Vec_t lower_bounds = settings.lower_bounds;
    const Vec_t upper_bounds = settings.upper_bounds;

    const VecInt_t bounds_type = determine_bounds_type(vals_bound, n_vals, lower_bounds, upper_bounds);

    // lambda function for box constraints

    std::function<double (const Vec_t& vals_inp, Vec_t* grad_out, void* box_data)> box_objfn \
    = [opt_objfn, vals_bound, bounds_type, lower_bounds, upper_bounds] (const Vec_t& vals_inp, Vec_t* grad_out, void* opt_data) \
    -> double
    {
        if (vals_bound) {
            Vec_t vals_inv_trans = inv_transform(vals_inp, bounds_type, lower_bounds, upper_bounds);
            double ret;
            
            if (grad_out) {
                Vec_t grad_obj = *grad_out;

                ret = opt_objfn(vals_inv_trans,&grad_obj,opt_data);

                // Mat_t jacob_matrix = jacobian_adjust(vals_inp,bounds_type,lower_bounds,upper_bounds);
                Vec_t jacob_vec = OPTIM_MATOPS_EXTRACT_DIAG( jacobian_adjust(vals_inp, bounds_type, lower_bounds, upper_bounds) );

                // *grad_out = jacob_matrix * grad_obj; // no need for transpose as jacob_matrix is diagonal
                *grad_out = OPTIM_MATOPS_HADAMARD_PROD(jacob_vec, grad_obj);
            } else {
                ret = opt_objfn(vals_inv_trans,nullptr,opt_data);
            }

            return ret;
        } else {
            return opt_objfn(vals_inp,grad_out,opt_data);
        }
    };

    // initialization

    Vec_t x = init_out_vals;

    if (! OPTIM_MATOPS_IS_FINITE(x) ) {
        //ChangedForGPBoost
        //printf("bfgs error: non-finite initial value(s).\n");
        Log::REWarning("bfgs error: non-finite initial value(s).\n");
        return false;
    }

    if (vals_bound) { // should we transform the parameters?
        x = transform(x, bounds_type, lower_bounds, upper_bounds);
    }

    const Mat_t I_mat = OPTIM_MATOPS_EYE(n_vals);

    Mat_t W = I_mat;                            // initial approx. to (inverse) Hessian 
    Vec_t grad(n_vals);                         // gradient vector
    Vec_t d = OPTIM_MATOPS_ZERO_VEC(n_vals);    // direction vector
    Vec_t s = OPTIM_MATOPS_ZERO_VEC(n_vals);
    Vec_t y = OPTIM_MATOPS_ZERO_VEC(n_vals);

    box_objfn(x, &grad, opt_data);

    double grad_err = OPTIM_MATOPS_L2NORM(grad);

    //ChangedForGPBoost
    //OPTIM_BFGS_TRACE(-1, grad_err, 0.0, x, d, grad, s, y, W);

    if (grad_err <= grad_err_tol) {
        return true;
    }

    // if ||gradient(initial values)|| > tolerance, continue

    d = - W*grad; // direction

    Vec_t x_p = x, grad_p = grad;

    line_search_mt(1.0, x_p, grad_p, d, &wolfe_cons_1, &wolfe_cons_2, box_objfn, opt_data);

    s = x_p - x;
    y = grad_p - grad;

    // update approx. inverse Hessian (W)

    double W_denom_term = OPTIM_MATOPS_DOT_PROD(y,s);
    Mat_t W_term_1;

    if (W_denom_term > 1E-10) {   
        // checking whether the curvature condition holds: y's > 0
        W_term_1 = I_mat - s * (OPTIM_MATOPS_TRANSPOSE_IN_PLACE(y)) / W_denom_term;
    
        // perform rank-1 update of inverse Hessian approximation
        W = W_term_1 * W * (OPTIM_MATOPS_TRANSPOSE_IN_PLACE(W_term_1)) + s * (OPTIM_MATOPS_TRANSPOSE_IN_PLACE(s)) / W_denom_term;
    } else {
        W = 0.1 * W;
    }

    grad = grad_p;

    grad_err = OPTIM_MATOPS_L2NORM(grad_p);
    double rel_sol_change = OPTIM_MATOPS_L1NORM( OPTIM_MATOPS_ARRAY_DIV_ARRAY(s, (OPTIM_MATOPS_ARRAY_ADD_SCALAR(OPTIM_MATOPS_ABS(x), 1.0e-08)) ) );

    //ChangedForGPBoost
    //OPTIM_BFGS_TRACE(0, grad_err, rel_sol_change, x_p, d, grad_p, s, y, W);

    if (grad_err <= grad_err_tol) {
        init_out_vals = x_p;
        return true;
    }

    // begin loop

    size_t iter = 0;

    bool has_converged = false;
    while (!has_converged) {
        ++iter;

        //

        d = - W*grad;

        line_search_mt(1.0, x_p, grad_p, d, &wolfe_cons_1, &wolfe_cons_2, box_objfn, opt_data);

        //

        s = x_p - x;
        y = grad_p - grad;

        W_denom_term = OPTIM_MATOPS_DOT_PROD(y,s);

        if (W_denom_term > 1E-10) {
            // checking the curvature condition y.s > 0
            W_term_1 = I_mat - s * OPTIM_MATOPS_TRANSPOSE_IN_PLACE(y) / W_denom_term;
        
            W = W_term_1 * W * OPTIM_MATOPS_TRANSPOSE_IN_PLACE(W_term_1) + s * OPTIM_MATOPS_TRANSPOSE_IN_PLACE(s) / W_denom_term;
        }

        grad_err = OPTIM_MATOPS_L2NORM(grad_p);
        rel_sol_change = OPTIM_MATOPS_L1NORM( OPTIM_MATOPS_ARRAY_DIV_ARRAY(s, (OPTIM_MATOPS_ARRAY_ADD_SCALAR(OPTIM_MATOPS_ABS(x), 1.0e-08)) ) );
        
        has_converged = !(grad_err > grad_err_tol && rel_sol_change > rel_sol_change_tol && iter < iter_max);
    
        //ChangedForGPBoost
        //OPTIM_BFGS_TRACE(iter, grad_err, rel_sol_change, x, d, grad, s, y, W);
        if (settings_inp) {
            settings_inp->opt_iter = iter - 1;
        }
        //redetermine neighbors for the Vecchia approximation if applicable
        Vec_t gradient_dummy(2);//"hack" for redermininig neighbors for the Vecchia approximation
        gradient_dummy[0] = 1.00000000001e30;
        gradient_dummy[1] = -1.00000000001e30;
        if (has_converged) {
            gradient_dummy[2] = 1.00000000001e30;//hack to force redetermination of nearest neighbors
        }
        else {
            gradient_dummy[2] = -1.00000000001e30;
        }
        double neighbors_have_been_redetermined = opt_objfn(x, &gradient_dummy, opt_data);
        //recalculated gradient of objective function if neighbors have been redetermined and check convergence again
        if (neighbors_have_been_redetermined >= 1e30 && neighbors_have_been_redetermined <= 1.00000000002e30) {//hack that indicates that the neighbors have been redetermined
            box_objfn(x_p, &grad_p, opt_data);
            grad_err = OPTIM_MATOPS_L2NORM(grad_p);
            has_converged = !(grad_err > grad_err_tol && rel_sol_change > rel_sol_change_tol && iter < iter_max);
        }

        x = x_p;
        grad = grad_p;

        //print trace information
        if ((iter < 10 || (iter % 10 == 0 && iter < 100) || (iter % 100 == 0 && iter < 1000) ||
            (iter % 1000 == 0 && iter < 10000) || (iter % 10000 == 0)) && (iter != iter_max)) {
            gradient_dummy[0] = -1.00000000001e30;//"hack" for printing nice logging information
            gradient_dummy[1] = 1.00000000001e30;
            opt_objfn(x, &gradient_dummy, opt_data);
        }
    }

    //

    if (vals_bound) {
        x_p = inv_transform(x_p, bounds_type, lower_bounds, upper_bounds);
    }

    error_reporting(init_out_vals, x_p, opt_objfn, opt_data, 
                    success, grad_err, grad_err_tol, iter, iter_max, 
                    conv_failure_switch, settings_inp);

    //
    
    return success;
}

inline
bool
bfgs(
    Vec_t& init_out_vals, 
    std::function<double (const Vec_t& vals_inp, Vec_t* grad_out, void* opt_data)> opt_objfn, 
    void* opt_data)
{
    return internal::bfgs_impl(init_out_vals, opt_objfn, opt_data, nullptr);
}

inline
bool
bfgs(
    Vec_t& init_out_vals, 
    std::function<double (const Vec_t& vals_inp, Vec_t* grad_out, void* opt_data)> opt_objfn, 
    void* opt_data, 
    algo_settings_t& settings)
{
    return internal::bfgs_impl(init_out_vals, opt_objfn, opt_data, &settings);
}

#endif
