SingleLaplacianInferenceMethod.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2013 Roman Votyakov
 * Copyright (C) 2012 Jacob Walker
 * Copyright (C) 2013 Roman Votyakov
 *
 * Code adapted from Gaussian Process Machine Learning Toolbox
 * http://www.gaussianprocess.org/gpml/code/matlab/doc/
 * This code specifically adapted from infLaplace.m
 */

#include <shogun/machine/gp/SingleLaplacianInferenceMethod.h>

#ifdef HAVE_EIGEN3

#include <shogun/machine/gp/StudentsTLikelihood.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/external/brent.h>
#include <shogun/mathematics/eigen3.h>

using namespace shogun;
using namespace Eigen;

namespace shogun
{

#ifndef DOXYGEN_SHOULD_SKIP_THIS

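/* Objective functor for Brent's line search in update_alpha(): given a step
 * size x along the Newton direction dalpha, it sets alpha=start_alpha+x*dalpha,
 * recomputes f=K*alpha+m (updating dlp and W as side effects), and returns
 * psi(alpha)=alpha'*(f-m)/2-log p(y|f), the quantity minimized at the
 * posterior mode.
 */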
class CPsiLine : public func_base
{
public:
    float64_t scale;
    MatrixXd K;
    VectorXd dalpha;
    VectorXd start_alpha;
    Map<VectorXd>* alpha;
    SGVector<float64_t>* dlp;
    SGVector<float64_t>* W;
    SGVector<float64_t>* f;
    SGVector<float64_t>* m;
    CLikelihoodModel* lik;
    CLabels* lab;

    virtual double operator() (double x)
    {
        Map<VectorXd> eigen_f(f->vector, f->vlen);
        Map<VectorXd> eigen_m(m->vector, m->vlen);

        // compute alpha=alpha+x*dalpha and f=K*alpha+m
        (*alpha)=start_alpha+x*dalpha;
        eigen_f=K*(*alpha)*CMath::sq(scale)+eigen_m;

        // get first and second derivatives of log likelihood
        (*dlp)=lik->get_log_probability_derivative_f(lab, (*f), 1);

        (*W)=lik->get_log_probability_derivative_f(lab, (*f), 2);
        W->scale(-1.0);

        // compute psi=alpha'*(f-m)/2-lp
        float64_t result=(*alpha).dot(eigen_f-eigen_m)/2.0-
            SGVector<float64_t>::sum(lik->get_log_probability_f(lab, *f));

        return result;
    }
};

#endif /* DOXYGEN_SHOULD_SKIP_THIS */

CSingleLaplacianInferenceMethod::CSingleLaplacianInferenceMethod() : CLaplacianInferenceBase()
{
    init();
}

CSingleLaplacianInferenceMethod::CSingleLaplacianInferenceMethod(CKernel* kern,
        CFeatures* feat, CMeanFunction* m, CLabels* lab, CLikelihoodModel* mod)
        : CLaplacianInferenceBase(kern, feat, m, lab, mod)
{
    init();
}

void CSingleLaplacianInferenceMethod::init()
{
    SG_ADD(&m_sW, "sW", "square root of W", MS_NOT_AVAILABLE);
    SG_ADD(&m_d2lp, "d2lp", "second derivative of log likelihood with respect to function location", MS_NOT_AVAILABLE);
    SG_ADD(&m_d3lp, "d3lp", "third derivative of log likelihood with respect to function location", MS_NOT_AVAILABLE);
}

SGVector<float64_t> CSingleLaplacianInferenceMethod::get_diagonal_vector()
{
    if (parameter_hash_changed())
        update();

    return SGVector<float64_t>(m_sW);
}

CSingleLaplacianInferenceMethod::~CSingleLaplacianInferenceMethod()
{
}

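/* The approximate negative log marginal likelihood under the Laplace
 * approximation (see Rasmussen & Williams, "Gaussian Processes for Machine
 * Learning", ch. 3):
 *   -log Z ~ alpha'*(f-m)/2 - log p(y|f) + log|B|/2,  B = I + sW*K*sW
 * For log-concave likelihoods log|B|/2 equals sum(log(diag(L))) with
 * L=chol(B); otherwise the determinant is taken from an LU factorization.
 */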
float64_t CSingleLaplacianInferenceMethod::get_negative_log_marginal_likelihood()
{
    if (parameter_hash_changed())
        update();

    // create eigen representations alpha, f, W, L
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);
    Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
    Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);

    // get mean vector and create eigen representation of it
    SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
    Map<VectorXd> eigen_mean(mean.vector, mean.vlen);

    // get log likelihood
    float64_t lp=SGVector<float64_t>::sum(m_model->get_log_probability_f(m_labels,
        m_mu));

    float64_t result;

    if (eigen_W.minCoeff()<0)
    {
        Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);
        Map<MatrixXd> eigen_ktrtr(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);

        FullPivLU<MatrixXd> lu(MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols)+
            eigen_ktrtr*CMath::sq(m_scale)*eigen_sW.asDiagonal());

        result=(eigen_alpha.dot(eigen_mu-eigen_mean))/2.0-
            lp+log(lu.determinant())/2.0;
    }
    else
    {
        result=eigen_alpha.dot(eigen_mu-eigen_mean)/2.0-lp+
            eigen_L.diagonal().array().log().sum();
    }

    return result;
}

void CSingleLaplacianInferenceMethod::update_approx_cov()
{
    Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);

    m_Sigma=SGMatrix<float64_t>(m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows, m_Sigma.num_cols);

    // compute V = L^(-1) * W^(1/2) * K, using upper triangular factor L^T
    MatrixXd eigen_V=eigen_L.triangularView<Upper>().adjoint().solve(
        eigen_sW.asDiagonal()*eigen_K*CMath::sq(m_scale));

    // compute covariance matrix of the posterior:
    // Sigma = K - K * W^(1/2) * (L * L^T)^(-1) * W^(1/2) * K =
    // K - (K * W^(1/2)) * (L^T)^(-1) * L^(-1) * W^(1/2) * K =
    // K - (W^(1/2) * K)^T * (L^(-1))^T * L^(-1) * W^(1/2) * K = K - V^T * V
    eigen_Sigma=eigen_K*CMath::sq(m_scale)-eigen_V.adjoint()*eigen_V;
}

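/* The posterior "Cholesky" factor kept in m_L. For log-concave likelihoods
 * (all W_i >= 0) it is the upper triangular factor of B = I + sW*K*sW; for
 * likelihoods that can produce negative W (e.g. Student's t) the code
 * follows infLaplace.m and stores -(K + inv(W))^-1 instead.
 */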
void CSingleLaplacianInferenceMethod::update_chol()
{
    Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);

    // create eigen representation of kernel matrix
    Map<MatrixXd> eigen_ktrtr(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);

    // create shogun and eigen representation of posterior cholesky
    m_L=SGMatrix<float64_t>(m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);

    if (eigen_W.minCoeff() < 0)
    {
        // compute inverse of diagonal noise: iW = 1/W
        VectorXd eigen_iW=(VectorXd::Ones(m_W.vlen)).cwiseQuotient(eigen_W);

        FullPivLU<MatrixXd> lu(
            eigen_ktrtr*CMath::sq(m_scale)+MatrixXd(eigen_iW.asDiagonal()));

        // compute cholesky: L = -(K + iW)^-1
        eigen_L=-lu.inverse();
    }
    else
    {
        Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);

        // compute cholesky: L = chol(sW * sW' .* K + I)
        LLT<MatrixXd> L(
            (eigen_sW*eigen_sW.transpose()).cwiseProduct(eigen_ktrtr*CMath::sq(m_scale))+
            MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols));

        eigen_L=L.matrixU();
    }
}

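/* Mode finding via Newton's method (GPML book, algorithm 3.1): each step
 * computes b = W*(f-m) + dlp and the Newton direction
 *   dalpha = b - sW*B^-1*(sW*(K*b)) - alpha,  B = I + sW*K*sW,
 * then searches along dalpha with Brent's method on the CPsiLine objective
 * until psi decreases by less than m_tolerance or m_iter is reached.
 */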
void CSingleLaplacianInferenceMethod::update_alpha()
{
    float64_t Psi_Old=CMath::INFTY;
    float64_t Psi_New;
    float64_t Psi_Def;

    // get mean vector and create eigen representation of it
    SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
    Map<VectorXd> eigen_mean(mean.vector, mean.vlen);

    // create eigen representation of kernel matrix
    Map<MatrixXd> eigen_ktrtr(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);

    // create shogun and eigen representation of function vector
    m_mu=SGVector<float64_t>(mean.vlen);
    Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);

    if (m_alpha.vlen!=m_labels->get_num_labels())
    {
        // set alpha a zero vector
        m_alpha=SGVector<float64_t>(m_labels->get_num_labels());
        m_alpha.zero();

        // f = mean, if length of alpha and length of y doesn't match
        eigen_mu=eigen_mean;

        // compute W = -d2lp
        m_W=m_model->get_log_probability_derivative_f(m_labels, m_mu, 2);
        m_W.scale(-1.0);

        Psi_New=-SGVector<float64_t>::sum(m_model->get_log_probability_f(
            m_labels, m_mu));
    }
    else
    {
        Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);

        // compute f = K * alpha + m
        eigen_mu=eigen_ktrtr*CMath::sq(m_scale)*eigen_alpha+eigen_mean;

        // compute W = -d2lp
        m_W=m_model->get_log_probability_derivative_f(m_labels, m_mu, 2);
        m_W.scale(-1.0);

        Psi_New=eigen_alpha.dot(eigen_mu-eigen_mean)/2.0-
            SGVector<float64_t>::sum(m_model->get_log_probability_f(m_labels, m_mu));

        Psi_Def=-SGVector<float64_t>::sum(m_model->get_log_probability_f(m_labels, mean));

        // if default is better, then use it
        if (Psi_Def < Psi_New)
        {
            m_alpha.zero();
            eigen_mu=eigen_mean;
            Psi_New=-SGVector<float64_t>::sum(m_model->get_log_probability_f(
                m_labels, m_mu));
        }
    }

    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);

    // get first derivative of log probability function
    m_dlp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 1);

    // create shogun and eigen representation of sW
    m_sW=SGVector<float64_t>(m_W.vlen);
    Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);

    index_t iter=0;

    while (Psi_Old-Psi_New>m_tolerance && iter<m_iter)
    {
        Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
        Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);

        Psi_Old=Psi_New;
        iter++;

        if (eigen_W.minCoeff() < 0)
        {
            // Suggested by Vanhatalo et al.,
            // Gaussian Process Regression with Student's t likelihood, NIPS 2009
            // Quoted from infLaplace.m
            float64_t df;

            if (m_model->get_model_type()==LT_STUDENTST)
            {
                CStudentsTLikelihood* lik=CStudentsTLikelihood::obtain_from_generic(m_model);
                df=lik->get_degrees_freedom();
                SG_UNREF(lik);
            }
            else
                df=1;

            eigen_W+=(2.0/df)*eigen_dlp.cwiseProduct(eigen_dlp);
        }

        // compute sW = sqrt(W)
        eigen_sW=eigen_W.cwiseSqrt();

        LLT<MatrixXd> L((eigen_sW*eigen_sW.transpose()).cwiseProduct(eigen_ktrtr*CMath::sq(m_scale))+
            MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols));

        VectorXd b=eigen_W.cwiseProduct(eigen_mu-eigen_mean)+eigen_dlp;

        VectorXd dalpha=b-eigen_sW.cwiseProduct(
            L.solve(eigen_sW.cwiseProduct(eigen_ktrtr*b*CMath::sq(m_scale))))-eigen_alpha;

        // perform Brent's optimization
        CPsiLine func;

        func.scale=m_scale;
        func.K=eigen_ktrtr;
        func.dalpha=dalpha;
        func.start_alpha=eigen_alpha;
        func.alpha=&eigen_alpha;
        func.dlp=&m_dlp;
        func.f=&m_mu;
        func.m=&mean;
        func.W=&m_W;
        func.lik=m_model;
        func.lab=m_labels;

        float64_t x;
        Psi_New=local_min(0, m_opt_max, m_opt_tolerance, func, x);
    }

    // compute f = K * alpha + m
    eigen_mu=eigen_ktrtr*CMath::sq(m_scale)*eigen_alpha+eigen_mean;

    // get log probability derivatives
    m_dlp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 1);
    m_d2lp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 2);
    m_d3lp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 3);

    // W = -d2lp
    m_W=m_d2lp.clone();
    m_W.scale(-1.0);

    // compute sW
    Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);

    if (eigen_W.minCoeff()>0)
        eigen_sW=eigen_W.cwiseSqrt();
    else
        eigen_sW.setZero();
}

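/* Precompute quantities shared by the get_derivative_wrt_*() methods:
 * Z = sW*B^-1*sW (or -(K+inv(W))^-1 when W has negative entries),
 * g with g_i = d(log|B|)/dW_i / 2, i.e. half the diagonal of the posterior
 * covariance, and dfhat = g .* d3lp, the derivative of the approximate
 * marginal likelihood wrt the posterior mode fhat.
 */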
void CSingleLaplacianInferenceMethod::update_deriv()
{
    // create eigen representation of W, sW, dlp, d3lp, K, alpha and L
    Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
    Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);
    Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);
    Map<VectorXd> eigen_d3lp(m_d3lp.vector, m_d3lp.vlen);
    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);

    // create shogun and eigen representation of matrix Z
    m_Z=SGMatrix<float64_t>(m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_Z(m_Z.matrix, m_Z.num_rows, m_Z.num_cols);

    // create shogun and eigen representation of the vector g
    m_g=SGVector<float64_t>(m_Z.num_rows);
    Map<VectorXd> eigen_g(m_g.vector, m_g.vlen);

    if (eigen_W.minCoeff()<0)
    {
        eigen_Z=-eigen_L;

        // compute iA = (I + K * diag(W))^-1
        FullPivLU<MatrixXd> lu(MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols)+
            eigen_K*CMath::sq(m_scale)*eigen_W.asDiagonal());
        MatrixXd iA=lu.inverse();

        // compute derivative ln|L'*L| wrt W: g=sum(iA.*K,2)/2
        eigen_g=(iA.cwiseProduct(eigen_K*CMath::sq(m_scale))).rowwise().sum()/2.0;
    }
    else
    {
        // solve L'*L*Z=diag(sW) and compute Z=diag(sW)*Z
        eigen_Z=eigen_L.triangularView<Upper>().adjoint().solve(
            MatrixXd(eigen_sW.asDiagonal()));
        eigen_Z=eigen_L.triangularView<Upper>().solve(eigen_Z);
        eigen_Z=eigen_sW.asDiagonal()*eigen_Z;

        // solve L'*C=diag(sW)*K
        MatrixXd C=eigen_L.triangularView<Upper>().adjoint().solve(
            eigen_sW.asDiagonal()*eigen_K*CMath::sq(m_scale));

        // compute derivative ln|L'*L| wrt W: g=(diag(K)-sum(C.^2,1)')/2
        eigen_g=(eigen_K.diagonal()*CMath::sq(m_scale)-
            (C.cwiseProduct(C)).colwise().sum().adjoint())/2.0;
    }

    // create shogun and eigen representation of the vector dfhat
    m_dfhat=SGVector<float64_t>(m_g.vlen);
    Map<VectorXd> eigen_dfhat(m_dfhat.vector, m_dfhat.vlen);

    // compute derivative of nlZ wrt fhat
    eigen_dfhat=eigen_g.cwiseProduct(eigen_d3lp);
}

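/* Gradient of -log Z wrt the kernel scale. With dK = d(scale^2*K)/d(scale)
 * = 2*scale*K, the explicit part is sum(sum(Z.*dK))/2 - alpha'*dK*alpha/2
 * and the implicit part, which accounts for the dependence of the mode fhat
 * on the parameter, is -dfhat'*(b - K*(Z*b)) with b = dK*dlp.
 */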
SGVector<float64_t> CSingleLaplacianInferenceMethod::get_derivative_wrt_inference_method(
    const TParameter* param)
{
    REQUIRE(!strcmp(param->m_name, "scale"), "Can't compute derivative of "
        "the negative log marginal likelihood wrt %s.%s parameter\n",
        get_name(), param->m_name)

    // create eigen representation of K, Z, dfhat, dlp and alpha
    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<MatrixXd> eigen_Z(m_Z.matrix, m_Z.num_rows, m_Z.num_cols);
    Map<VectorXd> eigen_dfhat(m_dfhat.vector, m_dfhat.vlen);
    Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);

    SGVector<float64_t> result(1);

    // compute derivative K wrt scale
    MatrixXd dK=eigen_K*m_scale*2.0;

    // compute dnlZ=sum(sum(Z.*dK))/2-alpha'*dK*alpha/2
    result[0]=(eigen_Z.cwiseProduct(dK)).sum()/2.0-
        (eigen_alpha.adjoint()*dK).dot(eigen_alpha)/2.0;

    // compute b=dK*dlp
    VectorXd b=dK*eigen_dlp;

    // compute dnlZ=dnlZ-dfhat'*(b-K*(Z*b))
    result[0]=result[0]-eigen_dfhat.dot(b-eigen_K*CMath::sq(m_scale)*(eigen_Z*b));

    return result;
}

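/* Gradient of -log Z wrt likelihood parameters (e.g. the Student's t degrees
 * of freedom), combining the explicit terms -g'*d2lp_dhyp - sum(lp_dhyp)
 * with the same implicit correction through the mode as above.
 */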
SGVector<float64_t> CSingleLaplacianInferenceMethod::get_derivative_wrt_likelihood_model(
    const TParameter* param)
{
    // create eigen representation of K, Z, g and dfhat
    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<MatrixXd> eigen_Z(m_Z.matrix, m_Z.num_rows, m_Z.num_cols);
    Map<VectorXd> eigen_g(m_g.vector, m_g.vlen);
    Map<VectorXd> eigen_dfhat(m_dfhat.vector, m_dfhat.vlen);

    // get derivatives wrt likelihood model parameters
    SGVector<float64_t> lp_dhyp=m_model->get_first_derivative(m_labels,
        m_mu, param);
    SGVector<float64_t> dlp_dhyp=m_model->get_second_derivative(m_labels,
        m_mu, param);
    SGVector<float64_t> d2lp_dhyp=m_model->get_third_derivative(m_labels,
        m_mu, param);

    // create eigen representation of the derivatives
    Map<VectorXd> eigen_lp_dhyp(lp_dhyp.vector, lp_dhyp.vlen);
    Map<VectorXd> eigen_dlp_dhyp(dlp_dhyp.vector, dlp_dhyp.vlen);
    Map<VectorXd> eigen_d2lp_dhyp(d2lp_dhyp.vector, d2lp_dhyp.vlen);

    SGVector<float64_t> result(1);

    // compute b vector
    VectorXd b=eigen_K*eigen_dlp_dhyp;

    // compute dnlZ=-g'*d2lp_dhyp-sum(lp_dhyp)-dfhat'*(b-K*(Z*b))
    result[0]=-eigen_g.dot(eigen_d2lp_dhyp)-eigen_lp_dhyp.sum()-
        eigen_dfhat.dot(b-eigen_K*CMath::sq(m_scale)*(eigen_Z*b));

    return result;
}

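/* Gradient of -log Z wrt kernel parameters; vector-valued parameters yield
 * one entry per component, with dK taken from the kernel's parameter
 * gradient for each index.
 */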
SGVector<float64_t> CSingleLaplacianInferenceMethod::get_derivative_wrt_kernel(
    const TParameter* param)
{
    // create eigen representation of K, Z, dfhat, dlp and alpha
    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<MatrixXd> eigen_Z(m_Z.matrix, m_Z.num_rows, m_Z.num_cols);
    Map<VectorXd> eigen_dfhat(m_dfhat.vector, m_dfhat.vlen);
    Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);

    SGVector<float64_t> result;

    if (param->m_datatype.m_ctype==CT_VECTOR ||
        param->m_datatype.m_ctype==CT_SGVECTOR)
    {
        REQUIRE(param->m_datatype.m_length_y,
            "Length of the parameter %s should not be NULL\n", param->m_name)
        result=SGVector<float64_t>(*(param->m_datatype.m_length_y));
    }
    else
    {
        result=SGVector<float64_t>(1);
    }

    for (index_t i=0; i<result.vlen; i++)
    {
        SGMatrix<float64_t> dK;

        if (result.vlen==1)
            dK=m_kernel->get_parameter_gradient(param);
        else
            dK=m_kernel->get_parameter_gradient(param, i);

        Map<MatrixXd> eigen_dK(dK.matrix, dK.num_rows, dK.num_cols);

        // compute dnlZ=sum(sum(Z.*dK))/2-alpha'*dK*alpha/2
        result[i]=(eigen_Z.cwiseProduct(eigen_dK)).sum()/2.0-
            (eigen_alpha.adjoint()*eigen_dK).dot(eigen_alpha)/2.0;

        // compute b=dK*dlp
        VectorXd b=eigen_dK*eigen_dlp;

        // compute dnlZ=dnlZ-dfhat'*(b-K*(Z*b))
        result[i]=result[i]-eigen_dfhat.dot(b-eigen_K*CMath::sq(m_scale)*
            (eigen_Z*b));
    }

    return result;
}

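/* Gradient of -log Z wrt mean function parameters; analogous to the kernel
 * case, but with dmu = d(mean)/d(param) entering linearly:
 * dnlZ = -alpha'*dmu - dfhat'*(dmu - K*(Z*dmu)).
 */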
SGVector<float64_t> CSingleLaplacianInferenceMethod::get_derivative_wrt_mean(
    const TParameter* param)
{
    // create eigen representation of K, Z, dfhat and alpha
    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<MatrixXd> eigen_Z(m_Z.matrix, m_Z.num_rows, m_Z.num_cols);
    Map<VectorXd> eigen_dfhat(m_dfhat.vector, m_dfhat.vlen);
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);

    SGVector<float64_t> result;

    if (param->m_datatype.m_ctype==CT_VECTOR ||
        param->m_datatype.m_ctype==CT_SGVECTOR)
    {
        REQUIRE(param->m_datatype.m_length_y,
            "Length of the parameter %s should not be NULL\n", param->m_name)

        result=SGVector<float64_t>(*(param->m_datatype.m_length_y));
    }
    else
    {
        result=SGVector<float64_t>(1);
    }

    for (index_t i=0; i<result.vlen; i++)
    {
        SGVector<float64_t> dmu;

        if (result.vlen==1)
            dmu=m_mean->get_parameter_derivative(m_features, param);
        else
            dmu=m_mean->get_parameter_derivative(m_features, param, i);

        Map<VectorXd> eigen_dmu(dmu.vector, dmu.vlen);

        // compute dnlZ=-alpha'*dm-dfhat'*(dm-K*(Z*dm))
        result[i]=-eigen_alpha.dot(eigen_dmu)-eigen_dfhat.dot(eigen_dmu-
            eigen_K*CMath::sq(m_scale)*(eigen_Z*eigen_dmu));
    }

    return result;
}
}

#endif /* HAVE_EIGEN3 */
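
For context, a minimal usage sketch of this inference method (not part of the file above). The data values, kernel width, and function name are illustrative; it assumes the standard shogun feature, label, kernel, mean, and likelihood classes, and omits error handling:

// Hypothetical example: GP binary classification on four 1-D points.
#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/BinaryLabels.h>
#include <shogun/kernel/GaussianKernel.h>
#include <shogun/machine/gp/ZeroMean.h>
#include <shogun/machine/gp/LogitLikelihood.h>
#include <shogun/machine/gp/SingleLaplacianInferenceMethod.h>

using namespace shogun;

void laplacian_inference_example()
{
    SGMatrix<float64_t> X(1, 4);          // 4 one-dimensional inputs
    SGVector<float64_t> y(4);             // binary labels in {-1,+1}
    X(0,0)=-2.0; X(0,1)=-1.0; X(0,2)=1.0; X(0,3)=2.0;
    y[0]=-1.0; y[1]=-1.0; y[2]=1.0; y[3]=1.0;

    CDenseFeatures<float64_t>* feat=new CDenseFeatures<float64_t>(X);
    CBinaryLabels* lab=new CBinaryLabels(y);
    CGaussianKernel* kernel=new CGaussianKernel(10, 2.0);
    CZeroMean* mean=new CZeroMean();
    CLogitLikelihood* lik=new CLogitLikelihood();

    CSingleLaplacianInferenceMethod* inf=new CSingleLaplacianInferenceMethod(
        kernel, feat, mean, lab, lik);
    SG_REF(inf);

    // triggers update() (mode finding, Cholesky, derivatives) on first access
    float64_t nlZ=inf->get_negative_log_marginal_likelihood();
    SG_UNREF(inf);
}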