Source code for itcsimlib.itc_fit

"""Provides model parameter optimization to fit experimental data.


"""

import random
import numpy
import scipy.optimize

from collections import OrderedDict

from .thermo	import *
from .utilities	import *


[docs]class ITCFit:
	"""A class for optimizing model parameters to accurately fit experimental data.

	Attributes
	----------
	model : ITCModel
		The model to use for fitting the experimental data.
	bounds : dict of tuples
		A parameter-name keyed dict of low and high bounds to enforce during fitting (retrieved from the model itself, or explicitly provided by the user).
	chisq : float
		The most recently evaluated goodness-of-fit chisquare.
	"""

	def __init__(self, sim, method='simplex', method_args={}, verbose=False):
		"""Constructor function for the ITCFit object.

		Arguments
		---------
		sim : ITCSim
			The ITCSim object that possesses the experimental data to replicate.
		bounds : dict of tuples
			A parameter-name keyed dict of low and high bounds to enforce during fitting.
		method : string
			The optimization algorithm to use.
		method_args : dict
			Arguments to pass to the optimization algorithm (method specific).
		verbose : boolean
			Print additional information to the console?
		"""

		self.sim = sim
		self.model	= self.sim.model
		self.method	= method
		self.method_args = method_args
		self.verbose = verbose
		self.chisq = 0.0

		# obtain model-defined boundaries to enforce during fitting
		self.bounds = dict( (name,self.model.get_param_bounds(name)) for name in self.model.get_param_names() )
		
	def _noisy_bisect(self, f, a, b, fa, fb, tolerance):
		# From http://www.johndcook.com/blog/2012/06/14/root-finding-with-noisy-functions/
		# Assume a < b, fa = f(a) < 0, fb = f(b) > 0.
		if b-a < tolerance:
			return (a, b)
		mid = 0.5*(a+b)
		fmid = f(mid)
		if fmid < fa or fmid > fb: # Monotonicity violated. Reached resolution of noise.
			return (a, b)
		if fmid < 0:
			a, fa = mid, fmid
		else:
			b, fb = mid, fmid
		return self._noisy_bisect(f, a, b, fa, fb, tolerance)
		
[docs]	def set_sim(self, sim):
		"""Sets the simulator to use for fitting."""
		self.sim = sim
		self.model = self.sim.model
		
[docs]	def get_sim(self):
		"""Returns the simulator used for fitting."""
		return self.sim
	
[docs]	def add_bounds(self, param, low=None, high=None):
		"""Add low and/or high boundaries for the specified parameter.

		Arguments
		---------
		param : string
			The name of the parameter to create the boundary for
		low : float
			The lower boundary for the parameter, or None for no boundary
		high : float
			The upper boundary for the parameter, or None for no boundary

		Returns
		-------
		None
		
		Notes
		-----
			Implementation of parameter boundaries is fit method dependent and can be unpredictable. Use at your own risk, and sparingly.
		"""

		assert param in self.model.get_param_names()
		self.bounds[param] = (low,high)

[docs]	def optimize(self, params=[], callback=None, update_fits=False, use_bounds=True ):
		"""Optimize the specified parameters.

		Arguments
		---------
		params : dict of strings
			The names of the model parameters to optimize.
		callback : function
			A callback function to call at each optimization step, will be passed a vector of the current parameter values.
		update_fits : boolean
			Update the dQ_fit attributes of the experiments in the simulator object?

		Returns
		_______
		(dict,float)
			A parameter-name keyed dict of optimized values, and the corresponding goodness-of-fit.
		"""

		if self.verbose:
			print("\nitc_fit: Optimizing %s parameters using the %s algorithm\n"%(",".join(params),self.method))
			def _printer(x):
				print("itc_fit: %s (%f)" %(" ".join(map(str,x)),self.sim.chisq))
		else:
			def _printer(x):
				return

		# starting parameter values to restore later
		start_params = self.sim.get_model_params().copy()
		
		# initial param guesses as list
		x0 = [start_params[p] for p in params]
		
		# the target objective function to minimize
		def _target(x,sim):
			for i,p in enumerate(params):
				sim.set_model_param(p, x[i])

			if use_bounds:
				m = self._apply_bounds()
				if m > 0:
					return sim.run(writeback=False)**(1.0+m)

			return sim.run(writeback=False)
	
		# optimize parameters
		opt = self._fitter( _target, x0, callback )

		ret = OrderedDict( (p,opt[0][i]) for i,p in enumerate(params) )
		self.sim.set_model_params(**ret)

		if update_fits:
			self.sim.run()
		else: # restore initial parameter values
			self.sim.set_model_params(**start_params)
	
		# return the optimized parameters and the chisquare value
		return ret,opt[1]
		
[docs]	def estimate(self, params, method='bootstrap', *args, **kwargs ):
		"""Wrapper for the two methods of estimating uncertainties in the fitted parameter values
		
		Arguments
		--------
		params : list of strings
			The names of the model parameters to find intervals for.
		method : string
			The method to use for interval estimation (either "sigma" or "bootstrap")
		*args
			Positional arguments to pass to either estimate_sigma or estimate_bootstrap
		**kwargs
			Keyword arguments to pass to either estimate_sigma or estimate_bootstrap
		
		Returns
		-------
		dict of tuples
			A parameter-name keyed dict of tuples consisting of the mean and standard deviation, or high and low values of the provided parameters.
		"""
					
		assert method in ['sigma','bootstrap']
	
		if method == 'sigma':
			return self.estimate_sigma( params, *args, **kwargs )
		else:
			return self.estimate_bootstrap( params, *args, **kwargs )
	
[docs]	def estimate_sigma(self, params=[], params_opt=None, sigma=None, stdevs=1, estimate=0.1, rootfinder='bisect', tolerance=0.001 ):
		"""Generate high and low parameter value estimates for optimized parameters according to the provided criterion
				
		Arguments
		---------
		params : list of strings
			The names of the model parameters to find intervals for.
		params_opt : list of strings
			The names of the model parameters to optimize. Setting to None implies that all available model parameters will be optimized (except the parameter that is being evaluated).
		sigma : float
			The critical chisq value of the overall fit to use for estimating a parameter's expected maximum and minimum (to be used in lieu of the stdevs parameter) 
		stdevs : int
			The number of standard deviations above the best-fit chi-square value to use as the critical cutoff for estimating a parameter's expected maximum and minimum (to be used in lieu of the sigma parameter)
		estimate : float
			A guess of how large the change from the optimized parameter values should be (up or down) in order to generate the critical chi-square value.
		rootfinder : string
			The algorithm used to find the root of the target function ("bisect" or "secant")
		tolerance : float
			The tolerance (in standard deviations) used to find the critical parameter values. Smaller numbers mean a more accurate estimation of a parameter's estimated maximum and minimum values.

		Returns
		-------
		(dict of tuples)
			A parameter-name keyed dict of tuples consisting of the high and low estimates for each of the parameters in the "params" argument.
		"""
		
		assert rootfinder in ("bisect","secant")
		
		model_params = self.sim.get_model_params()

		# starting parameter values to restore later
		start_params = self.sim.get_model_params().copy()
		
		if sigma == None: # calculate the expected sigma based on the number of observations
			# Andrae, Rene, Tim Schulze-Hartung, and Peter Melchior. "Dos and don'ts of reduced chi-squared." arXiv preprint arXiv:1012.3754 (2010).
			sigma = stdevs * (numpy.sqrt( 2.0 / sum([ e.npoints - len(e.skip) for e in self.sim.experiments ]) ))
		
		# the critical chisq is the point where the low and high parameter estimates are obtained
		critical_chisq = self.sim.run() + sigma
				
		param_values = {}
		for p in params:
			param_values[p] = [None,None]

			if params_opt == None: # get the model parameters that are to be optimized while the param "p" is gridded
				params_opt = list(self.sim.get_model_params().keys())

			try: # remove the parameter to be held constant from the list of those to be optimized
				params_opt.remove(p)
			except ValueError:
				pass

			def target_function( x ): # return the discrepancy between the critical chisq and the chisq of the best fit when parameter p is fixed to argument x
				self.sim.set_model_param(p,x)
				return critical_chisq - self.optimize(params_opt)[1]

			# find a LOW value for the model parameter "p" that exceeds the critical chi-square value (to serve as a bracketing point for starting the boundary search)
			self.sim.set_model_params(**start_params)
			estimate_counter,param_values[p][0] = estimate,model_params[p] * (1.0-estimate)
			chisq_diff = target_function( param_values[p][0] )
			while chisq_diff > 0: # if necessary, decrease the fixed parameter value until we've exceeded the critical chisq
				estimate_counter = estimate_counter + estimate
				if self.verbose:
					print("itc_fit: Lower guess (%f) for parameter \"%s\" is insufficient (%f from critical chisq). Decreasing parameter value to %f." % (param_values[p][0],p,chisq_diff,model_params[p] * (1.0-estimate_counter)))
				param_values[p][0] = model_params[p] * (1.0-estimate_counter)
				chisq_diff = target_function( param_values[p][0] )
			
			# find the low param value that provides the desired confidence interval
			if rootfinder == "secant":
				param_values[p][0] = scipy.optimize.brentq(target_function, model_params[p], param_values[p][0], xtol=tolerance)
			else:
				param_values[p][0] = sum(self._noisy_bisect(target_function, model_params[p], param_values[p][0], sigma, chisq_diff, tolerance))/2.0
			
			# find a HIGH value for the model parameter "p" that exceeds the critical chi-square value (to serve as a bracketing point for starting the boundary search)
			self.sim.set_model_params(**start_params)
			estimate_counter,param_values[p][1] = estimate,model_params[p] * (1.0+estimate)
			chisq_diff = target_function( param_values[p][1] )
			while chisq_diff > 0: # if necessary, increase the fixed parameter value until we've exceeded the critical chisq
				estimate_counter = estimate_counter + estimate
				if self.verbose:
					print("itc_fit: Upper guess (%f) for parameter \"%s\" is insufficient (%f from critical chisq). Increasing parameter value to %f." % (param_values[p][1],p,chisq_diff,model_params[p] * (1.0+estimate_counter)))
				param_values[p][1] = model_params[p] * (1.0+estimate_counter)
				chisq_diff = target_function( param_values[p][1] )
			
			# find the high param value that provides the desired confidence interval
			if rootfinder == "secant":
				param_values[p][1] = scipy.optimize.brentq(target_function, model_params[p], param_values[p][1], xtol=tolerance)
			else:
				param_values[p][1] = sum(self._noisy_bisect(target_function, model_params[p], param_values[p][1], sigma, chisq_diff, tolerance))/2.0

			# From chapter 2 tutorial data (stdevs=2, rootfinder='secant'):
			#{'dG': [-10.727396269988125, -11.070049429908266], 'dCp': [-0.09939296467688882, -0.1524609718306775], 'dH': [-11.493031681092932, -12.022877990152883], 'n': [1.778168415517038, 1.8276370690304506]}
			
		# restore initial parameter values for the next iteration
		self.sim.set_model_params(**start_params)
			
		return param_values
				
[docs]	def estimate_bootstrap(self, params=[], bootstraps=100, randomize=0.1, callback=None, logfile=None ):
		"""Generate confidence intervals for optimized parameters by bootstrapping (also known as jackknife estimation)

		Arguments
		---------
		params : list of strings
			The names of the model parameters to find intervals for.
		bootstraps : int
			The number of synthetic datasets to refit.
		randomize : float
			A fraction by which to perturb the starting parameters before optimization.
		callback : function
			A callback function to call at each optimization step, will be passed a vector of the current parameter values.
		logfile : string
			A file path to write the optimized parameter values for each bootstrap dataset

		Returns
		-------
		(dict of tuples)
			A parameter-name keyed dict of tuples consisting of the mean and standard deviation of the optimized values.
		"""

		if self.verbose:
			print("\nitc_fit: Estimating uncertainty intervals for %s using %i bootstraps\n"%(",".join(params),bootstraps))

		# initialize to make sure we have fit data to use
		self.sim.run()

		# starting parameter values to restore later
		start_params = self.sim.get_model_params().copy()

		# store the original experimental and fit data for restoration later
		dQ_exp,dQ_fit = [],[]
		for E in self.sim.experiments:
			assert len(E.dQ_exp) == len(E.dQ_fit)
			dQ_exp.append( E.dQ_exp[:] )
			dQ_fit.append( E.dQ_fit[:] )

		# generates a synthetic dataset from the fit and fit residuals
		def _make_bootstrap(n,exp,fit):
			res = [ exp[i]-fit[i] for i in range(n) ]
			return [ f + random.choice(res) for f in exp ]

		param_values = OrderedDict( (p,[]) for p in params )
		for i in range(bootstraps):
			if self.verbose:
				print("itc_fit: Bootstrap %i"%(i))

			# randomize starting point by user-specifiable amount
			for p in params:
				self.sim.set_model_param(p,self.sim.get_model_param(p) * (1+(2*random.random()-0.5)*randomize))

			# replace the experimental data points with the synthetic data
			for j,E in enumerate(self.sim.experiments):
				E.dQ_exp = _make_bootstrap(E.npoints,dQ_exp[j],dQ_fit[j])

			# re-optimize the parameters using the new, synthetic datasets
			optimized,chisq = self.optimize(params)

			# append the optimized parameter values to our growing array
			for p in optimized:
				param_values[p].append( optimized[p] )

			if callback != None:
				callback(optimized)

			if logfile != None:
				write_params_to_file(logfile,optimized,header=(i==0),post="%.3f"%(chisq))

			# restore initial parameter values for the next iteration
			self.sim.set_model_params(**start_params)

		# restore original experimental data and fit for the experiments
		for i,E in enumerate(self.sim.experiments):
			E.dQ_exp = dQ_exp[i]
			E.dQ_fit = dQ_fit[i]

		# return the mean and standard deviation of the model parameters
		return OrderedDict( (p,(numpy.mean(param_values[p]),numpy.std(param_values[p]))) for p in params )

	def _apply_bounds(self):
		ret = 0
		for k,v in self.sim.get_model_params().items():
			if self.bounds[k][0] != None and v < self.bounds[k][0]:
				self.sim.set_model_param(k, self.bounds[k][0])
				ret += numpy.fabs((self.bounds[k][0] - v) / self.bounds[k][0])
				if self.verbose:
					print("itc_fit: Boundary violation for \"%s\" (%f<%f)"%(k,v,self.bounds[k][0]))

			elif self.bounds[k][1] != None and v > self.bounds[k][1]:
				self.sim.set_model_param(k, self.bounds[k][1])
				ret += fabs((v - self.bounds[k][1]) / self.bounds[k][1])			
				if self.verbose:
					print("itc_fit: Boundary violation for \"%s\" (%f>%f)"%(k,v,self.bounds[k][1]))

		return ret

	def _fitter(self, func, x0, callback=None):
		# wrapper function for a variety of optimization algorithms
		# note that some of these aren't fully integrated yet
		if self.method == 'simplex':
			ret = scipy.optimize.fmin(
				func=func,
				x0=x0,
				args=(self.sim,),
				disp=self.verbose,
				full_output=True,
				callback=callback,
				**self.method_args)
		elif self.method == 'powell':
			ret = scipy.optimize.fmin_powell(
				func=func,
				x0=x0,
				args=(self.sim,),
				disp=self.verbose,
				full_output=True,
				callback=callback,
				**self.method_args)
		elif self.method == 'tnc':
			opt = scipy.optimize.fmin_tnc(
				func=func,
				x0=x0,
				args=(self.sim,),
				approx_grad=True,
				disp=self.verbose,
				**self.method_args)[0]
			ret = opt,func(opt,self.sim)
		elif self.method == 'bfgs':
			ret = scipy.optimize.fmin_l_bfgs_b(
				func=func,
				x0=x0,
				args=(self.sim,),
				approx_grad=True,
				disp=self.verbose,
				**self.method_args)
		else:
			raise Exception('Unrecognized fitting algorithm')

		return ret