Source code for sibreg

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

class model(object):
    """Define a linear model with within-class correlations.

    Parameters
    ----------
    y : :class:`~numpy:numpy.array`
        1D array of phenotype observations
    X : :class:`~numpy:numpy.array`
        Design matrix for the fixed mean effects.
    labels : :class:`~numpy:numpy.array`
        1D array of sample labels

    Returns
    -------
    model : :class:`sibreg.model`

    """
    def __init__(self, y, X, labels):
        # Get sample size
        self.n = X.shape[0]
        self.X = X
        # Label mapping: count observations per class and record their row indices
        self.label_counts = dict()
        self.label_indices = dict()
        for l in range(0, labels.shape[0]):
            if labels[l] not in self.label_counts:
                self.label_counts[labels[l]] = 1
                self.label_indices[labels[l]] = [l]
            else:
                self.label_counts[labels[l]] += 1
                self.label_indices[labels[l]].append(l)
        # Split the response and design matrix by class
        self.y_lab = dict()
        self.X_lab = dict()
        for label in self.label_indices:
            self.y_lab[label] = y[self.label_indices[label]]
            self.X_lab[label] = X[self.label_indices[label], :]
        self.n_labels = len(self.y_lab)
        # Response
        self.y = y
        self.labels = labels
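    # Note (added for clarity, not in the original source): the mapping above groups the
    # sample by class, so that, e.g., labels = np.array([2, 2, 5]) gives
    # label_counts == {2: 2, 5: 1} and label_indices == {2: [0, 1], 5: [2]}, with
    # y_lab[k] and X_lab[k] holding the rows of y and X belonging to class k.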
    # Compute MLE of alpha given variance parameters
    def alpha_mle(self, tau, sigma2=np.nan, compute_cov=False):
        """
        Compute the MLE of alpha given the variance parameters

        Parameters
        ----------
        tau : :class:`float`
            ratio of variance of model residuals to variance explained by mean differences between classes
        sigma2 : :class:`float`
            variance of model residuals; only needed if compute_cov is True
        compute_cov : :class:`bool`
            whether to also return the sampling covariance matrix of the estimate

        Returns
        -------
        alpha : :class:`~numpy:numpy.array`
            MLE of alpha, or [alpha, alpha_cov] if compute_cov is True

        """
        X_T_X = np.dot(self.X.T, self.X)
        X_T_y = np.dot(self.X.T, self.y).reshape((self.X.shape[1], 1))
        for label in self.y_lab:
            X_sum = np.sum(self.X_lab[label], axis=0).reshape((1, self.X.shape[1]))
            y_sum = np.sum(self.y_lab[label], axis=0)
            X_T_X = X_T_X - np.dot(X_sum.T, X_sum) / (tau + self.label_counts[label])
            X_T_y = X_T_y - np.dot(X_sum.T, y_sum) / (tau + self.label_counts[label])
        alpha = np.linalg.solve(X_T_X, X_T_y)
        alpha = alpha.reshape((alpha.shape[0],))

        if compute_cov:
            alpha_cov = sigma2 * np.linalg.inv(X_T_X)
            return [alpha, alpha_cov]
        else:
            return alpha
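    # Note (added for clarity, not in the original source): with covariance
    # sigma2*(I + J/tau) within each class (J the all-ones matrix), the Sherman-Morrison
    # identity gives the per-class inverse (I - J/(tau + n_c))/sigma2, so the GLS normal
    # equations become
    #   [X'X - sum_c s_c s_c'/(tau + n_c)] alpha = X'y - sum_c s_c t_c/(tau + n_c),
    # where s_c is the within-class column sum of X and t_c the within-class sum of y.
    # alpha_mle solves exactly this system; sigma2 cancels from the solve and is only
    # used for the sampling covariance when compute_cov is True.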
    # Compute the loss (-2 times the log-likelihood) and its gradient given the variance parameters
    def likelihood_and_gradient(self, sigma2, tau):
        """
        Compute the loss function, which is -2 times the log-likelihood, along with its gradient

        Parameters
        ----------
        sigma2 : :class:`float`
            variance of model residuals
        tau : :class:`float`
            ratio of variance of model residuals to variance explained by mean differences between classes

        Returns
        -------
        L, grad : :class:`float`
            loss function and gradient, divided by sample size

        """
        ## Likelihood
        alpha = self.alpha_mle(tau)
        resid = self.y - self.X.dot(alpha)
        RSS = np.sum(np.square(resid))

        L = self.n * np.log(sigma2) + RSS / sigma2

        ## Gradient with respect to sigma2
        grad_sigma2 = self.n / sigma2 - RSS / np.square(sigma2)

        ## Gradient with respect to tau
        grad_tau = 0

        for label in self.y_lab:
            resid_label = resid[self.label_indices[label]]
            resid_sum = np.sum(resid_label)
            resid_square_sum = np.square(resid_sum)
            # Add to likelihood
            L = L - resid_square_sum / (sigma2 * (tau + self.label_counts[label])) + np.log(1 + self.label_counts[label] / tau)
            # Add to grad sigma2
            grad_sigma2 += resid_square_sum / (np.square(sigma2) * (tau + self.label_counts[label]))
            # Add to grad tau
            grad_tau += (resid_square_sum / sigma2 - self.label_counts[label] * (1 + self.label_counts[label] / tau)) / np.square(tau + self.label_counts[label])

        # Overall gradient vector
        grad = np.hstack((grad_sigma2, grad_tau))

        return L / self.n, grad / self.n
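    # Note (added for clarity, not in the original source): writing r for the residual
    # vector at the profiled alpha, r_c for the sum of residuals in class c and n_c for
    # the class size, the quantity above is
    #   L = n*log(sigma2) + RSS/sigma2 - sum_c r_c^2/(sigma2*(tau + n_c)) + sum_c log(1 + n_c/tau),
    # which equals log|V| + r'V^{-1}r for V block-diagonal with blocks sigma2*(I + J/tau).
    # The method returns L and its gradient in (sigma2, tau), each divided by n.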
    def optimize_model(self, init_params):
        """
        Find the variance parameters that minimise the loss function

        Parameters
        ----------
        init_params : :class:`array`
            initial values for the residual variance (sigma^2_epsilon) followed by the ratio of the
            residual variance to the within-class variance (tau)

        Returns
        -------
        optim : :class:`dict`
            dictionary with keys: 'success', whether optimisation was successful (bool);
            'warnflag', output of the L-BFGS-B algorithm giving warnings; 'sigma2', MLE of the
            residual variance; 'tau', MLE of the ratio of the residual variance to the
            within-class variance; 'likelihood', maximum of the log-likelihood.

        """
        # Parameter boundaries: keep both variance parameters strictly positive
        parbounds = [(0.00001, None), (0.00001, None)]
        # Optimize
        optimized = fmin_l_bfgs_b(func=lik_and_grad, x0=init_params,
                                  args=(self.y, self.X, self.labels), bounds=parbounds)

        # Get MLE
        optim = {}
        optim['success'] = True
        optim['warnflag'] = optimized[2]['warnflag']
        if optim['warnflag'] != 0:
            print('Optimization unsuccessful.')
            optim['success'] = False
        optim['sigma2'] = optimized[0][0]
        optim['tau'] = optimized[0][1]
        # Convert the minimised loss back to the maximum of the log-likelihood
        optim['likelihood'] = -0.5 * np.float64(self.n) * (optimized[1] + np.log(2 * np.pi))

        return optim
    def predict(self, X):
        """
        Predict new observations based on the model regression coefficients

        Parameters
        ----------
        X : :class:`array`
            matrix of covariates to predict from

        Returns
        -------
        y : :class:`array`
            predicted values

        """
        if hasattr(self, 'alpha'):
            return X.dot(self.alpha)
        else:
            raise AttributeError('Model does not have known regression coefficients. Try optimizing the model first.')
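    # Note (added, not in the original source): optimize_model estimates the variance
    # parameters only, so the regression coefficients must be supplied via set_alpha
    # (e.g. with the output of alpha_mle) before predict can be used.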
    # Set the regression coefficients used by predict
    def set_alpha(self, alpha):
        self.alpha = alpha
def lik_and_grad(pars, *args):
    # Wrapper for function to pass to L-BFGS-B
    y, X, labels = args
    mod = model(y, X, labels)
    return mod.likelihood_and_gradient(pars[0], pars[1])
def simulate(n, alpha, sigma2, tau):
    """Simulate from a linear model with observations correlated within-class.
    The mean for each class is drawn from a normal distribution.

    Parameters
    ----------
    n : :class:`int`
        sample size
    alpha : :class:`~numpy:numpy.array`
        value of regression coefficients
    sigma2 : :class:`float`
        variance of residuals
    tau : :class:`float`
        ratio of variance of residuals to variance of distribution of between-individual means

    Returns
    -------
    model : :class:`sibreg.model`
        linear model with repeated observations

    """
    c = alpha.shape[0]
    # Covariates
    X = np.random.randn((n * c)).reshape((n, c))
    # Class labels
    labels = np.random.choice(n, n)
    # One random effect per class, with variance sigma2/tau
    random_effects = np.sqrt(sigma2 / tau) * np.random.randn(n)
    y = X.dot(alpha) + random_effects[labels - 1] + np.random.randn(n) * np.sqrt(sigma2)
    return model(y, X, labels)
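As a usage illustration only (not part of the sibreg source), the sketch below assumes the definitions above are in scope; the sample size, coefficient values and starting values are arbitrary. It simulates a dataset, fits the variance parameters by maximum likelihood, recovers the fixed effects, and predicts in-sample.

import numpy as np

# Simulate a dataset with two covariates (illustrative values)
sim = simulate(n=1000, alpha=np.array([0.5, -0.2]), sigma2=1.0, tau=2.0)
# MLE of the residual variance (sigma2) and of tau
optim = sim.optimize_model(np.array([1.0, 1.0]))
# GLS estimate of the fixed effects and its sampling covariance
alpha_hat, alpha_cov = sim.alpha_mle(optim['tau'], optim['sigma2'], compute_cov=True)
# Store the coefficients and predict in-sample
sim.set_alpha(alpha_hat)
y_pred = sim.predict(sim.X)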