# Source code for pyseer.model

# Copyright 2017 Marco Galardini and John Lees

'''Original SEER model (fixed effects) implementations'''

import os
import sys
from .utils import set_env
# avoid numpy taking up more than one thread
with set_env(MKL_NUM_THREADS='1',
             NUMEXPR_NUM_THREADS='1',
             OMP_NUM_THREADS='1'):
    import numpy as np
import math
import statsmodels
import pandas as pd
from scipy import stats
import statsmodels.formula.api as smf
# handle different versions of statsmodels
try:
    smf.OLS
except AttributeError:
    smf.OLS = statsmodels.regression.linear_model.OLS
try:
    smf.Logit
except AttributeError:
    smf.Logit = statsmodels.discrete.discrete_model.Logit

import pyseer.classes as var_obj


def pre_filtering(p, k, continuous):
    """Calculate a naive p-value which is not adjusted for population
    structure: a Welch t-test for continuous phenotypes, or a chi-squared
    contingency-table test for binary ones

    Args:
        p (numpy.array)
            Phenotypes vector (n, 1)
        k (numpy.array)
            Variant presence-absence vector (n, 1)
        continuous (bool)
            Whether phenotypes are continuous or binary

    Returns:
        prep (float)
            Naive p-value
        bad_chisq (boolean)
            Whether the chisq test had small values in the contingency table
    """
    bad_chisq = False

    if continuous:
        # Welch's t-test (unequal variances) between carriers/non-carriers
        carriers = p[k == 1]
        non_carriers = p[k == 0]
        prep = stats.ttest_ind(carriers, non_carriers, equal_var=False)[1]
        return prep, bad_chisq

    # 2x2 contingency table: phenotype status (rows) x variant status (cols)
    pheno = np.asarray(p).reshape(-1)
    var = np.asarray(k).reshape(-1)
    table = np.array(
        [[np.sum((pheno == 1) & (var == 1)),
          np.sum((pheno == 1) & (var == 0))],
         [np.sum((pheno == 0) & (var == 1)),
          np.sum((pheno == 0) & (var == 0))]])

    # flag tables with small counts, where the chi^2 approximation is poor
    # (any cell <= 1, or more than one cell <= 5)
    if (table <= 1).sum() > 0 or (table <= 5).sum() > 1:
        bad_chisq = True

    prep = stats.chi2_contingency(table, correction=False)[1]
    return prep, bad_chisq
def fit_null(p, m, cov, continuous, firth=False):
    """Fit the null model i.e. regression without k-mer
    `y ~ Wa`

    Returns log-likelihood

    Fix: catch ``np.linalg.LinAlgError`` instead of the
    ``np.linalg.linalg.LinAlgError`` alias, which was removed in NumPy 2.0
    (accessing it raised AttributeError while handling the exception); this
    also matches the spelling already used in ``fit_lineage_effect``.

    Args:
        p (numpy.array)
            Phenotypes vector (n, 1)
        m (numpy.array)
            Population structure matrix (n, k)
        cov (pandas.DataFrame)
            Covariants dataframe (n, j)
        continuous (bool)
            Whether phenotypes are continuous or binary
        firth (bool)
            For binary phenotypes whether to use firth regression

    Returns:
        null_res (statsmodels.regression.linear_model.RegressionResultsWrapper
                  or float or None)
            Fitted model or log-likelihood (if firth) or
            None if could not fit
    """
    # design matrix: intercept, then population structure, then covariates
    v = np.ones(p.shape[0]).reshape(-1, 1)
    if m.shape[1] > 0:
        v = np.concatenate((v, m), axis=1)
    if cov.shape[1] > 0:
        v = np.concatenate((v, cov.values), axis=1)
    if continuous:
        null_mod = smf.OLS(p, v)
    else:
        # start the intercept at the empirical log-odds of the phenotype
        start_vec = np.zeros(v.shape[1])
        start_vec[0] = np.log(np.mean(p)/(1-np.mean(p)))
        null_mod = smf.Logit(p, v)
        # statsmodels does not raise exceptions with perfectly separable data
        # but just a warning
        # this may stop working in the future
        # source: https://github.com/statsmodels/statsmodels/blob/e12de82fabd8e3fd71413aae384763c1442c4516/statsmodels/discrete/discrete_model.py#L186
        null_mod.raise_on_perfect_prediction = True
    try:
        if continuous:
            null_res = null_mod.fit(disp=False)
        else:
            if firth:
                # only the penalised log-likelihood of the null is kept
                firth_res = fit_firth(null_mod, start_vec, v, p)
                if firth_res is None:
                    sys.stderr.write('Firth regression did not converge '
                                     'for null model\n')
                    return None
                (intercept, kbeta, beta, bse, fitll) = firth_res
                null_res = fitll
            else:
                try:
                    null_res = null_mod.fit(start_params=start_vec,
                                            method='newton',
                                            disp=False)
                # Null fit with default optimiser may fail, Powell
                # optimizer might work
                except np.linalg.LinAlgError:
                    null_res = null_mod.fit(start_params=start_vec,
                                            method='powell',
                                            disp=False)
    except np.linalg.LinAlgError:
        sys.stderr.write('Matrix inversion error for null model\n')
        return None
    except statsmodels.tools.sm_exceptions.PerfectSeparationError:
        sys.stderr.write('Perfectly separable data '
                         'error for null model\n')
        return None
    except statsmodels.tools.sm_exceptions.MissingDataError:
        sys.stderr.write('Missing data error for null model\n')
        return None
    return null_res
def fit_lineage_effect(lin, c, k):
    """Fits the model `k ~ Wa` using binomial error with logit link.
    W are the lineages (either a projection of samples, or cluster
    indicators) and covariates.
    Returns the index of the most significant lineage

    Args:
        lin (numpy.array)
            Population structure matrix or lineage association
            binary matrix (n, k)
        c (numpy.array)
            Covariants matrix (n, j)
        k (numpy.array)
            Variant presence-absence vector (n, 1)

    Returns:
        max_lineage (int or None)
            Index of the most significant lineage or None if could not fit
    """
    # design matrix: intercept, lineages, and covariates when their row
    # count matches the lineage matrix
    intercept_col = np.ones(lin.shape[0]).reshape(-1, 1)
    if c.shape[0] == lin.shape[0]:
        X = np.concatenate((intercept_col, lin, c), axis=1)
    else:
        X = np.concatenate((intercept_col, lin), axis=1)

    lineage_mod = smf.Logit(k, X)
    # statsmodels does not raise exceptions with perfectly separable data
    # but just a warning
    # this may stop working in the future
    # source: https://github.com/statsmodels/statsmodels/blob/e12de82fabd8e3fd71413aae384763c1442c4516/statsmodels/discrete/discrete_model.py#L186
    lineage_mod.raise_on_perfect_prediction = True
    try:
        lineage_res = lineage_mod.fit(method='newton', disp=False)
        # Wald statistic for every fitted coefficient
        wald_test = np.absolute(lineage_res.params) / lineage_res.bse
        # excluding intercept and covariates
        max_lineage = np.argmax(wald_test[1:lin.shape[1]+1])
    # In case regression fails
    except (statsmodels.tools.sm_exceptions.PerfectSeparationError,
            np.linalg.LinAlgError,
            statsmodels.tools.sm_exceptions.MissingDataError):
        max_lineage = None

    return max_lineage
def fixed_effects_regression(variant, p, k, m, c, af, pattern,
                             lineage_effects, lin,
                             pret, lrtt,
                             null_res, null_firth,
                             kstrains, nkstrains,
                             continuous):
    """Fits the model `y ~ Xb + Wa` using either binomial error with
    logit link (binary traits) or Gaussian error (continuous traits)

    * `y` is the phenotype
    * `X` is the variant presence/absence (fixed effects)
    * `W` are covariate fixed effects, including population structure
    * `a` and `b` are slopes to be fitted

    Fix: catch ``np.linalg.LinAlgError`` instead of the
    ``np.linalg.linalg.LinAlgError`` alias, which was removed in NumPy 2.0
    (accessing it raised AttributeError while handling the exception).

    Args:
        variant (str)
            Variant identifier
        p (numpy.array)
            Phenotype vector (binary or continuous) (n, 1)
        k (numpy.array)
            Variant presence/absence vector (n, 1)
        m (numpy.array)
            Population structure matrix (n, m)
        c (numpy.array)
            Covariants matrix (n, j)
        af (float)
            Allele frequency
        pattern (str)
            Variant hashed pattern
        lineage_effects (bool)
            Whether to fit lineages or not
        lin (numpy.array)
            Lineages matrix (n, k)
        pret (float)
            Pre-filtering p-value threshold
        lrtt (float)
            Post-fitting p-value threshold
        null_res (float or statsmodels.regression.linear_model.RegressionResultsWrapper)
            Null-fit likelihood (binary) or model (continuous)
        null_firth (float)
            Firth regression likelihood
        kstrains (iterable)
            Sample labels with the variant
        nkstrains (iterable)
            Sample labels without the variant
        continuous (bool)
            Whether the phenotype is continuous or not

    Returns:
        result (pyseer.classes.Seer)
            Results container
    """
    notes = set()

    # was this af-filtered?
    if p is None:
        notes.add('af-filter')
        return var_obj.Seer(variant, pattern, af,
                            np.nan, np.nan, np.nan,
                            np.nan, np.nan, np.array([]),
                            None, kstrains, nkstrains,
                            notes, True, False)

    # pre-filtering
    prep, bad_chisq = pre_filtering(p, k, continuous)
    if bad_chisq:
        notes.add('bad-chisq')
    if prep > pret or not np.isfinite(prep):
        notes.add('pre-filtering-failed')
        return var_obj.Seer(variant, pattern, af, prep,
                            np.nan, np.nan,
                            np.nan, np.nan, np.array([]),
                            None, kstrains, nkstrains,
                            notes, True, False)

    # actual regression: build design matrix [1 | k | m | c] depending on
    # which of the distance/covariate matrices match the sample count
    if m.shape[0] != k.shape[0]:
        # no distances
        if c.shape[0] == k.shape[0]:
            v = np.concatenate((np.ones(p.shape[0]).reshape(-1, 1),
                                k.reshape(-1, 1),
                                c),
                               axis=1)
        else:
            v = np.concatenate((np.ones(p.shape[0]).reshape(-1, 1),
                                k.reshape(-1, 1)),
                               axis=1)
    elif c.shape[0] == m.shape[0]:
        # covariates and distances
        v = np.concatenate((np.ones(m.shape[0]).reshape(-1, 1),
                            k.reshape(-1, 1),
                            m, c),
                           axis=1)
    else:
        # no covariates
        v = np.concatenate((np.ones(m.shape[0]).reshape(-1, 1),
                            k.reshape(-1, 1),
                            m),
                           axis=1)
    try:
        if continuous:
            mod = smf.OLS(p, v)
            # statsmodels does not raise exceptions with perfectly separable
            # data but just a warning; this may stop working in the future
            # source: https://github.com/statsmodels/statsmodels/blob/e12de82fabd8e3fd71413aae384763c1442c4516/statsmodels/discrete/discrete_model.py#L186
            mod.raise_on_perfect_prediction = True

            res = mod.fit()
            intercept = res.params[0]
            kbeta = res.params[1]
            beta = res.params[2:]
            bse = res.bse[1]
            lrt_pvalue = res.pvalues[1]
            # lrt_pvalue = res.compare_lr_test(null_res)[1]
        else:
            mod = smf.Logit(p, v)
            # statsmodels does not raise exceptions with perfectly separable
            # data but just a warning; this may stop working in the future
            # source: https://github.com/statsmodels/statsmodels/blob/e12de82fabd8e3fd71413aae384763c1442c4516/statsmodels/discrete/discrete_model.py#L186
            mod.raise_on_perfect_prediction = True

            # start the intercept at the empirical log-odds of the phenotype
            start_vec = np.zeros(v.shape[1])
            start_vec[0] = np.log(np.mean(p)/(1-np.mean(p)))
            if not bad_chisq:
                try:
                    res = mod.fit(start_params=start_vec,
                                  method='newton',
                                  disp=False)
                    if res.bse[1] > 3:
                        # very large standard error: fall back to Firth
                        bad_chisq = True
                        notes.add('high-bse')
                    else:
                        lrstat = -2*(null_res - res.llf)
                        lrt_pvalue = 1
                        if lrstat > 0:  # non-convergence
                            lrt_pvalue = stats.chi2.sf(lrstat, 1)
                        intercept = res.params[0]
                        kbeta = res.params[1]
                        beta = res.params[2:]
                        bse = res.bse[1]
                except statsmodels.tools.sm_exceptions.PerfectSeparationError:
                    bad_chisq = True
                    notes.add('perfectly-separable-data')
                except np.linalg.LinAlgError:
                    # singular matrix error
                    bad_chisq = True
                    notes.add('matrix-inversion-error')

            # Fit Firth regression with large SE, or nearly separable values
            if bad_chisq:
                firth_fit = fit_firth(mod, start_vec, v, p)
                if firth_fit is None:  # Firth failure
                    notes.add('firth-fail')
                    return var_obj.Seer(variant, pattern, af, prep,
                                        np.nan, np.nan,
                                        np.nan, np.nan, np.array([]),
                                        None, kstrains, nkstrains,
                                        notes, False, True)
                else:
                    intercept, kbeta, beta, bse, fitll = firth_fit
                    beta = np.array(beta)
                    lrstat = -2*(null_firth - fitll)
                    lrt_pvalue = 1
                    if lrstat > 0:  # check for non-convergence
                        lrt_pvalue = stats.chi2.sf(lrstat, 1)
    except statsmodels.tools.sm_exceptions.MissingDataError:
        # if missing data or inf
        notes.add('missing-data-error')
        return var_obj.Seer(variant, pattern, af, prep,
                            np.nan, np.nan,
                            np.nan, np.nan, np.array([]),
                            None, kstrains, nkstrains,
                            notes, False, True)

    if lineage_effects:
        max_lineage = fit_lineage_effect(lin, c, k)
    else:
        max_lineage = None

    if lrt_pvalue > lrtt or not np.isfinite(lrt_pvalue) \
            or not np.isfinite(kbeta):
        notes.add('lrt-filtering-failed')
        return var_obj.Seer(variant, pattern, af, prep,
                            lrt_pvalue, kbeta, bse,
                            intercept, beta, max_lineage,
                            kstrains, nkstrains,
                            notes, False, True)

    return var_obj.Seer(variant, pattern, af, prep,
                        lrt_pvalue, kbeta, bse,
                        intercept, beta, max_lineage,
                        kstrains, nkstrains,
                        notes, False, False)
def firth_likelihood(beta, logit):
    """Convenience function to calculate likelihood of Firth regression

    Note: the value returned is the *negative* of the penalised
    log-likelihood

    Args:
        beta (numpy.array)
            (n, 1)
        logit (statsmodels.discrete.discrete_model.Logit)
            Logistic model

    Returns:
        likelihood (float)
            Firth likelihood
    """
    log_likelihood = logit.loglike(beta)
    # penalty term: half the log-determinant of the observed
    # information matrix (negative Hessian)
    penalty = 0.5 * np.log(np.linalg.det(-logit.hessian(beta)))
    return -(log_likelihood + penalty)
def fit_firth(logit_model, start_vec, X, y,
              step_limit=1000, convergence_limit=0.0001):
    """Do firth regression

    Iteratively reweighted fit of a logistic model with Firth's penalised
    score, with step-halving when a step worsens the penalised likelihood.

    Args:
        logit_model (statsmodels.discrete.discrete_model.Logit)
            Logistic model
        start_vec (numpy.array)
            Pre-initialized vector to speed-up convergence (n, 1)
        X (numpy.array)
            (n, m)
        y (numpy.array)
            (n, )
        step_limit (int)
            Maximum number of iterations
        convergence_limit (float)
            Convergence tolerance

    Returns:
        intercept (float)
            Intercept
        kbeta (float)
            Variant beta
        beta (iterable)
            Covariates betas (n-2)
        bse (float)
            Beta std-err
        fitll (float or None)
            Likelihood of fit or None if could not fit
    """
    beta_iterations = []
    beta_iterations.append(start_vec)
    for i in range(0, step_limit):
        # fitted probabilities and diagonal weight matrix pi*(1-pi)
        # at the current estimate
        pi = logit_model.predict(beta_iterations[i])
        W = np.diagflat(np.multiply(pi, 1-pi))
        # pseudo-inverse of the observed information (negative Hessian)
        var_covar_mat = np.linalg.pinv(
                        -logit_model.hessian(beta_iterations[i])
                        )

        # build hat matrix
        rootW = np.sqrt(W)
        H = np.dot(np.transpose(X), np.transpose(rootW))
        H = np.matmul(var_covar_mat, H)
        H = np.matmul(np.dot(rootW, X), H)

        # penalised score
        U = np.matmul(np.transpose(X),
                      y - pi + np.multiply(np.diagonal(H), 0.5 - pi))
        new_beta = beta_iterations[i] + np.matmul(var_covar_mat, U)

        # step halving
        # firth_likelihood returns the *negative* penalised log-likelihood,
        # so a larger value here means a worse fit; halve the step until it
        # no longer worsens, giving up after step_limit halvings
        j = 0
        while firth_likelihood(new_beta, logit_model) > firth_likelihood(
                beta_iterations[i], logit_model
                ):
            new_beta = beta_iterations[i] + 0.5*(new_beta -
                                                 beta_iterations[i])
            j = j + 1
            if (j > step_limit):
                return None

        beta_iterations.append(new_beta)
        # NOTE(review): this convergence test compares the two *previous*
        # iterates ([i] and [i-1]), not the vector just appended — confirm
        # this lag is intended
        if i > 0 and (np.linalg.norm(beta_iterations[i] -
                                     beta_iterations[i-1])
                      < convergence_limit):
            break

    return_fit = None
    # re-check convergence: if the loop exhausted step_limit without
    # converging, fall through and return None
    if np.linalg.norm(beta_iterations[i] -
                      beta_iterations[i-1]) >= convergence_limit:
        pass
    else:
        # Calculate stats
        # sign flip recovers the penalised log-likelihood (see note above)
        fitll = -firth_likelihood(beta_iterations[-1], logit_model)
        intercept = beta_iterations[-1][0]
        if len(beta_iterations[-1]) > 1:
            kbeta = beta_iterations[-1][1]
            # NOTE(review): this takes sqrt of the raw information-matrix
            # entry rather than of the inverse (variance) — looks like it
            # may under/over-state the standard error; confirm against the
            # intended Firth SE formula
            bse = math.sqrt(-logit_model.hessian(beta_iterations[-1])[1, 1])
        else:
            # Encountered when fitting null without any distances/covariates
            kbeta = None
            bse = None
        if len(beta_iterations[-1]) > 2:
            beta = beta_iterations[-1][2:].tolist()
        else:
            beta = None

        return_fit = intercept, kbeta, beta, bse, fitll

    return return_fit