Source code for pyseer.enet

# -*- coding: utf-8 -*-
# Copyright 2018 Marco Galardini and John Lees

'''Elastic net model implementations'''

import os
import sys
from .utils import set_env
# avoid numpy taking up more than one thread
with set_env(MKL_NUM_THREADS='1',
             NUMEXPR_NUM_THREADS='1',
             OMP_NUM_THREADS='1'):
    import numpy as np
from scipy.sparse import csr_matrix, csc_matrix, hstack
import math
import pandas as pd
from decimal import Decimal
from tqdm import tqdm
from sklearn.metrics import r2_score, confusion_matrix

import glmnet_python
from cvglmnet import cvglmnet
from cvglmnetCoef import cvglmnetCoef
from cvglmnetPredict import cvglmnetPredict

import pyseer.classes as var_obj
from .input import read_variant
from .model import pre_filtering
from .model import fit_lineage_effect
from .model import fixed_effects_regression

# Loads all variants into memory for use with elastic net
[docs] def load_all_vars(var_type, p, burden, burden_regions, infile, all_strains, sample_order, min_af, max_af, max_missing, uncompressed): """Load all variants in the input file into a sparse matrix representation Args: var_type (str) Variants type (one of: kmers, vcf or Rtab) p (pandas.DataFrame) Phenotype vector (n, 1) burden (bool) Whether to slice a vcf file by burden regions burden_regions (collections.deque) Burden regions to slice the vcf with infile (opened file) Handle to opened variant file all_strains (set-like) All sample labels that should be present sample_order Sampes order to interpret each Rtab line min_af (float) Minimum allele frequency (inclusive) max_af (float) maximum allele frequency (inclusive) max_missing (float) maximum missing frequency uncompressed (bool) Whether the kmers file is uncompressed Returns: variants (scipy.sparse.csr_matrix) A sparse matrix representation of all variants in the input selected_vars (list) 0-Indices of variants in the input file in variants (which passed AF filtering) var_idx (int) The number of read variants (number of rows of variants) """ # For building sparse matrix data = [] indices = [] indptr = [0] selected_vars = [] var_idx = 0 pbar = tqdm(unit="variants") while True: eof, k, var_name, kstrains, nkstrains, af, missing = read_variant( infile, p, var_type, burden, burden_regions, uncompressed, all_strains, sample_order) # check for EOF if eof: pbar.close() break else: pbar.update(1) if k is not None and af > min_af and af < max_af and missing < max_missing: # Minor allele encoding - most efficient use of sparse structure if af > 0.5: pres = 0 else: pres = 1 for idx, obs in enumerate(k): if obs == pres: indices.append(idx) data.append(1) indptr.append(len(indices)) selected_vars.append(var_idx) var_idx += 1 # construct sparse matrix if (len(selected_vars)) == 0: raise ValueError("No variants passed filters") variants = csr_matrix((data, indices, indptr), dtype=float, shape=(len(selected_vars), len(all_strains))) return(variants, selected_vars, var_idx)
[docs] def fit_enet(p, variants, covariates, weights, continuous, alpha, lineage_dict = None, fold_ids = None, n_folds = 10, n_cpus = 1): """Fit an elastic net model to a set of variants. Prints information about model fit and prediction quality to STDERR Args: p (pandas.DataFrame) Phenotype vector (n, 1) variants (scipy.sparse.csc_matrix) Wide sparse matrix representation of all variants to fit to (rows = samples, columns = variants) covariates (pandas.DataFrame) Covariate matrix (n, j) weights (np.array) Vector of sample weights (n, 1) continuous (bool) If True fit a Gaussian error model, otherwise Bionomial error alpha (float) Between 0-1, sets the mix between ridge regression and lasso regression lineage_dict (list) Names of lineages, indices corrsponding to fold_ids [default = None] fold_ids (list) Index of fold assignment for cross-validation, from 0 to 1-n_folds [default = None] n_folds (int) Number of folds in cross-validation [default = 10] n_cpus (int) Number of processes to use in cross-validation Set to -1 to use all available [default = 1] Returns: betas (numpy.array) The fitted betas (slopes) for each variant """ if continuous: regression_type = 'gaussian' else: regression_type = 'binomial' if covariates.shape[0] > 0: variants = hstack([csc_matrix(covariates.values), variants]) # Run model fit if fold_ids is None: enet_fit = cvglmnet(x = variants, y = p.values.astype('float64'), family = regression_type, nfolds = n_folds, alpha = alpha, parallel = n_cpus, weights = weights) else: enet_fit = cvglmnet(x = variants, y = p.values.astype('float64'), family = regression_type, foldid = fold_ids, alpha = alpha, parallel = n_cpus, weights = weights) # Extract best lambda and predict class labels/values betas = cvglmnetCoef(enet_fit, s = 'lambda_min') best_lambda_idx = np.argmin(enet_fit['cvm']) predictions, R2 = enet_predict(enet_fit, variants, continuous, p.values) # Write some summary stats # R^2 = 1 - sum((yi_obs - yi_predicted)^2) /sum((yi_obs - yi_mean)^2) sys.stderr.write("Best penalty (lambda) from cross-validation: " + '%.2E' % Decimal(enet_fit['lambda_min'][0]) + "\n") if not continuous: sys.stderr.write("Best model deviance from cross-validation: " + '%.3f' % Decimal(enet_fit['cvm'][best_lambda_idx]) + " ± " + '%.2E' % Decimal(enet_fit['cvsd'][best_lambda_idx]) + "\n") sys.stderr.write("Best R^2 from cross-validation: " + '%.3f' % Decimal(R2) + "\n") # Report R2 for each fold (strain/clade) if fold_ids is not None: sys.stderr.write("Predictions within each lineage\n") write_lineage_predictions(p.values, predictions, fold_ids, lineage_dict, continuous) return(betas.reshape(-1,))
[docs] def enet_predict(enet_fit, variants, continuous, responses = None): """Use a fitted elastic net model to make predictions about new observations. Returns accuracy if true responses known Args: enet_fit (cvglmnet) An elastic net model fitted using cvglmnet or similar variants (scipy.sparse.csc_matrix) Wide sparse matrix representation of all variants to predict with (rows = samples, columns = variants) continuous (bool) True if a continuous phenotype, False if a binary phenotype responses (np.array) True phenotypes to calculate R^2 with [default = None] Returns: preds (numpy.array) Predicted phenotype for each input sample in variants R2 (float) Variance explained by model (or None if true labels not provided). """ # Extract best lambda and predict class labels/values if continuous: preds = np.array(cvglmnetPredict(enet_fit, newx=variants, s='lambda_min', ptype='link')) else: preds = cvglmnetPredict(enet_fit, newx=variants, s='lambda_min', ptype='class') # R^2 = 1 - sum((yi_obs - yi_predicted)^2) /sum((yi_obs - yi_mean)^2) if responses is not None and responses.shape[0] == variants.shape[0]: SStot = np.sum(np.square(responses - np.mean(responses))) SSerr = np.sum(np.square(responses.reshape(-1, 1) - preds)) if SStot != 0: R2 = 1 - (SSerr/SStot) else: # Not defined for constant response R2 = None else: R2 = None return(preds, R2)
[docs] def write_lineage_predictions(true_values, predictions, fold_ids, lineage_dict, continuous, stderr_print=True): """Writes prediction ability stratified by lineage to stderr Args: true_values (np.array) Observed values of phenotype predictions (np.array) Predicted phenotype values lineage_dict (list) Names of lineages, indices corrsponding to fold_ids fold_ids (list) Index of fold assignment for cross-validation, from 0 to 1-n_folds continuous (bool) True if a continuous phenotype, False if a binary phenotype stderr_print (bool) Print output to stderr [default = True] Returns: R2_vals (list) R2 values for each fold confusion (list) Tuple of tn, fp, fn, tp for each fold """ if stderr_print: sys.stderr.write("\t".join(['Lineage', 'Size', 'R2'])) if not continuous: sys.stderr.write("\t" + "\t".join(['TP', 'TN', 'FP', 'FN'])) sys.stderr.write("\n") if np.any(fold_ids) == None: fold_ids = np.zeros(true_values.shape[0], dtype=np.int8) R2_vals = [] confusion = [] for fold in range(max(fold_ids) + 1): samples_in_fold = np.where(fold_ids == fold)[0] y_true = true_values[samples_in_fold] y_pred = predictions[samples_in_fold].reshape(-1, ) if np.all(y_true == y_true[0]): fold_R2 = np.nan else: fold_R2 = r2_score(y_true, y_pred) R2_vals.append(fold_R2) if stderr_print: sys.stderr.write("\t".join([lineage_dict[fold], str(samples_in_fold.shape[0]), '%.3f' % Decimal(fold_R2)])) if not continuous: if np.all(y_true == y_pred) and np.all(y_true == 1): tn, fp, fn, tp = (0, 0, 0, y_true.shape[0]) elif np.all(y_true == y_pred) and np.all(y_true == 0): tn, fp, fn, tp = (y_true.shape[0], 0, 0, 0) else: tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() confusion.append((tn, fp, fn, tp)) if stderr_print: sys.stderr.write("\t" + "\t".join([str(x) for x in [tp, tn, fp, fn]])) if stderr_print: sys.stderr.write("\n") return(R2_vals, confusion)
[docs] def correlation_filter(p, all_vars, quantile_filter = 0.25): """Calculates correlations between phenotype and variants, giving those that are above the specified quantile Args: p (pandas.DataFrame) Phenotype vector (n, 1) all_vars (scipy.sparse.csr_matrix) Narrow sparse matrix representation of all variants to fit to (rows = variants, columns = samples) quantile_filter (float) The quantile to discard at e.g. 0.25, retain top 75% [default = 0.25] Returns: cor_filter (numpy.array) The indices of variants passing the filter """ # a = snp - mean(snp) # b = y - mean(y) # cor = abs(a%*%b / sqrt(sum(a^2)*sum(b^2)) ) b = p.values - np.mean(p.values) sum_b_squared = np.sum(np.power(b, 2)) # NOTE: I couldn't get this to multithread efficiently using sparse matrices... # might work if the matrix was divided into chunks of rows first, but maybe not # worth it as it's pretty quick anyway correlations = [] for row_idx in tqdm(range(all_vars.shape[0]), unit="variants"): k = all_vars.getrow(row_idx) k_mean = csr_matrix.mean(k) if k_mean == 0: # avoid crashes due to an empty sparse vector correlations.append([np.nan]) else: ab = k.dot(b) - np.sum(k_mean * b) sum_a_squared = k.dot(k.transpose()).data[0] - 2*k_mean*csr_matrix.sum(k) + pow(k_mean, 2) * all_vars.shape[1] cor = np.abs(ab / np.sqrt(sum_a_squared * sum_b_squared)) correlations.append(cor) cor_filter = np.nonzero(correlations > np.percentile(correlations, quantile_filter*100))[0] return(cor_filter)
[docs] def find_enet_selected(enet_betas, var_indices, p, c, var_type, fit_seer, burden, burden_regions, infile, all_strains, sample_order, continuous, find_lineage, lin, uncompressed): """Read through the variant input file again, yielding just those variants which had a non-zero slope for printing Args: enet_betas (numpy.array) Fitted slopes of intercept, covariants and variants from elastic net var_indices (list) The 0-indexed locations (in the original file) of variants represented in enet_betas p (pandas.DataFrame) Phenotype vector (n, 1) c (numpy.array) Covariate matrix (n, j) var_type (str) Variants type (one of: kmers, vcf or Rtab) fit_seer (tuple: m, null_model, null_firth) Distance projection and null models required to fit fixed effect regression burden (bool) Whether to slice a vcf file by burden regions burden_regions (collections.deque) Burden regions to slice the vcf with infile (opened file) Handle to opened variant file all_strains (set-like) All sample labels that should be present sample_order Sample order to interpret each Rtab line continuous (bool) Is phenotype/fit continuous? lineage_effects (bool) Whether to fit lineages or not lin (numpy.array) Lineages matrix (n, k) uncompressed (bool) Whether the kmers file is uncompressed Returns: variant (var_obj.Enet) Iterable of processed variants for printing """ # skip intercept and covariates enet_betas = enet_betas[c.shape[1]+1:] current_var = 0 for beta, var_idx in zip(enet_betas, var_indices): # Only need to process selected variants if beta == 0: continue while current_var < var_idx: eof, k, var_name, kstrains, nkstrains, af, missing = read_variant( infile, p, var_type, burden, burden_regions, uncompressed, all_strains, sample_order, noparse=True) current_var += 1 eof, k, var_name, kstrains, nkstrains, af, missing = read_variant( infile, p, var_type, burden, burden_regions, uncompressed, all_strains, sample_order) current_var += 1 notes = [] # find pvalues and lineages if fit_seer != None: m, null_res, null_firth = fit_seer seer_fit = fixed_effects_regression(var_name, p.values, k, m, c, af, None, find_lineage, lin, 1, 1, null_res, null_firth, kstrains, nkstrains, continuous) pval = seer_fit.prep adj_pval = seer_fit.pvalue max_lineage = seer_fit.max_lineage notes = seer_fit.notes # find just unadjusted p-value and lineage else: (pval, bad) = pre_filtering(p.values, k, continuous) adj_pval = math.nan if bad: notes.append("bad-chisq") if find_lineage: max_lineage = fit_lineage_effect(lin, c, k) else: max_lineage = None yield var_obj.Enet(var_name, af, pval, adj_pval, beta, max_lineage, kstrains, nkstrains, notes)