# -*- coding: utf-8 -*-
# Copyright 2018 Marco Galardini and John Lees
'''Elastic net model implementations'''
import os
import sys
from .utils import set_env
# avoid numpy taking up more than one thread
with set_env(MKL_NUM_THREADS='1',
NUMEXPR_NUM_THREADS='1',
OMP_NUM_THREADS='1'):
import numpy as np
from scipy.sparse import csr_matrix, csc_matrix, hstack
import math
import pandas as pd
from decimal import Decimal
from tqdm import tqdm
from sklearn.metrics import r2_score, confusion_matrix
import glmnet_python
from cvglmnet import cvglmnet
from cvglmnetCoef import cvglmnetCoef
from cvglmnetPredict import cvglmnetPredict
import pyseer.classes as var_obj
from .input import read_variant
from .model import pre_filtering
from .model import fit_lineage_effect
from .model import fixed_effects_regression
# Loads all variants into memory for use with elastic net
def load_all_vars(var_type, p, burden, burden_regions, infile,
                  all_strains, sample_order, min_af, max_af,
                  max_missing, uncompressed):
    """Load every variant in the input file into a sparse
    matrix representation

    Args:
        var_type (str)
            Variants type (one of: kmers, vcf or Rtab)
        p (pandas.DataFrame)
            Phenotype vector (n, 1)
        burden (bool)
            Whether to slice a vcf file by burden regions
        burden_regions (collections.deque)
            Burden regions to slice the vcf with
        infile (opened file)
            Handle to opened variant file
        all_strains (set-like)
            All sample labels that should be present
        sample_order
            Sample order to interpret each Rtab line
        min_af (float)
            Minimum allele frequency (exclusive)
        max_af (float)
            Maximum allele frequency (exclusive)
        max_missing (float)
            Maximum missing call frequency (exclusive)
        uncompressed (bool)
            Whether the kmers file is uncompressed

    Returns:
        variants (scipy.sparse.csr_matrix)
            A sparse matrix representation of all variants in
            the input
        selected_vars (list)
            0-indices of variants in the input file in variants
            (which passed AF filtering)
        var_idx (int)
            The number of read variants (number of rows of variants)
    """
    # CSR components, built up one row per retained variant
    entries = []
    col_indices = []
    row_ptr = [0]

    selected_vars = []
    var_idx = 0
    progress = tqdm(unit="variants")
    while True:
        (eof, k, var_name, kstrains, nkstrains,
         af, missing) = read_variant(infile, p, var_type,
                                     burden, burden_regions,
                                     uncompressed, all_strains,
                                     sample_order)

        # stop at end of input
        if eof:
            progress.close()
            break
        progress.update(1)

        if (k is not None and min_af < af < max_af
                and missing < max_missing):
            # Encode the minor allele as the stored (non-zero) entry,
            # which makes the most efficient use of the sparse structure
            minor_allele = 0 if af > 0.5 else 1
            for sample_idx, call in enumerate(k):
                if call == minor_allele:
                    col_indices.append(sample_idx)
                    entries.append(1)
            row_ptr.append(len(col_indices))
            selected_vars.append(var_idx)
        var_idx += 1

    # construct sparse matrix
    if len(selected_vars) == 0:
        raise ValueError("No variants passed filters")
    variants = csr_matrix((entries, col_indices, row_ptr), dtype=float,
                          shape=(len(selected_vars), len(all_strains)))

    return (variants, selected_vars, var_idx)
def fit_enet(p, variants, covariates, weights, continuous, alpha,
             lineage_dict=None, fold_ids=None, n_folds=10, n_cpus=1):
    """Fit an elastic net model to a set of variants. Prints
    information about model fit and prediction quality to STDERR

    Args:
        p (pandas.DataFrame)
            Phenotype vector (n, 1)
        variants (scipy.sparse.csc_matrix)
            Wide sparse matrix representation of all variants to fit to
            (rows = samples, columns = variants)
        covariates (pandas.DataFrame)
            Covariate matrix (n, j)
        weights (np.array)
            Vector of sample weights (n, 1)
        continuous (bool)
            If True fit a Gaussian error model, otherwise Binomial error
        alpha (float)
            Between 0-1, sets the mix between ridge regression and lasso
            regression
        lineage_dict (list)
            Names of lineages, indices corresponding to fold_ids
            [default = None]
        fold_ids (list)
            Index of fold assignment for cross-validation, from 0 to 1-n_folds
            [default = None]
        n_folds (int)
            Number of folds in cross-validation
            [default = 10]
        n_cpus (int)
            Number of processes to use in cross-validation
            Set to -1 to use all available
            [default = 1]

    Returns:
        betas (numpy.array)
            The fitted betas (slopes) for each variant
    """
    regression_type = 'gaussian' if continuous else 'binomial'

    # Prepend covariate columns so they are fitted alongside the variants
    if covariates.shape[0] > 0:
        variants = hstack([csc_matrix(covariates.values), variants])

    # Run model fit; folds are either random (nfolds) or user supplied (foldid)
    cv_args = dict(x=variants, y=p.values.astype('float64'),
                   family=regression_type, alpha=alpha,
                   parallel=n_cpus, weights=weights)
    if fold_ids is None:
        enet_fit = cvglmnet(nfolds=n_folds, **cv_args)
    else:
        enet_fit = cvglmnet(foldid=fold_ids, **cv_args)

    # Extract coefficients at the best lambda and predict class labels/values
    betas = cvglmnetCoef(enet_fit, s='lambda_min')
    best_lambda_idx = np.argmin(enet_fit['cvm'])
    predictions, R2 = enet_predict(enet_fit, variants, continuous, p.values)

    # Summary statistics to stderr
    # R^2 = 1 - sum((yi_obs - yi_predicted)^2) / sum((yi_obs - yi_mean)^2)
    sys.stderr.write("Best penalty (lambda) from cross-validation: " +
                     '%.2E' % Decimal(enet_fit['lambda_min'][0]) + "\n")
    if not continuous:
        sys.stderr.write("Best model deviance from cross-validation: " +
                         '%.3f' % Decimal(enet_fit['cvm'][best_lambda_idx]) +
                         " ± " + '%.2E' % Decimal(enet_fit['cvsd'][best_lambda_idx]) + "\n")
    sys.stderr.write("Best R^2 from cross-validation: " + '%.3f' % Decimal(R2) + "\n")

    # Report R2 for each fold (strain/clade)
    if fold_ids is not None:
        sys.stderr.write("Predictions within each lineage\n")
        write_lineage_predictions(p.values, predictions, fold_ids,
                                  lineage_dict, continuous)

    return betas.reshape(-1,)
def enet_predict(enet_fit, variants, continuous, responses=None):
    """Use a fitted elastic net model to make predictions about
    new observations. Returns accuracy if true responses known

    Args:
        enet_fit (cvglmnet)
            An elastic net model fitted using cvglmnet or similar
        variants (scipy.sparse.csc_matrix)
            Wide sparse matrix representation of all variants to predict with
            (rows = samples, columns = variants)
        continuous (bool)
            True if a continuous phenotype, False if a binary phenotype
        responses (np.array)
            True phenotypes to calculate R^2 with
            [default = None]

    Returns:
        preds (numpy.array)
            Predicted phenotype for each input sample in variants
        R2 (float)
            Variance explained by model (or None if true labels not
            provided).
    """
    # Predict at the best (lambda_min) penalty; link scale for continuous
    # phenotypes, class labels for binary ones
    if continuous:
        preds = np.array(cvglmnetPredict(enet_fit, newx=variants,
                                         s='lambda_min', ptype='link'))
    else:
        preds = cvglmnetPredict(enet_fit, newx=variants,
                                s='lambda_min', ptype='class')

    # R^2 = 1 - sum((yi_obs - yi_predicted)^2) / sum((yi_obs - yi_mean)^2)
    R2 = None
    if responses is not None and responses.shape[0] == variants.shape[0]:
        ss_tot = np.sum(np.square(responses - np.mean(responses)))
        ss_err = np.sum(np.square(responses.reshape(-1, 1) - preds))
        # R^2 is undefined for a constant response (ss_tot == 0)
        if ss_tot != 0:
            R2 = 1 - (ss_err / ss_tot)

    return (preds, R2)
def write_lineage_predictions(true_values, predictions, fold_ids,
                              lineage_dict, continuous, stderr_print=True):
    """Writes prediction ability stratified by lineage to stderr

    Args:
        true_values (np.array)
            Observed values of phenotype
        predictions (np.array)
            Predicted phenotype values
        fold_ids (list)
            Index of fold assignment for cross-validation, from 0 to 1-n_folds
            (or None to treat all samples as a single fold)
        lineage_dict (list)
            Names of lineages, indices corresponding to fold_ids
        continuous (bool)
            True if a continuous phenotype, False if a binary phenotype
        stderr_print (bool)
            Print output to stderr
            [default = True]

    Returns:
        R2_vals (list)
            R2 values for each fold
        confusion (list)
            Tuple of tn, fp, fn, tp for each fold
    """
    if stderr_print:
        sys.stderr.write("\t".join(['Lineage', 'Size', 'R2']))
        if not continuous:
            sys.stderr.write("\t" + "\t".join(['TP', 'TN', 'FP', 'FN']))
        sys.stderr.write("\n")

    # Bug fix: was `np.any(fold_ids) == None`, which compares a bool to None
    # and is always False, so passing fold_ids=None crashed at max() below.
    if fold_ids is None:
        fold_ids = np.zeros(true_values.shape[0], dtype=np.int8)

    R2_vals = []
    confusion = []
    for fold in range(max(fold_ids) + 1):
        samples_in_fold = np.where(fold_ids == fold)[0]
        y_true = true_values[samples_in_fold]
        y_pred = predictions[samples_in_fold].reshape(-1, )
        # R^2 is undefined when the fold's observations are constant
        if np.all(y_true == y_true[0]):
            fold_R2 = np.nan
        else:
            fold_R2 = r2_score(y_true, y_pred)
        R2_vals.append(fold_R2)
        if stderr_print:
            sys.stderr.write("\t".join([lineage_dict[fold],
                                        str(samples_in_fold.shape[0]),
                                        '%.3f' % Decimal(fold_R2)]))
        if not continuous:
            # confusion_matrix() returns a 1x1 matrix when only one class is
            # present in both y_true and y_pred, so handle those cases manually
            if np.all(y_true == y_pred) and np.all(y_true == 1):
                tn, fp, fn, tp = (0, 0, 0, y_true.shape[0])
            elif np.all(y_true == y_pred) and np.all(y_true == 0):
                tn, fp, fn, tp = (y_true.shape[0], 0, 0, 0)
            else:
                tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
            confusion.append((tn, fp, fn, tp))
            if stderr_print:
                sys.stderr.write("\t" + "\t".join([str(x) for x in [tp, tn, fp, fn]]))
        if stderr_print:
            sys.stderr.write("\n")

    return (R2_vals, confusion)
def correlation_filter(p, all_vars, quantile_filter=0.25):
    """Calculates correlations between phenotype and variants,
    giving those that are above the specified quantile

    Args:
        p (pandas.DataFrame)
            Phenotype vector (n, 1)
        all_vars (scipy.sparse.csr_matrix)
            Narrow sparse matrix representation of all variants to fit to
            (rows = variants, columns = samples)
        quantile_filter (float)
            The quantile to discard at e.g. 0.25, retain top 75%
            [default = 0.25]

    Returns:
        cor_filter (numpy.array)
            The indices of variants passing the filter
    """
    # a = snp - mean(snp)
    # b = y - mean(y)
    # cor = abs(a%*%b / sqrt(sum(a^2)*sum(b^2)) )
    b = p.values - np.mean(p.values)
    sum_b_squared = np.sum(np.power(b, 2))

    # NOTE: I couldn't get this to multithread efficiently using sparse matrices...
    # might work if the matrix was divided into chunks of rows first, but maybe not
    # worth it as it's pretty quick anyway
    correlations = []
    for row_idx in tqdm(range(all_vars.shape[0]), unit="variants"):
        k = all_vars.getrow(row_idx)
        k_mean = csr_matrix.mean(k)
        if k_mean == 0:
            # avoid crashes due to an empty sparse vector; NaN rows are
            # excluded from both the threshold and the filter below
            correlations.append([np.nan])
        else:
            # expand the centred products so the sparse vector is never densified
            ab = k.dot(b) - np.sum(k_mean * b)
            sum_a_squared = (k.dot(k.transpose()).data[0] -
                             2 * k_mean * csr_matrix.sum(k) +
                             pow(k_mean, 2) * all_vars.shape[1])
            cor = np.abs(ab / np.sqrt(sum_a_squared * sum_b_squared))
            correlations.append(cor)

    # Bug fix: np.percentile returns NaN if any correlation is NaN, which made
    # the threshold comparison False everywhere and filtered out every variant.
    # np.nanpercentile ignores the NaN placeholders and is identical otherwise.
    cor_filter = np.nonzero(correlations >
                            np.nanpercentile(correlations, quantile_filter*100))[0]

    return cor_filter
def find_enet_selected(enet_betas, var_indices, p, c, var_type, fit_seer, burden,
                       burden_regions, infile, all_strains, sample_order,
                       continuous, find_lineage, lin, uncompressed):
    """Read through the variant input file again, yielding just those variants
    which had a non-zero slope for printing

    Args:
        enet_betas (numpy.array)
            Fitted slopes of intercept, covariants and variants
            from elastic net
        var_indices (list)
            The 0-indexed locations (in the original file) of
            variants represented in enet_betas
        p (pandas.DataFrame)
            Phenotype vector (n, 1)
        c (numpy.array)
            Covariate matrix (n, j)
        var_type (str)
            Variants type (one of: kmers, vcf or Rtab)
        fit_seer (tuple: m, null_model, null_firth)
            Distance projection and null models required to
            fit fixed effect regression (or None to skip)
        burden (bool)
            Whether to slice a vcf file by burden regions
        burden_regions (collections.deque)
            Burden regions to slice the vcf with
        infile (opened file)
            Handle to opened variant file
        all_strains (set-like)
            All sample labels that should be present
        sample_order
            Sample order to interpret each Rtab line
        continuous (bool)
            Is phenotype/fit continuous?
        find_lineage (bool)
            Whether to fit lineages or not
        lin (numpy.array)
            Lineages matrix (n, k)
        uncompressed (bool)
            Whether the kmers file is uncompressed

    Returns:
        variant (var_obj.Enet)
            Iterable of processed variants for printing
    """
    # skip intercept and covariates
    enet_betas = enet_betas[c.shape[1]+1:]

    current_var = 0
    for beta, var_idx in zip(enet_betas, var_indices):
        # Only need to process selected variants
        if beta == 0:
            continue

        # Fast-forward (without parsing) to the selected variant's position
        while current_var < var_idx:
            eof, k, var_name, kstrains, nkstrains, af, missing = read_variant(
                                            infile, p, var_type,
                                            burden, burden_regions,
                                            uncompressed, all_strains,
                                            sample_order, noparse=True)
            current_var += 1
        eof, k, var_name, kstrains, nkstrains, af, missing = read_variant(
                                            infile, p, var_type,
                                            burden, burden_regions,
                                            uncompressed, all_strains,
                                            sample_order)
        current_var += 1

        notes = []
        # find pvalues and lineages with a full fixed-effects (SEER) model
        if fit_seer is not None:
            m, null_res, null_firth = fit_seer
            seer_fit = fixed_effects_regression(var_name, p.values, k, m, c, af,
                                                None, find_lineage, lin,
                                                1, 1, null_res, null_firth,
                                                kstrains, nkstrains, continuous)
            pval = seer_fit.prep
            adj_pval = seer_fit.pvalue
            max_lineage = seer_fit.max_lineage
            notes = seer_fit.notes
        # find just unadjusted p-value and lineage
        else:
            (pval, bad) = pre_filtering(p.values, k, continuous)
            adj_pval = math.nan
            if bad:
                notes.append("bad-chisq")
            if find_lineage:
                max_lineage = fit_lineage_effect(lin, c, k)
            else:
                max_lineage = None

        yield var_obj.Enet(var_name, af, pval, adj_pval, beta, max_lineage,
                           kstrains, nkstrains, notes)