Source code for pyseer.lmm

# Copyright 2017 Marco Galardini and John Lees
# Functions here are based on FaST-LMM Copyright 2014 Microsoft Corporation

'''LMM interface implementations'''

import os
import sys
from .utils import set_env
# avoid numpy taking up more than one thread
with set_env(MKL_NUM_THREADS='1',
             NUMEXPR_NUM_THREADS='1',
             OMP_NUM_THREADS='1'):
    import numpy as np
import math
import pandas as pd
from scipy import stats
import statsmodels.formula.api as smf

import pyseer.classes as var_obj
from .fastlmm.lmm_cov import LMM as lmm_cov

from .model import pre_filtering
from .model import fit_lineage_effect



[docs]
def initialise_lmm(p, cov, K_in, lmm_cache_in=None, lmm_cache_out=None, lineage_samples=None):
    """Initialises LMM using the similarity matrix
    see _internal_single in fastlmm.association.single_snp

    Args:
        p (pandas.Series)
            Phenotypes vector (n, 1)
        cov (pandas.DataFrame)
            Covariance matrix (n, m)
        K_in (str)
            Similarity matrix filename
        lmm_chache_in (str or None)
            Filename for an input LMM cache, None if it has to be computed
        lmm_chache_out (str or None)
            Filename to save the LMM cache, None otherwise.
        lineage_samples (list or None)
            Sample names used for lineage (must match K_in)

    Returns:
        p (pandas.Series)
            Phenotype vector with the samples present in the similarity matrix
        lmm (pyseer.fastlmm.lmm_cov.LMM)
            Initialised LMM model
        h2 (float)
            Trait's variance explained by covariates
    """
    if lmm_cache_in is not None and os.path.exists(lmm_cache_in):
        if len(p.index.intersection(cov.index)) == p.shape[0]:
            covar = np.c_[cov.loc[p.index].values, np.ones((p.shape[0], 1))]
        # empty covariate
        elif (cov.shape[0] == 0 and cov.shape[1] == 0) or len(cov.shape) == 0:
            covar = np.ones((p.shape[0], 1))
        # inform the user about missing covariate/phenotypes and exit
        # special case for LMM
        else:
            sys.stderr.write("Phenotype and covariate file should have "
                             "matching samples for LMM\n")
            sys.exit(1)
        y = np.reshape(p.values, (-1, 1))

        lmm = lmm_cov(X=covar, Y=y, G=None, K=None)
        with np.load(lmm_cache_in) as data:
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
            h2 = data['arr_2'][0]

            if (lmm.U.shape[0] != len(p)):
                sys.stderr.write("Phenotype different length from cache file\n")
                sys.exit(1)
    else:
        # read and normalise K
        K = pd.read_csv(K_in,
                        index_col=0,
                        sep='\t')
        K.index = K.index.astype(str)
        sys.stderr.write("Similarity matrix has dimension " + str(K.shape) + "\n")

        # If using lineages, check compatible with LMM
        if lineage_samples is not None and set(K.index) != set(lineage_samples):
            sys.stderr.write("Lineage file and similarity matrix contain different sets"
                             " of samples\n")
            sys.exit(1)

        intersecting_samples = p.index.intersection(K.index)
        sys.stderr.write("Analysing " + str(len(intersecting_samples)) + " samples"
                         " found in both phenotype and similarity matrix\n")
        p = p.loc[intersecting_samples]
        y = np.reshape(p.values, (-1, 1))
        K = K.loc[p.index, p.index]
        if len(p.index.intersection(cov.index)) == p.shape[0]:
            covar = np.c_[cov.loc[p.index].values, np.ones((p.shape[0], 1))]
        # empty covariate
        elif (cov.shape[0] == 0 and cov.shape[1] == 0) or len(cov.shape) == 0:
            covar = np.ones((p.shape[0], 1))
        # inform the user about missing covariate/phenotypes and exit
        # special case for LMM
        else:
            sys.stderr.write("Phenotype and covariate file should have "
                             "matching samples for LMM\n")
            sys.exit(1)

        factor = float(len(p)) / np.diag(K.values).sum()
        if factor == math.inf:
            sys.stderr.write("Invalid similarity matrix. Did you use --calc-C?\n")
            sys.exit(1)
        elif abs(factor-1.0) > 1e-15:
            K *= factor

        lmm = lmm_cov(X=covar, Y=y, K=K.values, G=None, inplace=True)
        result = lmm.findH2()
        h2 = result['h2']

        if lmm_cache_out is not None and not os.path.exists(lmm_cache_out):
            lmm.getSU()
            np.savez(lmm_cache_out, lmm.U, lmm.S, np.array([h2]))

    return(p, lmm, h2)




[docs]
def fit_lmm(lmm, h2, variants, variant_mat, lineage_effects,
            lineage_clusters, covariates, continuous,
            filter_pvalue, lrt_pvalue):
    """Fits LMM and returns LMM tuples for printing

    Args:
        lmm (pyseer.fastlmm.lmm_cov.LMM)
            Initialised LMM model
        h2 (float)
            Trait's variance explained by covariates
        variants (iterable)
            Tuples with variant object, phenotype vector and variant vector
            (pyseer.classes.LMM, numpy.array, numpy.array)
        variant_mat (numpy.array)
            Variants presence absence matrix (n, k)
        lineage_effects (bool)
            Whether to fit lineage effects
        lineage_clusters (numpy.array)
            Population structure matrix or lineage association
            binary matrix (n, k)
        covariates (numpy.array)
            Covariates matrix (n, m)
        continuous (bool)
            Whether the phenotype is continuous
        filter_pvalue (float)
            Pre-filtering p-value threshold
        lrt_pvalue (float)
            Post-fitting p-value threshold

    Returns:
        all_variants (iterable)
            All variant objects fitted or filtered
    """
    all_variants = []
    filtered_variants = []
    for var_idx, variant in enumerate(variants):
        notes = set()
        var, p, k = variant
        if var.pattern is None or k is None:
            notes.add('af-filter')
            all_variants.append(var._replace(notes=notes,
                                             prefilter=True,
                                             filter=False))
            variant_mat[:, var_idx] = np.zeros(variant_mat.shape[0])
            continue

        prep, bad_chisq = pre_filtering(p, k, continuous)
        if bad_chisq:
            notes.add('bad-chisq')
        if prep >= filter_pvalue or not np.isfinite(prep):
            notes.add('pre-filtering-failed')
            all_variants.append(var._replace(notes=notes,
                                             prep=prep,
                                             prefilter=True,
                                             filter=False))
            variant_mat[:, var_idx] = np.zeros(variant_mat.shape[0])
            continue
        var = var._replace(prep=prep,
                           notes=notes,
                           prefilter=False)
        filtered_variants.append(var)

    # remove empty rows from filtering
    variant_mat = variant_mat[:, ~np.all(variant_mat == 0, axis=0)]

    if variant_mat.shape[1] == 0:
        return all_variants

    # fit LMM to block
    res = fit_lmm_block(lmm, h2, variant_mat)
    assert len(res['p_values']) == len(filtered_variants), "length of LMM result does not match number of variants"

    passed_vars = []
    for lmm_result_idx, tested_variant in zip(range(len(res['p_values'])),
                                              filtered_variants):
        notes = tested_variant.notes
        if res['p_values'][lmm_result_idx] >= lrt_pvalue or not np.isfinite(
                                            res['p_values'][lmm_result_idx]):
            notes.add('lrt-filtering-failed')
            all_variants.append(tested_variant._replace(
                                      notes=notes,
                                      pvalue=res['p_values'][lmm_result_idx],
                                      filter=True))
        else:
            if lineage_effects:
                max_lineage = fit_lineage_effect(lineage_clusters,
                                                 covariates, k)
            else:
                max_lineage = None

            tested_variant = tested_variant._replace(
                    pvalue=res['p_values'][lmm_result_idx],
                    kbeta=res['beta'][lmm_result_idx],
                    bse=res['bse'][lmm_result_idx],
                    frac_h2=res['frac_h2'][lmm_result_idx],
                    notes=notes,
                    filter=False,
                    max_lineage=max_lineage)

            all_variants.append(tested_variant)

    return all_variants



[docs]
def fit_lmm_block(lmm, h2, variant_block):
    """Actually fits the LMM to numpy variant array
    see map/reduce section of _internal_single in fastlmm.association.single_snp

    Args:
        lmm (pyseer.fastlmm.lmm_cov.LMM)
            Initialised LMM model
        h2 (float)
            Trait's variance explained by covariates
        variant_block (numpy.array)
            Variants presence absence matrix (n, k)

    Returns:
        lmm_results (dict)
            LMM results for this variants block
    """
    res = lmm.nLLeval(h2=h2, dof=None, scale=1.0,
                      penalty=0.0, snps=variant_block)

    beta = res['beta']
    chi2stats = beta*beta/res['variance_beta']

    lmm_results = {}
    lmm_results['p_values'] = stats.f.sf(chi2stats,
                                         1,
                                         lmm.U.shape[0]-(lmm.linreg.D+1))[:, 0]
    lmm_results['beta'] = beta[:, 0]
    lmm_results['bse'] = np.sqrt(res['variance_beta'][:, 0])
    lmm_results['frac_h2'] = np.sqrt(
                        res['fraction_variance_explained_beta'][:, 0]
                                    )

    return(lmm_results)