# Copyright 2017 Marco Galardini and John Lees
'''Functions to read data into pyseer and iterate over instances'''
import sys
from .utils import set_env
import re
import gzip
# avoid numpy taking up more than one thread
with set_env(MKL_NUM_THREADS='1',
NUMEXPR_NUM_THREADS='1',
OMP_NUM_THREADS='1'):
import numpy as np
import pandas as pd
from sklearn import manifold
from pysam import VariantFile
import hashlib
import binascii
import pyseer.classes as var_obj
from .cmdscale import cmdscale
def load_phenotypes(infile, column):
"""Load phenotypes vector
Args:
infile (str)
Matrix input file
column (str or None)
Phenotype column name or None to pick the last column
Returns:
p (pandas.Series)
Phenotype vector (n, 1)
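Examples:
A minimal usage sketch; the file name and column name below
are illustrative, not fixed by pyseer::

    from pyseer.input import load_phenotypes

    # pick a named column
    p = load_phenotypes('phenotypes.tsv', 'resistance')
    # or default to the last column of the file
    p = load_phenotypes('phenotypes.tsv', None)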
"""
p = pd.read_csv(infile, index_col=0, sep='\t')
if p.shape[1] < 1:
sys.stderr.write('Phenotype file must contain at least one phenotype column\n')
sys.exit(1)
p.index = p.index.astype(str)
if np.any(p.index.duplicated()):
sys.stderr.write('Phenotype file contains duplicated sample names\n')
sys.exit(1)
if column is None:
p = p[p.columns[-1]]
else:
p = p[column]
# Remove missing values
p = p.dropna()
# Check type
if not np.issubdtype(p.values.dtype, np.number):
sys.stderr.write('Phenotypes must be numeric\n')
sys.exit(1)
return p
def load_structure(infile, p, max_dimensions, mds_type="classic", n_cpus=1,
seed=None):
"""Load population structure and apply multidimensional scaling
Args:
infile (str)
Population structure (distance matrix) input file
p (pandas.Series)
Phenotype vector (n, 1)
max_dimensions (int)
Maximum dimensions to consider when applying
`metric` or `non-metric` MDS
mds_type (str)
MDS algorithm to apply. One of `classic`,
`metric` or `non-metric`. Any other input will trigger
the `metric` MDS
n_cpus (int)
Number of CPUs to be used for the `metric` or `non-metric` MDS
seed (int or None)
Random seed for `metric` or `non-metric` MDS, None if not required
Returns:
m (pandas.DataFrame)
Population structure after MDS (n, m)
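Examples:
A sketch, assuming a square tab-delimited distance matrix in
'distances.tsv' (an illustrative file name) and a phenotype
vector from `load_phenotypes`::

    from pyseer.input import load_phenotypes, load_structure

    p = load_phenotypes('phenotypes.tsv', None)
    # classic MDS (cmdscale) on the distance matrix
    m = load_structure('distances.tsv', p, max_dimensions=10,
                       mds_type='classic')
    # or metric MDS with 2 CPUs and a fixed seed
    m = load_structure('distances.tsv', p, max_dimensions=10,
                       mds_type='metric', n_cpus=2, seed=42)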
"""
m = pd.read_csv(infile,
index_col=0,
sep='\t')
m.index = m.index.astype(str)
if np.any(m.index.duplicated()):
sys.stderr.write('Structure file contains duplicated sample names\n')
sys.exit(1)
sys.stderr.write("Structure matrix has dimension " + str(m.shape) + "\n")
# Also take the intersection here, so that MDS isn't done using samples
# that have no phenotype
intersecting_samples = p.index.intersection(m.index).intersection(m.columns)
m = m.loc[intersecting_samples, intersecting_samples]
if len(intersecting_samples) == 0:
sys.stderr.write('None of the phenotyped samples were found in '
'the population structure matrix\n')
sys.exit(1)
# MDS
if mds_type == "classic":
projection, evals = cmdscale(m)
else:
metric_mds = True
if mds_type == "non-metric":
metric_mds = False
elif mds_type != "metric":
sys.stderr.write("Unsupported mds type chosen. Assuming metric\n")
mds = manifold.MDS(n_components=max_dimensions, metric=metric_mds, n_jobs=n_cpus,
dissimilarity='precomputed',
random_state=seed)
projection = mds.fit_transform(m.values)
m = pd.DataFrame(projection,
index=m.index)
for i in range(m.shape[1]):
m[i] = m[i] / max(abs(m[i]))
return m
def load_lineage(infile, p):
"""Load custom lineage clusters definitions
Args:
infile (str)
Input file for lineage clusters
p (pandas.Series)
Phenotypes vector (n, 1)
Returns:
result (tuple of (numpy.array, list))
Lineage binary matrix and cluster labels
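Examples:
A sketch, assuming a whitespace-delimited file that maps each
sample name to a lineage label (the file name is illustrative)::

    from pyseer.input import load_phenotypes, load_lineage

    p = load_phenotypes('phenotypes.tsv', None)
    lineage_mat, lineage_labels = load_lineage('lineages.txt', p)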
"""
lin = pd.Series([x.rstrip().split()[1]
for x in open(infile)],
index=[x.split()[0]
for x in open(infile)])
lin.index = lin.index.astype(str)
if np.any(lin.index.duplicated()):
lin = lin.loc[~lin.index.duplicated()] # tilde is unary inversion
if (len(p.index.difference(lin.index)) > 0):
sys.stderr.write("All samples with a phenotype must be present in lineage file\n")
sys.exit(1)
else:
lin = lin.loc[p.index.intersection(lin.index)]
lineages = sorted(set(lin.values))
lineage_design_mat = []
lineage_assign = []
for categ in lineages:
lineage_design_mat.append(pd.Series([1 if x == categ
else 0
for x in lin.values],
index=lin.index))
lineage_assign.append(categ)
lineage_design_mat = pd.concat(lineage_design_mat, axis=1)
assert(np.all(lineage_design_mat.index == p.index))
return(lineage_design_mat.values, lineage_assign)
def load_covariates(infile, covariates, p):
"""Load and encode a covariates matrix
Args:
infile (str)
Input file for the covariates matrix
covariates (iterable or None)
List of string indicating which columns to use and their
interpretation. Example: `2q` indicates that the second column
from the file is a quantitative variable, `2` indicates that
that same column is categorical. If None, the matrix is loaded
but nothing is done with it.
p (pandas.Series)
Phenotypes vector (n, 1)
Returns:
cov (pandas.DataFrame)
Covariates matrix (n, m)
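Examples:
A sketch with an illustrative file name; `2` treats the second
column of the file as categorical (and dummy-encodes it), while
`3q` treats the third column as quantitative::

    from pyseer.input import load_phenotypes, load_covariates

    p = load_phenotypes('phenotypes.tsv', None)
    cov = load_covariates('covariates.tsv', ['2', '3q'], p)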
"""
c = pd.read_csv(infile,
index_col=0,
header=0,
sep='\t')
c.index = c.index.astype(str)
if np.any(c.index.duplicated()):
sys.stderr.write('Covariate file contains duplicated sample names\n')
sys.exit(1)
if (len(p.index.difference(c.index)) > 0):
sys.stderr.write("All samples with a phenotype must be present in covariate file\n")
sys.exit(1)
else:
c = c.loc[p.index.intersection(c.index)]
# which covariates to use?
if covariates is None:
cov = pd.DataFrame([])
else:
cov = []
for col in covariates:
cnum = int(col.rstrip('q'))
if cnum <= 1 or cnum > c.shape[1] + 1:
sys.stderr.write('Covariate column numbers should be '
'greater than 1 and less than or equal to the '
'total number of columns (%d)\n' % (c.shape[1] + 1))
return None
if col[-1] == 'q':
# quantitative
cov.append(c.iloc[:,cnum-2])
else:
# categorical, dummy-encode it
categories = set(c.iloc[:,cnum-2])
categories.pop()
for i, categ in enumerate(categories):
cov.append(pd.Series([1 if x == categ
else 0
for x in
c.iloc[:,cnum-2].values],
index=c.index,
name=c.columns[cnum-2] + "_" + str(i)))
if len(cov) > 0:
cov = pd.concat(cov, axis=1)
else:
cov = pd.DataFrame([])
return cov
def load_burden(infile, burden_regions):
"""Load burden regions for VCF analysis
Args:
infile (str)
Input file for burden regions
burden_regions (list)
List to be filled in-place
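Examples:
A sketch of the expected input: each line holds a region name
followed by comma-separated `contig:start-end` coordinates
(the file name is illustrative)::

    from pyseer.input import load_burden

    burden_regions = []
    load_burden('burden_regions.txt', burden_regions)
    # burden_regions now holds (name, [coordinates, ...]) tuples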
"""
with open(infile, "r") as region_file:
for line in region_file:
name, regions = line.rstrip().split()
region = (name, [])
for section in regions.split(','):
region[1].append(section)
burden_regions.append(region)
def open_variant_file(var_type, var_file, burden_file, burden_regions, uncompressed):
"""Open a variant file for use as an iterable
Args:
var_type (str)
Type of variants file (kmers, vcf, Rtab)
var_file (str)
Location of file
burden_file (str)
File containing regions to group burden tests
burden_regions (list)
List of burden regions to be filled in-place
uncompressed (bool)
True if kmer file is not gzipped
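Examples:
A sketch for a gzipped kmer file and for a VCF with burden
regions; file names are illustrative. A `collections.deque` is
used for the regions so `read_variant` can consume them with
`popleft`::

    from collections import deque
    from pyseer.input import open_variant_file

    kmer_file, _ = open_variant_file('kmers', 'kmers.txt.gz',
                                     None, [], False)

    burden_regions = deque([])
    vcf_file, _ = open_variant_file('vcf', 'variants.vcf.gz',
                                    'burden_regions.txt',
                                    burden_regions, False)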
"""
sample_order = []
if var_type == "kmers":
if uncompressed:
infile = open(var_file)
else:
infile = gzip.open(var_file, 'r')
elif var_type == "vcf":
infile = VariantFile(var_file)
if burden_file:
load_burden(burden_file, burden_regions)
else:
# Rtab files have a header, rather than sample names accessible by row
infile = open(var_file)
header = infile.readline().rstrip()
sample_order = [str(x) for x in header.split()[1:]]
return infile, sample_order
def read_variant(infile, p, var_type, burden, burden_regions,
uncompressed, all_strains, sample_order,
keep_list=None, noparse=False):
"""Read input line and parse depending on input file type
Return a variant name and pres/abs vector
Args:
infile (opened file)
Handle to opened variant file
p (pandas.Series)
Phenotypes vector (n, 1)
var_type (str)
Variants type (one of: kmers, vcf or Rtab)
burden (bool)
Whether to slice a vcf file by burden regions
burden_regions (collections.deque)
Burden regions to slice the vcf with
uncompressed (bool)
Whether the kmers file is uncompressed
all_strains (set-like)
All sample labels that should be present
sample_order
Sample order used to interpret each Rtab line
keep_list (dict)
Variant names to read; any other variant will
return None
(default = None)
noparse (bool)
Set True to skip the line without parsing and
return None, irrespective of presence in
keep_list
(default = False)
Returns:
eof (bool)
Whether we are at the end of the file
k (numpy.array)
Variant presence/absence vector
var_name (str)
Variant name
kstrains (list)
Samples in which the variant is present
nkstrains (list)
Samples in which the variant is absent
af (float)
Allele frequency
missing (float)
Missing frequency
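Examples:
A sketch of looping over a kmer file until EOF; the set-up
names are illustrative and rely on the loaders above::

    from pyseer.input import (load_phenotypes, open_variant_file,
                              read_variant)

    p = load_phenotypes('phenotypes.tsv', None)
    all_strains = set(p.index)
    infile, sample_order = open_variant_file('kmers', 'kmers.txt.gz',
                                             None, [], False)
    while True:
        eof, k, name, kstrains, nkstrains, af, missing = read_variant(
            infile, p, 'kmers', False, [], False,
            all_strains, sample_order)
        if eof:
            break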
"""
if var_type not in {'kmers', 'vcf', 'Rtab'}:
raise ValueError('Variants type not supported')
if var_type == "vcf":
# burden tests read through regions and slice vcf
if burden:
if len(burden_regions) > 0:
line_in = burden_regions.popleft()
else: # Last; to raise exception on next loop
line_in = None
# read single vcf line
else:
try:
line_in = next(infile)
except StopIteration:
line_in = None
else:
# kmers and Rtab plain text files
line_in = infile.readline()
if not line_in or noparse:
eof = True
return(eof, None, None, None, None, None, None)
else:
eof = False
d = {}
if var_type == "kmers":
if not uncompressed:
line_in = line_in.decode()
var_name, strains = (line_in.split()[0],
line_in.rstrip().split(
'|')[1].lstrip().split())
if keep_list is not None and var_name not in keep_list:
return(eof, None, None, None, None, None, None)
else:
d = {str(x.split(':')[0]): 1
for x in strains}
elif var_type == "vcf":
if not burden:
var_name = read_vcf_var(line_in, d, keep_list)
if var_name is None:
return (eof, None, None, None, None, None, None)
else:
# burden test. Regions are named contig:start-end.
# Start is non-inclusive, so start one before to include
(var_name, regions) = line_in
for region in regions:
region = re.match(r'^(.+):(\d+)-(\d+)$', region)
if region:
# Adds presence to d for every variant
# observation in region
for variant in infile.fetch(region.group(1),
int(region.group(2)) - 1,
int(region.group(3))):
var_sub_name = read_vcf_var(variant, d, keep_list)
else: # stop trying to make 'fetch' happen
sys.stderr.write("Could not parse region %s\n" %
str(region))
return (eof, None, None, None, None, None, None)
elif var_type == "Rtab":
try:
split_line = line_in.rstrip().split('\t')
except TypeError:
# python3 unicode corner case
split_line = line_in.decode().rstrip().split('\t')
var_name, strains = split_line[0], split_line[1:]
if keep_list is not None and var_name not in keep_list:
return (eof, None, None, None, None, None, None)
# sanity check
if len(strains) == 0:
raise ValueError('No sample data found; is this a Rtab file?')
# sanity check
if len(strains) != len(sample_order):
raise ValueError('Unexpected mismatch between header and data row')
for present, sample in zip(strains, sample_order):
# sanity check
if present not in {'0', '1', '.', ''}:
raise ValueError('Rtab file not binary')
if present == '1':
d[sample] = 1
elif present == "." or present == "":
d[sample] = np.nan
# Use common dictionary to format design matrix etc
kstrains = sorted(set(d.keys()).intersection(all_strains)) # This will include missing
nkstrains = sorted(all_strains.difference(set(kstrains)))
# default for missing samples is absent kmer
for x in nkstrains:
d[x] = 0
af = float(len(kstrains)) / len(all_strains)
if len(kstrains) == 0:
sys.stderr.write("No observations of " + var_name + " in selected samples\n")
k = np.array([d[x] for x in p.index
if x in d])
missing = float(np.sum(np.isnan(k))) / len(all_strains)
return (eof, k, var_name, kstrains, nkstrains, af, missing)
def read_vcf_var(variant, d, keep_list=None):
"""Parses vcf variants from pysam
Returns None if the variant is filtered. Mutates the passed dictionary d in-place
Args:
variant (pysam.libcbcf.VariantRecord)
Variant to be parsed
d (dict)
Dictionary to be populated in-place
keep_list (list)
List of variants to read
"""
var_name = "_".join([variant.contig, str(variant.pos)] +
[str(allele) for allele in variant.alleles])
if keep_list is None or var_name in keep_list:
# Do not support multiple alleles. Use 'bcftools norm' to split these
if variant.alts is not None and len(variant.alts) > 1:
sys.stderr.write("Multiple alleles at %s_%s. Skipping\n" %
(variant.contig, str(variant.pos)))
var_name = None
elif len(variant.filter.keys()) > 0 and "PASS" not in variant.filter.keys():
var_name = None
else:
for sample, call in variant.samples.items():
# This is dominant encoding. Any instance of '1' will count as present
# Could change to additive, summing instances, or recessive only counting
# when all instances are 1.
# Shouldn't matter for bacteria, but some people call hets
for haplotype in call.get('GT', [None]):
# if sample already present in dictionary do not overwrite
# important for burden testing
if (haplotype is None or str(haplotype) == ".") and sample not in d:
d[sample] = np.nan
elif haplotype is not None and haplotype != 0 and str(haplotype) != ".":
d[sample] = 1
break
# if sample already present as missing and haplotype is '0'
# then remove
elif sample in d and np.isnan(d[sample]) and str(haplotype) != '.':
del d[sample]
else:
var_name = None
return(var_name)
def iter_variants(p, m, cov, var_type, burden, burden_regions, infile,
all_strains, sample_order, lineage_effects, lineage_clusters,
min_af, max_af, max_missing, filter_pvalue, lrt_pvalue, null_fit,
firth_null, uncompressed, continuous):
"""Make an iterable to pass single variants to fixed effects regression
Args:
p (pandas.Series)
Phenotype vector (n, 1)
m (numpy.array)
Population structure matrix (n, k)
cov (pandas.DataFrame)
Covariates matrix (n, m)
var_type (str)
Variants type (one of: kmers, vcf or Rtab)
burden (bool)
Whether to slice a vcf file by burden regions
burden_regions (collections.deque)
Burden regions to slice the vcf with
infile (opened file)
Handle to opened variant file
all_strains (set-like)
All sample labels that should be present
sample_order
Sample order used to interpret each Rtab line
lineage_effects (bool)
Whether to fit lineage effects
lineage_clusters (list)
Lineage cluster indexes
min_af (float)
Minimum allele frequency (inclusive)
max_af (float)
Maximum allele frequency (inclusive)
max_missing (float)
Maximum missing frequency
filter_pvalue (float)
Pre-filtering p-value threshold
lrt_pvalue (float)
Filtering p-value threshold
null_fit (float or statsmodels.regression.linear_model.RegressionResultsWrapper)
Null-fit likelihood (binary) or model (continuous)
firth_null (float)
Firth regression likelihood
uncompressed (bool)
Whether the kmers file is uncompressed
continuous (bool)
Whether the phenotype is continuous or not
Returns:
var_name (str)
Variant name
v (numpy.array)
Phenotypes vector (n, 1)
k (numpy.array)
Variant presence/absence vector (n, 1)
m (numpy.array)
Population structure matrix (n, k)
c (numpy.array)
Covariates matrix (n, m)
af (float)
Allele frequency
pattern (bytes)
Variant hash
lineage_effects (bool)
Whether to fit lineage effects
lineage_clusters (list)
Lineage cluster indexes
filter_pvalue (float)
Pre-filtering p-value threshold
lrt_pvalue (float)
Filtering p-value threshold
null_fit (float or statsmodels.regression.linear_model.RegressionResultsWrapper)
Null-fit likelihood (binary) or model (continuous)
firth_null (float)
Firth regression likelihood
kstrains (iterable)
Sample labels with the variant
nkstrains (iterable)
Sample labels without the variant
continuous (bool)
Whether the phenotype is continuous or not
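Examples:
A hedged sketch of wiring the generator up; `p`, `m`, `cov` and
`infile` are assumed to come from the loaders above, and
`null_fit`/`firth_null` from a null model fitted elsewhere.
The thresholds are illustrative::

    variant_iter = iter_variants(p, m.values, cov, 'kmers',
                                 False, [], infile,
                                 set(p.index), [],
                                 False, None,
                                 0.01, 0.99, 0.05,
                                 0.05, 0.05,
                                 null_fit, firth_null,
                                 False, False)
    for var_tuple in variant_iter:
        var_name, v, k = var_tuple[:3]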
"""
while True:
eof, k, var_name, kstrains, nkstrains, af, missing = read_variant(infile,
p,
var_type,
burden,
burden_regions,
uncompressed,
all_strains,
sample_order)
# check for EOF
if eof:
return
v = p.values
c = cov.values
if k is not None:
pattern = hash_pattern(k)
else:
pattern = None
if (k is None) or not (min_af <= af <= max_af) or (missing > max_missing):
# TODO: currently the way to know if a variant was
# af filtered is to check that phenotypes vector is None
# a more robust way could be better
yield (var_name, None, k, m, c, af, pattern,
lineage_effects, lineage_clusters,
filter_pvalue, lrt_pvalue, null_fit, firth_null,
kstrains, nkstrains, continuous)
else:
yield (var_name, v, k, m, c, af, pattern,
lineage_effects, lineage_clusters,
filter_pvalue, lrt_pvalue, null_fit, firth_null,
kstrains, nkstrains, continuous)
def iter_variants_lmm(variant_iter, lmm, h2,
lineage, lineage_clusters,
covariates, continuous, filter_pvalue, lrt_pvalue):
"""Make an iterable to pass single variants to fixed effects regression"""
for variants, variant_mat, eof in variant_iter:
if len(variants) == 0:
break
yield (lmm, h2, variants, variant_mat, lineage,
lineage_clusters, covariates,
continuous, filter_pvalue, lrt_pvalue)
if eof:
break
# Loads a block of variants into memory for use with LMM
def load_var_block(var_type, p, burden, burden_regions, infile,
all_strains, sample_order, min_af, max_af,
max_missing, uncompressed, block_size):
"""Make in iterable to load blocks of variants for LMM
Args:
var_type (str)
Variants type (one of: kmers, vcf or Rtab)
p (pandas.Series)
Phenotype vector (n, 1)
burden (bool)
Whether to slice a vcf file by burden regions
burden_regions (collections.deque)
Burden regions to slice the vcf with
infile (opened file)
Handle to opened variant file
all_strains (set-like)
All sample labels that should be present
sample_order
Sample order used to interpret each Rtab line
min_af (float)
Minimum allele frequency (inclusive)
max_af (float)
Maximum allele frequency (inclusive)
max_missing (float)
Maximum missing frequency
uncompressed (bool)
Whether the kmers file is uncompressed
block_size (int)
How many variants to be loaded at once
Returns:
variants (iterable)
A collection of pyseer.classes.LMM objects describing the
loaded variants (n,)
variant_mat (numpy.array)
Variant block presence/absence matrix (n, block_size)
eof (bool)
Whether we are at the end of the file
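Examples:
A hedged sketch of pulling blocks of 1000 variants for the LMM;
`p` and `infile` are assumed to come from the loaders above and
the frequency thresholds are illustrative::

    block_iter = load_var_block('kmers', p, False, [], infile,
                                set(p.index), [],
                                0.01, 0.99, 0.05,
                                False, 1000)
    for variants, variant_mat, eof in block_iter:
        # pass variants and variant_mat on to the LMM fitting code
        if eof:
            break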
"""
while True:
variants = []
# pre-allocation of memory
variant_mat = np.zeros((len(p), block_size))
for var_idx in range(block_size):
eof, k, var_name, kstrains, nkstrains, af, missing = read_variant(
infile, p, var_type,
burden, burden_regions,
uncompressed, all_strains,
sample_order)
# check for EOF
if eof:
break
if k is None or af < min_af or af > max_af or missing > max_missing:
pattern = None
else:
pattern = hash_pattern(k)
variant_mat[:, var_idx] = k
variants.append((var_obj.LMM(var_name, pattern, af, np.nan,
np.nan, np.nan, np.nan, np.nan,
np.nan, kstrains, nkstrains,
set(), True, True),
p.values, k))
yield (variants, variant_mat, eof)
if eof:
break
yield None, None, True
def hash_pattern(k):
"""Calculates the hash of a presence/absence vector
Args:
k (numpy.array)
Variant presence/absence binary vector (n, 1)
Returns:
hash (bytes)
Hashed pattern
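Examples:
A small sketch with an illustrative presence/absence vector::

    import numpy as np
    from pyseer.input import hash_pattern

    k = np.array([1., 0., 0., 1.])
    pattern = hash_pattern(k)  # base64-encoded MD5 digest, as bytes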
"""
pattern = k.view(np.uint8)
hashed = hashlib.md5(pattern)
return (binascii.b2a_base64(hashed.digest()))
def file_hash(filename):
"""Calculates the hash of an entire file on disk
Args:
filename (str)
Location of file on disk
Returns:
hash (str)
SHA256 checksum
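Examples:
A small sketch; the file name is illustrative::

    from pyseer.input import file_hash

    checksum = file_hash('kmers.txt.gz')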
"""
hash_sha256 = hashlib.sha256()
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_sha256.update(chunk)
return hash_sha256.hexdigest()