VCF filtration

Contents

VCF filtration#

VCF filtrations and a filterer to apply them.

class Filtration[source]#

Bases: ABC

Base class for filtering sites based on certain criteria.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

__init__()[source]#

Initialize filtration.

abstract filter_site(variant: cyvcf2.Variant | DummyVariant)[source]#

Filter site.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True if the variant should be kept, False otherwise.

class MaskedFiltration(use_parser: bool = True, include_samples: List[str] | None = None, exclude_samples: List[str] | None = None)[source]#

Bases: Filtration, ABC

Filter sites based on a samples mask.

__init__(use_parser: bool = True, include_samples: List[str] | None = None, exclude_samples: List[str] | None = None)[source]#

Create a new filtration instance.

Parameters:
  • use_parser (bool) – Whether to use the samples mask from the parser, if used together with parser.

  • include_samples (Optional[List[str]]) – The samples to include, defaults to all samples.

  • exclude_samples (Optional[List[str]]) – The samples to exclude, defaults to no samples.

use_parser: bool#

Whether to use the samples mask from the parser, if used together with parser.

include_samples: Optional[List[str]]#

The samples to include.

exclude_samples: Optional[List[str]]#

The samples to exclude.

abstract filter_site(variant: cyvcf2.Variant | DummyVariant)#

Filter site.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True if the variant should be kept, False otherwise.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

class SNPFiltration(use_parser: bool = True, include_samples: List[str] | None = None, exclude_samples: List[str] | None = None)[source]#

Bases: MaskedFiltration

Only keep SNPs. Note that this entails discarding mono-morphic sites.

filter_site(variant: cyvcf2.Variant | DummyVariant)[source]#

Filter site.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True if the variant is an SNP, False otherwise.

__init__(use_parser: bool = True, include_samples: List[str] | None = None, exclude_samples: List[str] | None = None)#

Create a new filtration instance.

Parameters:
  • use_parser (bool) – Whether to use the samples mask from the parser, if used together with parser.

  • include_samples (Optional[List[str]]) – The samples to include, defaults to all samples.

  • exclude_samples (Optional[List[str]]) – The samples to exclude, defaults to no samples.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

use_parser: bool#

Whether to use the samples mask from the parser, if used together with parser.

include_samples: Optional[List[str]]#

The samples to include.

exclude_samples: Optional[List[str]]#

The samples to exclude.

class SNVFiltration[source]#

Bases: Filtration

Only keep single site variants (discard indels and MNPs but keep monomorphic sites).

filter_site(variant: cyvcf2.Variant | DummyVariant)[source]#

Filter site.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True if the variant is kept, False otherwise.

__init__()#

Initialize filtration.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

class PolyAllelicFiltration(use_parser: bool = True, include_samples: List[str] | None = None, exclude_samples: List[str] | None = None)[source]#

Bases: MaskedFiltration

Filter out poly-allelic sites.

filter_site(variant: cyvcf2.Variant | DummyVariant)[source]#

Filter site. Note that we don’t check explicitly all alleles, but rather rely on ALT field.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True if the variant is not poly-allelic, False otherwise.

__init__(use_parser: bool = True, include_samples: List[str] | None = None, exclude_samples: List[str] | None = None)#

Create a new filtration instance.

Parameters:
  • use_parser (bool) – Whether to use the samples mask from the parser, if used together with parser.

  • include_samples (Optional[List[str]]) – The samples to include, defaults to all samples.

  • exclude_samples (Optional[List[str]]) – The samples to exclude, defaults to no samples.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

use_parser: bool#

Whether to use the samples mask from the parser, if used together with parser.

include_samples: Optional[List[str]]#

The samples to include.

exclude_samples: Optional[List[str]]#

The samples to exclude.

class AllFiltration[source]#

Bases: Filtration

Filter out all sites. Only useful for testing purposes.

filter_site(variant: cyvcf2.Variant | DummyVariant)[source]#

Filter site.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

False.

__init__()#

Initialize filtration.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

class NoFiltration[source]#

Bases: Filtration

Do not filter out any sites. Only useful for testing purposes.

filter_site(variant: cyvcf2.Variant | DummyVariant)[source]#

Filter site.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True.

__init__()#

Initialize filtration.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

class CodingSequenceFiltration[source]#

Bases: Filtration

Filter out sites that are not in coding sequences. This filter should find frequent use when parsing spectra for DFE inference as we only consider sites in coding sequences for this purpose. By using it, the annotation and parsing of unnecessary sites can be avoided which increases the speed. Note that we assume here that within contigs, sites in the GFF file are sorted by position in ascending order.

For this filtration to work, we require a GFF file (passed to Parser or Filterer).

__init__()[source]#

Create a new filtration instance.

cd: Optional[Series]#

The coding sequence enclosing the current variant or the closest one downstream.

n_processed: int#

The number of processed sites.

filter_site(v: cyvcf2.Variant | DummyVariant)[source]#

Filter site by whether it is in a coding sequence.

Parameters:

v (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True if the variant is in a coding sequence, False otherwise.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

class DeviantOutgroupFiltration(outgroups: List[str], ingroups: List[str] = None, strict_mode: bool = True, retain_monomorphic: bool = True)[source]#

Bases: Filtration

Filter out sites where the major allele of the specified outgroup samples differs from the major allele of the ingroup samples.

__init__(outgroups: List[str], ingroups: List[str] = None, strict_mode: bool = True, retain_monomorphic: bool = True)[source]#

Construct DeviantOutgroupFiltration.

Parameters:
  • outgroups (List[str]) – The name of the outgroup samples to consider.

  • ingroups (List[str]) – The name of the ingroup samples to consider, defaults to all samples but the outgroups.

  • strict_mode (bool) – Whether to filter out sites where no outgroup sample is present, defaults to True.

  • retain_monomorphic (bool) – Whether to retain monomorphic sites, defaults to True, which is faster.

ingroups: Optional[List[str]]#

The ingroup samples.

outgroups: List[str]#

The outgroup samples.

strict_mode: bool#

Whether to filter out sites where no outgroup sample is present.

retain_monomorphic: bool#

Whether to retain monomorphic sites.

samples: Optional[ndarray]#

The samples found in the VCF file.

ingroup_mask: Optional[ndarray]#

The ingroup mask.

outgroup_mask: Optional[ndarray]#

The outgroup mask.

filter_site(variant: cyvcf2.Variant | DummyVariant)[source]#

Filter site.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True if the variant should be kept, False otherwise.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

class ExistingOutgroupFiltration(outgroups: List[str], n_missing: int = 1)[source]#

Bases: Filtration

Filter out sites for which at least n_missing of the specified outgroup samples have no called base.

__init__(outgroups: List[str], n_missing: int = 1)[source]#

Construct ExistingOutgroupFiltration.

Parameters:
  • outgroups (List[str]) – The names of the outgroup samples considered.

  • n_missing (int) – The number of outgroup samples that need to be missing to fail the filter.

outgroups: List[str]#

The outgroup samples.

n_missing: int#

Minimum number of missing outgroups required to filter out a site.

samples: Optional[ndarray]#

The samples found in the VCF file.

outgroup_mask: Optional[ndarray]#

The outgroup mask.

filter_site(variant: cyvcf2.Variant | DummyVariant)[source]#

Filter site.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True if the variant should be kept, False otherwise.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

class BiasedGCConversionFiltration[source]#

Bases: Filtration

Only retain A<->T and G<->C substitutions (which are unaffected by biased gene conversion, see [CITGB]).

Mono-allelic sites are always retained, and we assume sites are at most bi-allelic. Note that the number of mutational target sites is reduced by this filtration.

[CITGB]

Pouyet et al., ‘Background selection and biased gene conversion affect more than 95% of the human genome and bias demographic inferences.’, Elife, 7:e36317, 2018

filter_site(variant: cyvcf2.Variant | DummyVariant)[source]#

Remove bi-allelic sites that are not A<->T or G<->C mutations.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True if the variant should be kept, False otherwise.

__init__()#

Initialize filtration.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

class ContigFiltration(contigs: List[str])[source]#

Bases: Filtration

Filter out sites that are not on the specified contigs.

__init__(contigs: List[str])[source]#

Construct ContigFiltration.

Parameters:

contigs (List[str]) – The contigs to retain.

contigs: List[str]#

The contigs to retain.

filter_site(variant: cyvcf2.Variant | DummyVariant)[source]#

Filter site.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to filter.

Return type:

bool

Returns:

True if the variant is on one of the specified contigs, False otherwise.

n_filtered: int = 0#

The number of sites that didn’t pass the filter.

class Filterer(vcf: str | Iterable[cyvcf2.Variant], output: str, gff: str | None = None, filtrations: List[Filtration] = [], info_ancestral: str = 'AA', max_sites: int = inf, seed: int | None = 0, cache: bool = True, aliases: Dict[str, List[str]] = {})[source]#

Bases: MultiHandler

Filter a VCF file using a list of filtrations.

Example usage:

import fastdfe as fd

# only keep variants in coding sequences
f = fd.Filterer(
    vcf="http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/"
        "1000_genomes_project/release/20181203_biallelic_SNV/"
        "ALL.chr21.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz",
    gff="http://ftp.ensembl.org/pub/release-109/gff3/homo_sapiens/"
        "Homo_sapiens.GRCh38.109.chromosome.21.gff3.gz",
    output='sapiens.chr21.coding.vcf.gz',
    filtrations=[fd.CodingSequenceFiltration()],
    aliases=dict(chr21=['21'])
)

f.filter()
__init__(vcf: str | Iterable[cyvcf2.Variant], output: str, gff: str | None = None, filtrations: List[Filtration] = [], info_ancestral: str = 'AA', max_sites: int = inf, seed: int | None = 0, cache: bool = True, aliases: Dict[str, List[str]] = {})[source]#

Create a new filter instance.

Parameters:
  • vcf (Union[str, Iterable[cyvcf2.Variant]]) – The VCF file, possibly gzipped or a URL.

  • output (str) – The output file.

  • gff (str | None) – The GFF file, possibly gzipped or a URL. This argument is required for some filtrations.

  • filtrations (List[Filtration]) – The filtrations.

  • info_ancestral (str) – The info field for the ancestral allele.

  • max_sites (int) – The maximum number of sites to process.

  • seed (int | None) – The seed for the random number generator. Use None for no seed.

  • cache (bool) – Whether to cache files downloaded from urls.

  • aliases (Dict[str, List[str]]) – Dictionary of aliases for the contigs in the VCF file, e.g. {'chr1': ['1']}.

filtrations: List[Filtration]#

The filtrations.

output: str#

The output file.

n_filtered: int#

The number of sites that did not pass the filters.

is_filtered(variant: cyvcf2.Variant | DummyVariant)[source]#

Whether the given variant is kept.

Parameters:

variant (Union[cyvcf2.Variant, DummyVariant]) – The variant to check.

Return type:

bool

Returns:

True if the variant is kept, False otherwise.

filter()[source]#

Filter the VCF.

count_sites()#

Count the number of sites in the VCF.

Return type:

int

Returns:

Number of sites

classmethod download_file(url: str, cache: bool = True, desc: str = 'Downloading file')#

Download a file from a URL.

Parameters:
  • cache (bool) – Whether to cache the file.

  • url (str) – The URL to download the file from.

  • desc (str) – Description for the progress bar

Return type:

str

Returns:

The path to the downloaded file.

download_if_url(path: str)#

Download the VCF file if it is a URL.

Parameters:

path (str) – The path to the VCF file.

Return type:

str

Returns:

The path to the downloaded file or the original path.

get_aliases(contig: str)#

Get all aliases for the given contig alias including the primary alias.

Parameters:

contig (str) – The contig.

Return type:

List[str]

Returns:

The aliases.

get_contig(aliases, rewind: bool = True, notify: bool = True)#

Get the contig from the FASTA file.

Note that pyfaidx would be more efficient here, but there were problems when running it in parallel.

Parameters:
  • aliases – The contig aliases.

  • rewind (bool) – Whether to allow for rewinding the iterator if the contig is not found.

  • notify (bool) – Whether to notify the user when rewinding the iterator.

Return type:

SeqRecord

Returns:

The contig.

get_contig_names()#

Get the names of the contigs in the FASTA file.

Return type:

List[str]

Returns:

The contig names.

static get_filename(url: str)#

Return the file extension of a URL.

Parameters:

url (str) – The URL to get the file extension from.

Returns:

The file extension.

get_pbar(desc: str = 'Processing sites', total: int | None = 0)#

Return a progress bar for the number of sites.

Parameters:
  • desc (str) – Description for the progress bar

  • total (int | None) – Total number of items

Return type:

tqdm

Returns:

tqdm

static hash(s: str)#

Return a truncated SHA1 hash of a string.

Parameters:

s (str) – The string to hash.

Return type:

str

Returns:

The SHA1 hash.

static is_url(path: str)#

Check if the given path is a URL.

Parameters:

path (str) – The path to check.

Return type:

bool

Returns:

True if the path is a URL, False otherwise.

load_fasta(file: str)#

Load a FASTA file into a dictionary.

Parameters:

file (str) – The path to The FASTA file path, possibly gzipped or a URL

Return type:

FastaIterator

Returns:

Iterator over the sequences.

load_vcf()#

Load a VCF file into a dictionary.

Return type:

cyvcf2.VCF

Returns:

The VCF reader.

property n_sites: int#

Get the number of sites in the VCF.

Returns:

Number of sites

static remove_overlaps(df: DataFrame)#

Remove overlapping coding sequences.

Parameters:

df (DataFrame) – The coding sequences.

Return type:

DataFrame

Returns:

The coding sequences without overlaps.

static unzip_if_zipped(file: str)#

If the given file is gzipped, unzip it and return the path to the unzipped file. If the file is not gzipped, return the path to the original file.

Parameters:

file (str) – The path to the file.

Returns:

The path to the unzipped file, or the original file if it was not gzipped.

vcf#

The path to the VCF file or an iterable of variants

info_ancestral: str#

The tag in the INFO field that contains the ancestral allele

max_sites: int#

Maximum number of sites to consider

seed: Optional[int]#

Seed for the random number generator

rng#

Random generator instance

fasta: str#

The path to the FASTA file.

gff#

The GFF file path

cache: bool#

Whether to cache files that are downloaded from URLs

aliases#

The contig mappings