#!/usr/bin/env perl
use lib '/home/raj/perl5/lib/perl5';
=head1 LICENSE
Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=cut
=head1 NAME
Variant Effect Predictor - a script to predict the consequences of genomic variants
http://www.ensembl.org/info/docs/variation/vep/vep_script.html
Version 71
by Will McLaren (wm2@ebi.ac.uk)
=cut
use strict;
use Getopt::Long;
use FileHandle;
use CGI qw/:standard/;
use FindBin qw($Bin);
use lib $Bin;
use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
use Bio::EnsEMBL::Variation::Utils::VEP qw(
parse_line
vf_to_consequences
validate_vf
convert_to_vcf
load_dumped_adaptor_cache
dump_adaptor_cache
get_all_consequences
get_slice
build_full_cache
read_cache_info
get_time
debug
@OUTPUT_COLS
@REG_FEAT_TYPES
%FILTER_SHORTCUTS
);
# global vars
my $VERSION = '71';
# define headers that would normally go in the extra field
# keyed on the config parameter used to turn it on
my %extra_headers = (
protein => ['ENSP'],
canonical => ['CANONICAL'],
ccds => ['CCDS'],
hgvs => ['HGVSc','HGVSp'],
hgnc => ['HGNC'],
sift => ['SIFT'],
polyphen => ['PolyPhen'],
numbers => ['EXON','INTRON'],
domains => ['DOMAINS'],
regulatory => ['MOTIF_NAME','MOTIF_POS','HIGH_INF_POS','MOTIF_SCORE_CHANGE'],
cell_type => ['CELL_TYPE'],
individual => ['IND','ZYG'],
xref_refseq => ['RefSeq'],
check_svs => ['SV'],
check_frequency => ['FREQS'],
gmaf => ['GMAF'],
maf_1kg => ['AFR_MAF','AMR_MAF','ASN_MAF','EUR_MAF'],
user => ['DISTANCE'],
check_existing => ['CLIN_SIG'],
biotype => ['BIOTYPE'],
);
my %extra_descs = (
'CANONICAL' => 'Indicates if transcript is canonical for this gene',
'CCDS' => 'Indicates if transcript is a CCDS transcript',
'HGNC' => 'HGNC gene identifier',
'ENSP' => 'Ensembl protein identifer',
'HGVSc' => 'HGVS coding sequence name',
'HGVSp' => 'HGVS protein sequence name',
'SIFT' => 'SIFT prediction',
'PolyPhen' => 'PolyPhen prediction',
'EXON' => 'Exon number(s) / total',
'INTRON' => 'Intron number(s) / total',
'DOMAINS' => 'The source and identifer of any overlapping protein domains',
'MOTIF_NAME' => 'The source and identifier of a transcription factor binding profile (TFBP) aligned at this position',
'MOTIF_POS' => 'The relative position of the variation in the aligned TFBP',
'HIGH_INF_POS' => 'A flag indicating if the variant falls in a high information position of the TFBP',
'MOTIF_SCORE_CHANGE' => 'The difference in motif score of the reference and variant sequences for the TFBP',
'CELL_TYPE' => 'List of cell types and classifications for regulatory feature',
'IND' => 'Individual name',
'ZYG' => 'Zygosity of individual genotype at this locus',
'SV' => 'IDs of overlapping structural variants',
'FREQS' => 'Frequencies of overlapping variants used in filtering',
'GMAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1 combined population',
'AFR_MAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1 combined African population',
'AMR_MAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1 combined American population',
'ASN_MAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1 combined Asian population',
'EUR_MAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1 combined European population',
'DISTANCE' => 'Shortest distance from variant to transcript',
'CLIN_SIG' => 'Clinical significance of variant from dbSNP',
'BIOTYPE' => 'Biotype of transcript',
);
my %ts_tv = (
'A/G' => 'Ts',
'G/A' => 'Ts',
'C/T' => 'Ts',
'T/C' => 'Ts',
'A/C' => 'Tv',
'C/A' => 'Tv',
'G/T' => 'Tv',
'T/G' => 'Tv',
'C/G' => 'Tv',
'G/C' => 'Tv',
'A/T' => 'Tv',
'T/A' => 'Tv',
);
my %colour_keys = (
'polyphen' => {
'unknown' => 'blue',
'benign' => 'green',
'possibly damaging' => 'orange',
'probably damaging' => 'red',
},
'sift' => {
'tolerated' => 'green',
'deleterious' => 'red',
},
# copied from COLOUR.ini in web code via browser to check colours
'consequences' => {
'intergenic_variant' => 'gray',
'intron_variant' => '#02599c',
'upstream_gene_variant' => '#a2b5cd',
'downstream_gene_variant' => '#a2b5cd',
'5_prime_utr_variant' => '#7ac5cd',
'3_prime_utr_variant' => '#7ac5cd',
'splice_region_variant' => '#ff7f50',
'splice_donor_variant' => '#ff7f50',
'splice_acceptor_variant' => '#ff7f50',
'frameshift_variant' => '#ff69b4',
'transcript_ablation' => '#ff0000',
'transcript_amplification' => '#ff69b4',
'inframe_insertion' => '#ff69b4',
'inframe_deletion' => '#ff69b4',
'synonymous_variant' => '#76ee00',
'stop_retained_variant' => '#76ee00',
'missense_variant' => '#ffd700',
'initiator_codon_variant' => '#ffd700',
'stop_gained' => '#ff0000',
'stop_lost' => '#ff0000',
'mature_mirna_variant' => '#458b00',
'non_coding_exon_variant' => '#32cd32',
'nc_transcript_variant' => '#32cd32',
'incomplete_terminal_codon_variant' => '#ff00ff',
'nmd_transcript_variant' => '#ff4500',
'coding_sequence_variant' => '#458b00',
'tfbs_ablation' => 'brown',
'tfbs_amplification' => 'brown',
'tf_binding_site_variant' => 'brown',
'regulatory_region_variant' => 'brown',
'regulatory_region_ablation' => 'brown',
'regulatory_region_amplification' => 'brown',
},
);
# set output autoflush for progress bars
$| = 1;
# configure from command line opts
my $config = &configure(scalar @ARGV);
# run the main sub routine
&main($config);
# this is the main sub-routine - it needs the configured $config hash
sub main {
my $config = shift;
debug("Starting...") unless defined $config->{quiet};
# this is for counting seconds
$config->{start_time} = time();
$config->{last_time} = time();
# this is for stats
$config->{stats}->{start_time} = get_time();
my $tr_cache = {};
my $rf_cache = {};
# create a hash to hold slices so we don't get the same one twice
my %slice_cache = ();
my @vfs;
my ($vf_count, $total_vf_count);
my $in_file_handle = $config->{in_file_handle};
# initialize line number in config
$config->{line_number} = 0;
# read the file
while(<$in_file_handle>) {
chomp;
$config->{line_number}++;
# header line?
if(/^\#/) {
# retain header lines if we are outputting VCF
if(defined($config->{vcf})) {
push @{$config->{headers}}, $_;
}
# line with sample labels in VCF
if(defined($config->{individual}) && /^#CHROM/) {
my @split = split /\s+/;
# no individuals
die("ERROR: No individual data found in VCF\n") if scalar @split <= 9;
# get individual column indices
my %ind_cols = map {$split[$_] => $_} (9..$#split);
# all?
if(scalar @{$config->{individual}} == 1 && $config->{individual}->[0] =~ /^all$/i) {
$config->{ind_cols} = \%ind_cols;
}
else {
my %new_ind_cols;
# check we have specified individual(s)
foreach my $ind(@{$config->{individual}}) {
die("ERROR: Individual named \"$ind\" not found in VCF\n") unless defined $ind_cols{$ind};
$new_ind_cols{$ind} = $ind_cols{$ind};
}
$config->{ind_cols} = \%new_ind_cols;
}
}
next;
}
# configure output file
$config->{out_file_handle} ||= &get_out_file_handle($config);
# some lines (pileup) may actually parse out into more than one variant
foreach my $vf(@{&parse_line($config, $_)}) {
$vf->{_line} = $_ ;#if defined($config->{vcf}) || defined($config->{original});
# now get the slice
if(!defined($vf->{slice})) {
my $slice;
# don't get slices if we're using cache
# we can steal them from transcript objects later
if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) {
# check if we have fetched this slice already
if(defined $slice_cache{$vf->{chr}}) {
$slice = $slice_cache{$vf->{chr}};
}
# if not create a new one
else {
$slice = &get_slice($config, $vf->{chr});
# if failed, warn and skip this line
if(!defined($slice)) {
warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet};
next;
}
# store the hash
$slice_cache{$vf->{chr}} = $slice;
}
}
$vf->{slice} = $slice;
}
# validate the VF
next unless validate_vf($config, $vf);
# make a name if one doesn't exist
$vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.($vf->{allele_string} || $vf->{class_SO_term});
# jump out to convert here
if(defined($config->{convert})) {
&convert_vf($config, $vf);
next;
}
if(defined $config->{whole_genome}) {
push @vfs, $vf;
$vf_count++;
$total_vf_count++;
if($vf_count == $config->{buffer_size}) {
debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)};
# calculate stats
my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1));
my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1));
$config->{last_time} = time();
debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet});
@vfs = ();
$vf_count = 0;
}
}
else {
print_line($config, $_) foreach @{vf_to_consequences($config, $vf)};
$vf_count++;
$total_vf_count++;
debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose});
}
}
}
# if in whole-genome mode, finish off the rest of the buffer
if(defined $config->{whole_genome} && scalar @vfs) {
debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)};
# calculate stats
my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1));
my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1));
$config->{last_time} = time();
debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet});
}
debug($config->{stats}->{filter_count}, "/$total_vf_count variants remain after filtering") if (defined($config->{filter}) || defined($config->{check_frequency})) && !defined($config->{quiet});
debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet});
# finalise run-time stats
$config->{stats}->{var_count} = $total_vf_count;
$config->{stats}->{end_time} = get_time();
$config->{stats}->{run_time} = time() - $config->{start_time};
# write stats
unless(defined($config->{no_stats})) {
summarise_stats($config);
debug("Wrote stats summary to ".$config->{stats_file}) unless defined($config->{quiet});
}
# close HTML output
if(defined($config->{html}) && defined($config->{html_file_handle})) {
my $fh = $config->{html_file_handle};
print $fh "</tbody><tfoot><tr>".$config->{_th}."</tr></tfoot></table><p> </p></div></html></body>\n</html>\n";
$fh->close;
}
debug("Finished!") unless defined $config->{quiet};
}
# sets up configuration hash that is used throughout the script
sub configure {
my $args = shift;
my $config = {};
my @ARGV_copy = @ARGV;
$config->{stats}->{options} = \@ARGV_copy;
GetOptions(
$config,
'help', # displays help message
# input options,
'config=s', # config file name
'input_file|i=s', # input file name
'format=s', # input file format
# DB options
'species=s', # species e.g. human, homo_sapiens
'registry=s', # registry file
'host=s', # database host
'port=s', # database port
'user=s', # database user name
'password=s', # database password
'db_version=i', # Ensembl database version to use e.g. 62
'genomes', # automatically sets DB params for e!Genomes
'refseq', # use otherfeatures RefSeq DB instead of Ensembl
# runtime options
'most_severe', # only return most severe consequence
'summary', # only return one line per variation with all consquence types
'per_gene', # only return most severe per gene
'buffer_size=i', # number of variations to read in before analysis
'chunk_size=s', # size in bases of "chunks" used in internal hash structure
'failed=i', # include failed variations when finding existing
'no_whole_genome', # disables now default whole-genome mode
'whole_genome', # proxy for whole genome mode - now just warns user
'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG)
'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT
'check_ref', # check supplied reference allele against DB
'check_existing', # find existing co-located variations
'check_svs', # find overlapping structural variations
'check_alleles', # only attribute co-located if alleles are the same
'check_frequency', # enable frequency checking
'gmaf', # add global MAF of existing var
'maf_1kg', # add 1KG MAFs of existing vars
'freq_filter=s', # exclude or include
'freq_freq=f', # frequency to filter on
'freq_gt_lt=s', # gt or lt (greater than or less than)
'freq_pop=s', # population to filter on
'filter_common', # shortcut to MAF filtering
'allow_non_variant', # allow non-variant VCF lines through
'individual=s', # give results by genotype for individuals
'phased', # force VCF genotypes to be interpreted as phased
'fork=i', # fork into N processes
# verbosity options
'verbose|v', # print out a bit more info while running
'quiet', # print nothing to STDOUT (unless using -o stdout)
'no_progress', # don't display progress bars
# output options
'everything|e', # switch on EVERYTHING :-)
'output_file|o=s', # output file name
'html', # generate an HTML version of output
'stats_file|sf=s', # stats file name
'no_stats', # don't write stats file
'force_overwrite', # force overwrite of output file if already exists
'terms|t=s', # consequence terms to use e.g. NCBI, SO
'coding_only', # only return results for consequences in coding regions
'canonical', # indicates if transcript is canonical
'ccds', # output CCDS identifer
'xref_refseq', # output refseq mrna xref
'protein', # add e! protein ID to extra column
'biotype', # add biotype of transcript to output
'hgnc', # add HGNC gene ID to extra column
'hgvs', # add HGVS names to extra column
'sift=s', # SIFT predictions
'polyphen=s', # PolyPhen predictions
'condel=s', # Condel predictions
'regulatory', # enable regulatory stuff
'cell_type=s' => ($config->{cell_type} ||= []), # filter cell types for regfeats
'convert=s', # convert input to another format (doesn't run VEP)
'filter=s', # run in filtering mode
'no_intergenic', # don't print out INTERGENIC consequences
'gvf', # produce gvf output
'vcf', # produce vcf output
'original', # produce output in input format
'no_consequences', # don't calculate consequences
'lrg', # enable LRG-based features
'fields=s', # define your own output fields
'domains', # output overlapping protein features
'numbers', # include exon and intron numbers
# cache stuff
'database', # must specify this to use DB now
'cache', # use cache
'write_cache', # enables writing to the cache
'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs
'no_adaptor_cache', # don't write adaptor cache
'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript
'strip', # strips adaptors etc from objects before caching them
'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more
'dir=s', # dir where cache is found (defaults to $HOME/.vep/)
'cache_region_size=i', # size of region in bases for each cache file
'no_slice_cache', # tell API not to cache features on slice
'standalone', # standalone renamed offline
'offline', # offline mode uses minimal set of modules installed in same dir, no DB connection
'skip_db_check', # don't compare DB parameters with cached
'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc"
'custom=s' => ($config->{custom} ||= []), # specify custom tabixed bgzipped file with annotation
'tmpdir=s', # tmp dir used for BigWig retrieval
'plugin=s' => ($config->{plugin} ||= []), # specify a method in a module in the plugins directory
'fasta=s', # file or dir containing FASTA files with reference sequence
'freq_file=s', # file containing freqs to add to cache build
# debug
'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL
'count_queries', # counts SQL queries executed
'admin', # allows me to build off public hosts
'debug', # print out debug info
'tabix', # experimental use tabix cache files
) or die "ERROR: Failed to parse command-line flags\n";
# print usage message if requested or no args supplied
if(defined($config->{help}) || !$args) {
&usage;
exit(0);
}
# config file?
if(defined $config->{config}) {
read_config_from_file($config, $config->{config});
}
# dir is where the cache and plugins live
$config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep');
# dir gets set to the specific cache directory later on, so take a copy to use
# when configuring plugins
$config->{toplevel_dir} = $config->{dir};
# ini file?
my $ini_file = $config->{dir}.'/vep.ini';
if(-e $ini_file) {
read_config_from_file($config, $ini_file);
}
# can't be both quiet and verbose
die "ERROR: Can't be both quiet and verbose!\n" if defined($config->{quiet}) && defined($config->{verbose});
# check forking
if(defined($config->{fork})) {
die "ERROR: Fork number must be greater than 1\n" if $config->{fork} <= 1;
# check we can use MIME::Base64
eval q{ use MIME::Base64; };
if($@) {
debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet});
delete $config->{fork};
}
else {
# try a practice fork
my $pid = fork;
if(!defined($pid)) {
debug("WARNING: Fork test failed, forking disabled") unless defined($config->{quiet});
delete $config->{fork};
}
elsif($pid) {
waitpid($pid, 0);
}
elsif($pid == 0) {
exit(0);
}
}
}
# check file format
if(defined $config->{format}) {
die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /^(pileup|vcf|guess|hgvs|ensembl|id|vep)$/i;
}
# check convert format
if(defined $config->{convert}) {
die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup|hgvs/i;
# disable stats
$config->{no_stats} = 1;
}
if(defined($config->{fasta})) {
die "ERROR: Specified FASTA file/directory not found" unless -e $config->{fasta};
eval q{ use Bio::DB::Fasta; };
if($@) {
die("ERROR: Could not load required BioPerl module\n");
}
# try to overwrite sequence method in Slice
eval q{
package Bio::EnsEMBL::Slice;
# define a global variable so that we can pull in config hash
our $config;
{
# don't want a redefine warning spat out, thanks
no warnings 'redefine';
# overwrite seq method to read from FASTA DB
sub seq {
my $self = shift;
# special case for in-between (insert) coordinates
return '' if($self->start() == $self->end() + 1);
my $seq;
if(defined($config->{fasta_db})) {
$seq = $config->{fasta_db}->seq($self->seq_region_name, $self->start => $self->end);
reverse_comp(\$seq) if $self->strand < 0;
}
else {
return $self->{'seq'} if($self->{'seq'});
if($self->adaptor()) {
my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor();
return ${$seqAdaptor->fetch_by_Slice_start_end_strand($self,1,undef,1)};
}
}
# default to a string of Ns if we couldn't get sequence
$seq ||= 'N' x $self->length();
return $seq;
}
}
1;
};
if($@) {
die("ERROR: Could not redefine sequence method\n");
}
# copy to Slice for offline sequence fetching
$Bio::EnsEMBL::Slice::config = $config;
# spoof a coordinate system
$config->{coord_system} = Bio::EnsEMBL::CoordSystem->new(
-NAME => 'chromosome',
-RANK => 1,
);
debug("Checking/creating FASTA index") unless defined($config->{quiet});
$config->{fasta_db} = Bio::DB::Fasta->new($config->{fasta});
}
# check if user still using --standalone
if(defined $config->{standalone}) {
die "ERROR: --standalone replaced by --offline\n";
}
# connection settings for Ensembl Genomes
if($config->{genomes}) {
$config->{host} ||= 'mysql.ebi.ac.uk';
$config->{port} ||= 4157;
}
# connection settings for main Ensembl
else {
$config->{species} ||= "homo_sapiens";
$config->{host} ||= 'ensembldb.ensembl.org';
$config->{port} ||= 5306;
}
# refseq or core?
if(defined($config->{refseq})) {
$config->{core_type} = 'otherfeatures';
}
else {
$config->{core_type} = 'core';
}
# check one of database/cache/offline/build
if(!grep {defined($config->{$_})} qw(database cache offline build convert)) {
die qq{
IMPORTANT INFORMATION:
The VEP can read gene data from either a local cache or local/remote databases.
Using a cache is the fastest and most efficient way to use the VEP. The
included INSTALL.pl script can be used to fetch and set up cache files from the
Ensembl FTP server. Simply run "perl INSTALL.pl" and follow the instructions, or
see the documentation pages listed below.
If you have already set up a cache, use "--cache" or "--offline" to use it.
It is possible to use the public databases hosted at ensembldb.ensembl.org, but
this is slower than using the cache and concurrent and/or long running VEP jobs
can put strain on the Ensembl servers, limiting availability to other users.
To enable using databases, add the flag "--database".
Documentation
Installer: http://www.ensembl.org/info/docs/variation/vep/vep_script.html#installer
Cache: http://www.ensembl.org/info/docs/variation/vep/vep_script.html#cache
}
};
# output term
if(defined $config->{terms}) {
die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i;
if($config->{terms} =~ /ensembl|display/i) {
$config->{terms} = 'display';
}
else {
$config->{terms} = uc($config->{terms});
}
}
# everything?
if(defined($config->{everything})) {
my %everything = (
sift => 'b',
polyphen => 'b',
ccds => 1,
hgvs => 1,
hgnc => 1,
numbers => 1,
domains => 1,
regulatory => 1,
canonical => 1,
protein => 1,
biotype => 1,
gmaf => 1,
);
$config->{$_} = $everything{$_} for keys %everything;
# these ones won't work with offline
delete $config->{hgvs} if defined($config->{offline}) && !defined($config->{fasta_db});
}
# check nsSNP tools
foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) {
die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/;
#die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i;
die "ERROR: $tool functionality is now available as a VEP Plugin - see http://www.ensembl.org/info/docs/variation/vep/vep_script.html#plugins\n" if $tool eq 'Condel';
}
# force quiet if outputting to STDOUT
if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
delete $config->{verbose} if defined($config->{verbose});
$config->{quiet} = 1;
}
# individual(s) specified?
if(defined($config->{individual})) {
$config->{individual} = [split /\,/, $config->{individual}];
# force allow_non_variant
$config->{allow_non_variant} = 1;
}
# regulatory has to be on for cell_type
if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
$config->{regulatory} = 1;
$config->{cell_type} = [map {split /\,/, $_} @{$config->{cell_type}}];
}
# summarise options if verbose
if(defined $config->{verbose}) {
my $header =<<INTRO;
#----------------------------------#
# ENSEMBL VARIANT EFFECT PREDICTOR #
#----------------------------------#
version $VERSION
By Will McLaren (wm2\@ebi.ac.uk)
Configuration options:
INTRO
print $header;
my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1];
foreach my $key(sort keys %$config) {
next if ref($config->{$key}) eq 'ARRAY' && scalar @{$config->{$key}} == 0;
print $key.(' ' x (($max_length - length($key)) + 4)).(ref($config->{$key}) eq 'ARRAY' ? join "\t", @{$config->{$key}} : $config->{$key})."\n";
}
print "\n".("-" x 20)."\n\n";
}
# check custom annotations
for my $i(0..$#{$config->{custom}}) {
my $custom = $config->{custom}->[$i];
my ($filepath, $shortname, $format, $type, $coords) = split /\,/, $custom;
$type ||= 'exact';
$format ||= 'bed';
$coords ||= 0;
# check type
die "ERROR: Type $type for custom annotation file $filepath is not allowed (must be one of \"exact\", \"overlap\")\n" unless $type =~ /exact|overlap/;
# check format
die "ERROR: Format $format for custom annotation file $filepath is not allowed (must be one of \"bed\", \"vcf\", \"gtf\", \"gff\", \"bigwig\")\n" unless $format =~ /bed|vcf|gff|gtf|bigwig/;
# bigwig format
if($format eq 'bigwig') {
# check for bigWigToWig
die "ERROR: bigWigToWig does not seem to be in your path - this is required to use bigwig format custom annotations\n" unless `which bigWigToWig 2>&1` =~ /bigWigToWig$/;
}
else {
# check for tabix
die "ERROR: tabix does not seem to be in your path - this is required to use custom annotations\n" unless `which tabix 2>&1` =~ /tabix$/;
# remote files?
if($filepath =~ /tp\:\/\//) {
my $remote_test = `tabix $filepath 1:1-1 2>&1`;
if($remote_test =~ /fail/) {
die "$remote_test\nERROR: Could not find file or index file for remote annotation file $filepath\n";
}
elsif($remote_test =~ /get_local_version/) {
debug("Downloaded tabix index file for remote annotation file $filepath") unless defined($config->{quiet});
}
}
# check files exist
else {
die "ERROR: Custom annotation file $filepath not found\n" unless -e $filepath;
die "ERROR: Tabix index file $filepath\.tbi not found - perhaps you need to create it first?\n" unless -e $filepath.'.tbi';
}
}
$config->{custom}->[$i] = {
'file' => $filepath,
'name' => $shortname || 'CUSTOM'.($i + 1),
'type' => $type,
'format' => $format,
'coords' => $coords,
};
}
# check if using filter and original
die "ERROR: You must also provide output filters using --filter to use --original\n" if defined($config->{original}) && !defined($config->{filter});
# filter by consequence?
if(defined($config->{filter})) {
my %filters = map {$_ => 1} split /\,/, $config->{filter};
# add in shortcuts
foreach my $filter(keys %filters) {
my $value = 1;
if($filter =~ /^no_/) {
delete $filters{$filter};
$filter =~ s/^no_//g;
$value = 0;
$filters{$filter} = $value;
}
if(defined($FILTER_SHORTCUTS{$filter})) {
delete $filters{$filter};
$filters{$_} = $value for keys %{$FILTER_SHORTCUTS{$filter}};
}
}
$config->{filter} = \%filters;
$config->{stats}->{filter_count} = 0;
}
# set defaults
$config->{user} ||= 'anonymous';
$config->{buffer_size} ||= 5000;
$config->{chunk_size} ||= '50kb';
$config->{output_file} ||= "variant_effect_output.txt";
$config->{stats_file} ||= $config->{output_file}."_summary.html";
$config->{tmpdir} ||= '/tmp';
$config->{format} ||= 'guess';
$config->{terms} ||= 'SO';
$config->{cache_region_size} ||= 1000000;
$config->{compress} ||= 'zcat';
# can't use a whole bunch of options with most_severe
if(defined($config->{most_severe})) {
foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains summary)) {
die "ERROR: --most_severe is not compatible with --$flag\n" if defined($config->{$flag});
}
}
# can't use a whole bunch of options with summary
if(defined($config->{summary})) {
foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains most_severe)) {
die "ERROR: --summary is not compatible with --$flag\n" if defined($config->{$flag});
}
}
# frequency filtering
if(defined($config->{filter_common})) {
$config->{check_frequency} = 1;
# set defaults
$config->{freq_freq} ||= 0.01;
$config->{freq_filter} ||= 'exclude';
$config->{freq_pop} ||= '1KG_ALL';
$config->{freq_gt_lt} ||= 'gt';
}
if(defined($config->{check_frequency})) {
foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) {
die "ERROR: To use --check_frequency you must also specify flag --$flag\n" unless defined $config->{$flag};
}
# need to set check_existing
$config->{check_existing} = 1;
}
$config->{check_existing} = 1 if defined $config->{check_alleles} || defined $config->{gmaf} || defined $config->{maf_1kg};
# warn users still using whole_genome flag
if(defined($config->{whole_genome})) {
debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet});
}
$config->{whole_genome} = 1 unless defined $config->{no_whole_genome};
$config->{failed} = 0 unless defined $config->{failed};
$config->{chunk_size} =~ s/mb?/000000/i;
$config->{chunk_size} =~ s/kb?/000/i;
$config->{cache_region_size} =~ s/mb?/000000/i;
$config->{cache_region_size} =~ s/kb?/000/i;
# cluck and display executed SQL?
$Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck});
# offline needs cache, can't use HGVS
if(defined($config->{offline})) {
$config->{cache} = 1;
die("ERROR: Cannot generate HGVS coordinates in offline mode without a FASTA file (see --fasta)\n") if defined($config->{hgvs}) && !defined($config->{fasta_db});
die("ERROR: Cannot use HGVS as input in offline mode\n") if $config->{format} eq 'hgvs';
die("ERROR: Cannot use variant identifiers as input in offline mode\n") if $config->{format} eq 'id';
die("ERROR: Cannot do frequency filtering in offline mode\n") if defined($config->{check_frequency}) && $config->{freq_pop} !~ /1kg.*(all|afr|amr|asn|eur)/i;
die("ERROR: Cannot retrieve overlapping structural variants in offline mode\n") if defined($config->{check_sv});
die("ERROR: Cannot check reference sequences without a FASTA file (see --fasta)\n") if defined($config->{check_ref}) && !defined($config->{fasta_db});
}
# write_cache needs cache
$config->{cache} = 1 if defined $config->{write_cache};
# no_slice_cache, prefetch and whole_genome have to be on to use cache
if(defined($config->{cache})) {
$config->{prefetch} = 1;
$config->{no_slice_cache} = 1;
$config->{whole_genome} = 1;
$config->{strip} = 1;
}
$config->{build} = $config->{rebuild} if defined($config->{rebuild});
# force options for full build
if(defined($config->{build})) {
$config->{prefetch} = 1;
$config->{hgnc} = 1;
$config->{no_slice_cache} = 1;
$config->{cache} = 1;
$config->{strip} = 1;
$config->{write_cache} = 1;
$config->{cell_type} = [1] if defined($config->{regulatory});
}
# connect to databases
$config->{reg} = &connect_to_dbs($config);
# complete dir with species name and db_version
$config->{dir} .= '/'.(
join '/', (
defined($config->{offline}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}),
$config->{db_version} || $config->{reg}->software_version
)
);
# warn user cache directory doesn't exist
if(!-e $config->{dir}) {
# if using write_cache
if(defined($config->{write_cache})) {
debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet});
}
# want to read cache, not found
elsif(defined($config->{cache})) {
die("ERROR: Cache directory ", $config->{dir}, " not found");
}
}
if(defined($config->{cache})) {
# read cache info
if(read_cache_info($config)) {
debug("Read existing cache info") unless defined $config->{quiet};
}
}
# we configure plugins here because they can sometimes switch on the
# regulatory config option
configure_plugins($config);
# include regulatory modules if requested
if(defined($config->{regulatory})) {
# do the use statements here so that users don't have to have the
# funcgen API installed to use the rest of the script
eval q{
use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor;
use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor;
use Bio::EnsEMBL::Funcgen::MotifFeature;
use Bio::EnsEMBL::Funcgen::RegulatoryFeature;
use Bio::EnsEMBL::Funcgen::BindingMatrix;
};
if($@) {
die("ERROR: Ensembl Funcgen API must be installed to use --regulatory or plugins that deal with regulatory features\n");
}
}
# user defined custom output fields
if(defined($config->{fields})) {
$config->{fields} = [split ',', $config->{fields}];
debug("Output fields redefined (".scalar @{$config->{fields}}." defined)") unless defined($config->{quiet});
$config->{fields_redefined} = 1;
}
$config->{fields} ||= \@OUTPUT_COLS;
# suppress warnings that the FeatureAdpators spit if using no_slice_cache
Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache});
# get adaptors (don't get them in offline mode)
unless(defined($config->{offline})) {
if(defined($config->{cache}) && !defined($config->{write_cache})) {
# try and load adaptors from cache
if(!&load_dumped_adaptor_cache($config)) {
&get_adaptors($config);
&dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache});
}
# check cached adaptors match DB params
else {
my $dbc = $config->{sa}->{dbc};
my $ok = 1;
if($dbc->{_host} ne $config->{host}) {
# ens-livemirror, useastdb and ensembldb should all have identical DBs
unless(
(
$dbc->{_host} eq 'ens-livemirror'
|| $dbc->{_host} eq 'ensembldb.ensembl.org'
|| $dbc->{_host} eq 'useastdb.ensembl.org'
) && (
$config->{host} eq 'ens-livemirror'
|| $config->{host} eq 'ensembldb.ensembl.org'
|| $config->{host} eq 'useastdb.ensembl.org'
)
) {
$ok = 0;
}
unless(defined($config->{skip_db_check})) {
# but we still need to reconnect
debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet});
&get_adaptors($config);
}
}
if(!$ok) {
if(defined($config->{skip_db_check})) {
debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet});
}
else {
die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set";
}
}
}
}
else {
&get_adaptors($config);
&dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache});
}
# reg adaptors (only fetches if not retrieved from cache already)
&get_reg_adaptors($config) if defined($config->{regulatory});
}
# check cell types
if(defined($config->{cell_type}) && scalar @{$config->{cell_type}} && !defined($config->{build})) {
my $cls = '';
if(defined($config->{cache})) {
$cls = $config->{cache_cell_types};
}
else {
my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor();
$cls = join ",", map {$_->name} @{$cta->fetch_all};
}
foreach my $cl(@{$config->{cell_type}}) {
die "ERROR: cell type $cl not recognised; available cell types are:\n$cls\n" unless $cls =~ /(^|,)$cl(,|$)/;
}
}
# get terminal width for progress bars
unless(defined($config->{quiet})) {
my $width;
# module may not be installed
eval q{
use Term::ReadKey;
};
if(!$@) {
my ($w, $h);
# module may be installed, but e.g.
eval {
($w, $h) = GetTerminalSize();
};
$width = $w if defined $w;
}
$width ||= 60;
$width -= 12;
$config->{terminal_width} = $width;
}
# jump out to build cache if requested
if(defined($config->{build})) {
if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) {
die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n");
}
# get 1KG freqs
if(defined($config->{'freq_file'})) {
my ($freq_file, @file_pops) = split /\,/, $config->{'freq_file'};
debug("Loading extra frequencies from $freq_file") unless defined($config->{quiet});
open IN, $freq_file or die "ERROR: Could not open frequencies file $freq_file\n";
while(<IN>) {
chomp;
my @data = split /\t/;
my $id = shift @data;
# sanity check
die("ERROR: column count in frequency file $freq_file does not match specified populations ".(join ",", @file_pops)."\n") unless scalar @data == scalar @file_pops;
$config->{'freqs'}->{$id} = join(" ", @data);
}
close IN;
# add pops to $config
$config->{'freq_file_pops'} = \@file_pops;
}
# build the cache
debug("Building cache for ".$config->{species}) unless defined($config->{quiet});
build_full_cache($config);
# exit script
debug("Finished building cache") unless defined($config->{quiet});
exit(0);
}
# warn user DB will be used for SIFT/PolyPhen/HGVS/frequency/LRG
if(defined($config->{cache})) {
# these two def depend on DB
foreach my $param(grep {defined $config->{$_}} qw(hgvs lrg check_sv check_ref)) {
debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet}) or ($param =~ /hgvs|check_ref/ and defined($config->{fasta_db}));
}
debug("INFO: Database will be accessed when using --check_frequency with population ".$config->{freq_pop}) if !defined($config->{quiet}) and defined($config->{check_frequency}) && $config->{freq_pop} !~ /1kg.*(all|afr|amr|asn|eur)/i;
# as does using HGVS or IDs as input
debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet});
# the rest may be in the cache
foreach my $param(grep {defined $config->{$_}} qw(sift polyphen regulatory)) {
next if defined($config->{'cache_'.$param});
debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet});
}
}
# get list of chrs if supplied
if(defined($config->{chr})) {
my %chrs;
foreach my $val(split /\,/, $config->{chr}) {
my @nnn = split /\-/, $val;
foreach my $chr($nnn[0]..$nnn[-1]) {
$chrs{$chr} = 1;
}
}
$config->{chr} = \%chrs;
}
# get input file handle
$config->{in_file_handle} = &get_in_file_handle($config);
return $config;
}
# reads config from a file
sub read_config_from_file {
my $config = shift;
my $file = shift;
open CONFIG, $file or die "ERROR: Could not open config file \"$file\"\n";
while(<CONFIG>) {
next if /^\#/;
my @split = split /\s+|\=/;
my $key = shift @split;
$key =~ s/^\-//g;
if(defined($config->{$key}) && ref($config->{$key}) eq 'ARRAY') {
push @{$config->{$key}}, @split;
}
else {
$config->{$key} ||= $split[0];
}
}
close CONFIG;
# force quiet if outputting to STDOUT
if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
delete $config->{verbose} if defined($config->{verbose});
$config->{quiet} = 1;
}
debug("Read configuration from $file") unless defined($config->{quiet});
}
# configures custom VEP plugins
sub configure_plugins {
my $config = shift;
$config->{plugins} = [];
if (my @plugins = @{ $config->{plugin} }) {
# add the Plugins directory onto @INC
unshift @INC, $config->{toplevel_dir}."/Plugins";
for my $plugin (@plugins) {
# parse out the module name and parameters
my ($module, @params) = split /,/, $plugin;
# check we can use the module
eval qq{
use $module;
};
if ($@) {
debug("Failed to compile plugin $module: $@") unless defined($config->{quiet});
next;
}
# now check we can instantiate it, passing any parameters to the constructor
my $instance;
eval {
$instance = $module->new($config, @params);
};
if ($@) {
debug("Failed to instantiate plugin $module: $@") unless defined($config->{quiet});
next;
}
# check that the versions match
my $plugin_version;
if ($instance->can('version')) {
$plugin_version = $instance->version;
}
my $version_ok = 1;
if ($plugin_version) {
my ($plugin_major, $plugin_minor, $plugin_maintenance) = split /\./, $plugin_version;
my ($major, $minor, $maintenance) = split /\./, $VERSION;
if ($plugin_major != $major) {
debug("Warning: plugin $plugin version ($plugin_version) does not match the current VEP version ($VERSION)") unless defined($config->{quiet});
$version_ok = 0;
}
}
else {
debug("Warning: plugin $plugin does not define a version number") unless defined($config->{quiet});
$version_ok = 0;
}
debug("You may experience unexpected behaviour with this plugin") unless defined($config->{quiet}) || $version_ok;
# check that it implements all necessary methods
for my $required(qw(run get_header_info check_feature_type check_variant_feature_type)) {
unless ($instance->can($required)) {
debug("Plugin $module doesn't implement a required method '$required', does it inherit from BaseVepPlugin?") unless defined($config->{quiet});
next;
}
}
# all's good, so save the instance in our list of plugins
push @{ $config->{plugins} }, $instance;
debug("Loaded plugin: $module") unless defined($config->{quiet});
# for convenience, check if the plugin wants regulatory stuff and turn on the config option if so
if (grep { $_ =~ /motif|regulatory/i } @{ $instance->feature_types }) {
debug("Fetching regulatory features for plugin: $module") unless defined($config->{quiet});
$config->{regulatory} = 1;
}
}
}
}
# connects to DBs (not done in offline mode)
sub connect_to_dbs {
my $config = shift;
# get registry
my $reg = 'Bio::EnsEMBL::Registry';
unless(defined($config->{offline})) {
# load DB options from registry file if given
if(defined($config->{registry})) {
debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet});
$reg->load_all(
$config->{registry},
$config->{verbose},
undef,
$config->{no_slice_cache}
);
}
# otherwise manually connect to DB server
else {
$reg->load_registry_from_db(
-host => $config->{host},
-user => $config->{user},
-pass => $config->{password},
-port => $config->{port},
-db_version => $config->{db_version},
-species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef,
-verbose => $config->{verbose},
-no_cache => $config->{no_slice_cache},
);
}
eval { $reg->set_reconnect_when_lost() };
if(defined($config->{verbose})) {
# get a meta container adaptors to check version
my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer');
my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer');
if($core_mca && $var_mca) {
debug(
"Connected to core version ", $core_mca->get_schema_version, " database ",
"and variation version ", $var_mca->get_schema_version, " database"
);
}
}
}
return $reg;
}
# get adaptors from DB
sub get_adaptors {
my $config = shift;
die "ERROR: No registry" unless defined $config->{reg};
# try fetching a slice adaptor
eval {
$config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice');
};
if($@) {
if($@ =~ /not find internal name for species/) {
my %register = %Bio::EnsEMBL::Registry::registry_register;
my @species_list = sort keys %{$register{_SPECIES}};
debug("ERROR: Could not find database for species ".$config->{species});
if(scalar @species_list) {
debug("List of valid species for this server:\n\n".(join "\n", @species_list));
}
die("\nExiting\n");
}
else {
die $@;
}
}
# get the remaining core adaptors
$config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene');
$config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript');
$config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer');
$config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem');
$config->{tra} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'translation');
# get variation adaptors
$config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature');
$config->{svfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'structuralvariationfeature');
$config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation');
$config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix');
$config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation');
# get fake ones for species with no var DB
if(!defined($config->{vfa})) {
$config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species});
$config->{svfa} = Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor->new_fake($config->{species});
$config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species});
}
# cache schema version
$config->{mca}->get_schema_version if defined $config->{mca};
# check we got slice adaptor - can't continue without a core DB
die("ERROR: Could not connect to core database\n") unless defined $config->{sa};
}
# gets regulatory adaptors
sub get_reg_adaptors {
my $config = shift;
foreach my $type(@REG_FEAT_TYPES) {
next if defined($config->{$type.'_adaptor'});
my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type);
if(defined($adaptor)) {
$config->{$type.'_adaptor'} = $adaptor;
}
else {
delete $config->{regulatory};
last;
}
}
}
# gets file handle for input
sub get_in_file_handle {
my $config = shift;
# define the filehandle to read input from
my $in_file_handle = new FileHandle;
if(defined($config->{input_file})) {
# check defined input file exists
die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file};
if($config->{input_file} =~ /\.gz$/){
$in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
}
else {
$in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
}
}
# no file specified - try to read data off command line
else {
$in_file_handle = 'STDIN';
debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet};
}
return $in_file_handle;
}
# gets file handle for output and adds header
sub get_out_file_handle {
my $config = shift;
# define filehandle to write to
my $out_file_handle = new FileHandle;
# check if file exists
if(-e $config->{output_file} && !defined($config->{force_overwrite})) {
die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with --force_overwrite\n");
}
# do same for stats file
if(-e $config->{stats_file} && !defined($config->{force_overwrite})) {
die("ERROR: Stats file ", $config->{stats_file}, " already exists. Specify a different output file with --stats_file or overwrite existing file with --force_overwrite\n");
}
if($config->{output_file} =~ /stdout/i) {
$out_file_handle = *STDOUT;
}
else {
$out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n");
}
# get stats file handle
die("ERROR: Stats file name ", $config->{stats_file}, " doesn't end in \".htm\" or \".html\" - some browsers may not be able to open this file\n") unless $config->{stats_file} =~ /htm(l)?$/;
my $stats_file_handle = new FileHandle;
$stats_file_handle->open(">".$config->{stats_file}) or die("ERROR: Could not write to stats file ", $config->{stats_file}, "\n");
$config->{stats_file_handle} = $stats_file_handle;
# HTML output?
my $html_file_handle;
if(defined($config->{html})) {
if(-e $config->{output_file}.'.html' && !defined($config->{force_overwrite})) {
die("ERROR: Stats file ", $config->{stats_file}, " already exists. Specify a different output file with --stats_file or overwrite existing file with --force_overwrite\n");
}
$html_file_handle = new FileHandle;
$html_file_handle->open(">".$config->{output_file}.'.html') or die("ERROR: Could not write to HTML file ", $config->{output_file}, ".html\n");
$config->{html_file_handle} = $html_file_handle;
print $html_file_handle html_head();
}
# define headers for a VCF file
my @vcf_headers = (
'#CHROM',
'POS',
'ID',
'REF',
'ALT',
'QUAL',
'FILTER',
'INFO'
);
# file conversion, don't want to add normal headers
if(defined($config->{convert})) {
# header for VCF
if($config->{convert} =~ /vcf/i) {
print $out_file_handle "##fileformat=VCFv4.0\n";
print $out_file_handle join "\t", @vcf_headers;
print $out_file_handle "\n";
}
return $out_file_handle;
}
# GVF output, no header
elsif(defined($config->{gvf}) || defined($config->{original})) {
if(defined($config->{headers}) && defined($config->{original})) {
print $out_file_handle join "\n", @{$config->{headers}};
print $html_file_handle join("\n", @{$config->{headers}})."\n</pre>" if defined($config->{html});
}
return $out_file_handle;
}
elsif(defined($config->{vcf})) {
# create an info string for the VCF header
my @new_headers;
# if the user has defined the fields themselves, we don't need to worry
if(defined $config->{fields_redefined}) {
@new_headers = @{$config->{fields}};
}
else {
@new_headers = (
# get default headers, minus variation name and location (already encoded in VCF)
grep {
$_ ne 'Uploaded_variation' and
$_ ne 'Location' and
$_ ne 'Extra'
} @{$config->{fields}},
# get extra headers
map {@{$extra_headers{$_}}}
grep {defined $config->{$_}}
keys %extra_headers
);
# plugin headers
foreach my $plugin_header(split /\n/, get_plugin_headers($config)) {
$plugin_header =~ /\#\# (.+?)\t\:.+/;
push @new_headers, $1;
}
# redefine the main headers list in config
$config->{fields} = \@new_headers;
}
# add the newly defined headers as a header to the VCF
my $string = join '|', @{$config->{fields}};
my @vcf_info_strings = ('##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: '.$string.'">');
# add custom headers
foreach my $custom(@{$config->{custom}}) {
push @vcf_info_strings, '##INFO=<ID='.$custom->{name}.',Number=.,Type=String,Description="'.$custom->{file}.' ('.$custom->{type}.')">';
}
# if this is already a VCF file, we need to add our new headers in the right place
if(defined($config->{headers})) {
for my $i(0..$#{$config->{headers}}) {
if($config->{headers}->[$i] =~ /^\#CHROM\s+POS\s+ID/) {
splice(@{$config->{headers}}, $i, 0, @vcf_info_strings);
}
}
print $out_file_handle join "\n", @{$config->{headers}};
print $out_file_handle "\n";
if(defined($config->{html})) {
my @tmp = @{$config->{headers}};
my @cols = split /\s+/, pop @tmp;
print $html_file_handle join "\n", @tmp;
print $html_file_handle "\n";
print $html_file_handle html_table_headers($config, \@cols);
}
}
else {
print $out_file_handle "##fileformat=VCFv4.0\n";
print $out_file_handle join "\n", @vcf_info_strings;
print $out_file_handle "\n";
print $out_file_handle join "\t", @vcf_headers;
print $out_file_handle "\n";
if(defined($config->{html})) {
print $html_file_handle "##fileformat=VCFv4.0\n";
print $html_file_handle join "\n", @vcf_info_strings;
print $html_file_handle "\n";
print $html_file_handle html_table_headers($config, \@vcf_headers);
}
}
return $out_file_handle;
}
# make header
my $time = &get_time;
my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca};
$db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache});
my $version_string =
"Using API version ".$config->{reg}->software_version.
", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?');
# sift/polyphen versions
foreach my $tool(qw(sift polyphen)) {
if(defined($config->{$tool})) {
my $string = 'config_'.$tool.'_version';
if(!defined($config->{$string}) && !defined($config->{offline})) {
my $var_mca = $config->{reg}->get_adaptor($config->{species}, 'variation', 'metacontainer');
my $values = $var_mca->list_value_by_key($tool.'_version') if defined($var_mca);
$config->{$string} = $values->[0] if scalar @$values;
}
$version_string .= "\n## $tool version ".$config->{$string} if defined($config->{$string});
}
}
# add key for extra column headers based on config
my $extra_column_keys = join "\n",
map {'## '.$_.' : '.$extra_descs{$_}}
sort map {@{$extra_headers{$_}}}
grep {defined $config->{$_}}
keys %extra_headers;
my $header =<<HEAD;
## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION
## Output produced at $time
## Connected to $db_string
## $version_string
## Extra column keys:
$extra_column_keys
HEAD
$header .= get_plugin_headers($config);
# add headers
print $out_file_handle $header;
print $html_file_handle $header if defined($config->{html});
# add custom data defs
if(defined($config->{custom})) {
foreach my $custom(@{$config->{custom}}) {
print $out_file_handle '## '.$custom->{name}."\t: ".$custom->{file}.' ('.$custom->{type}.")\n";
print $html_file_handle '## '.$custom->{name}."\t: ".$custom->{file}.' ('.$custom->{type}.")\n" if defined($config->{html});
}
}
# add column headers
print $out_file_handle '#', (join "\t", @{$config->{fields}});
print $out_file_handle "\n";
if(defined($config->{html})) {
print $html_file_handle html_table_headers($config, $config->{fields});
}
return $out_file_handle;
}
sub get_plugin_headers {
my $config = shift;
my $header = "";
for my $plugin (@{ $config->{plugins} }) {
if (my $hdr = $plugin->get_header_info) {
for my $key (keys %$hdr) {
my $val = $hdr->{$key};
$header .= "## $key\t: $val\n";
}
}
}
return $header;
}
# convert a variation feature to a line of output
sub convert_vf {
my $config = shift;
my $vf = shift;
my $convert_method = 'convert_to_'.lc($config->{convert});
my $method_ref = \&$convert_method;
my $line = &$method_ref($config, $vf);
my $handle = $config->{out_file_handle};
if(scalar @$line) {
print $handle join "\t", @$line;
print $handle "\n";
}
}
# converts to Ensembl format
sub convert_to_ensembl {
my $config = shift;
my $vf = shift;
return [
$vf->{chr} || $vf->seq_region_name,
$vf->start,
$vf->end,
$vf->allele_string,
$vf->strand,
$vf->variation_name
];
}
# converts to pileup format
sub convert_to_pileup {
my $config = shift;
my $vf = shift;
# look for imbalance in the allele string
my %allele_lengths;
my @alleles = split /\//, $vf->allele_string;
foreach my $allele(@alleles) {
$allele =~ s/\-//g;
$allele_lengths{length($allele)} = 1;
}
# in/del
if(scalar keys %allele_lengths > 1) {
if($vf->allele_string =~ /\-/) {
# insertion?
if($alleles[0] eq '-') {
shift @alleles;
for my $i(0..$#alleles) {
$alleles[$i] =~ s/\-//g;
$alleles[$i] = '+'.$alleles[$i];
}
}
else {
@alleles = grep {$_ ne '-'} @alleles;
for my $i(0..$#alleles) {
$alleles[$i] =~ s/\-//g;
$alleles[$i] = '-'.$alleles[$i];
}
}
@alleles = grep {$_ ne '-' && $_ ne '+'} @alleles;
return [
$vf->{chr} || $vf->seq_region_name,
$vf->start - 1,
'*',
(join "/", @alleles),
];
}
else {
warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet});
return [];
}
}
# balanced sub
else {
return [
$vf->{chr} || $vf->seq_region_name,
$vf->start,
shift @alleles,
(join "/", @alleles),
];
}
}
# converts to HGVS (hackily returns many lines)
sub convert_to_hgvs {
my $config = shift;
my $vf = shift;
# ensure we have a slice
$vf->{slice} ||= get_slice($config, $vf->{chr});
my $tvs = $vf->get_all_TranscriptVariations;
my @return = values %{$vf->get_all_hgvs_notations()};
if(defined($tvs)) {
push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'c')}} @$tvs;
push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'p')}} @$tvs;
}
return [join "\n", @return];
}
# prints a line of output from the hash
sub print_line {
my $config = shift;
my $line = shift;
return unless defined($line);
my $output;
my $html_fh = $config->{html_file_handle};
# normal
if(ref($line) eq 'HASH') {
my %extra = %{$line->{Extra}};
$line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} };
# if the fields have been redefined we need to search through in case
# any of the defined fields are actually part of the Extra hash
$output = join "\t", map {
(defined $line->{$_} ? $line->{$_} : (defined $extra{$_} ? $extra{$_} : '-'))
} @{$config->{fields}};
if(defined($config->{html})) {
print $html_fh Tr(
map {td($_)}
map {linkify($config, $_)}
map {
(defined $line->{$_} ? $line->{$_} : (defined $extra{$_} ? $extra{$_} : '-'))
}
@{$config->{fields}}
);
}
}
# gvf/vcf
else {
$output = $$line;
}
my $fh = $config->{out_file_handle};
print $fh "$output\n";
}
sub summarise_stats {
my $config = shift;
# convert gene and transcript hashes to counts
for my $type(qw(genes transcripts regfeats)) {
$config->{stats}->{$type} = scalar keys %{$config->{stats}->{$type}} if defined $config->{stats}->{$type};
}
# tot up chromosome counts
foreach my $chr(keys %{$config->{stats}->{chr}}) {
$config->{stats}->{chr_totals}->{$chr} += $config->{stats}->{chr}->{$chr}->{$_} for keys %{$config->{stats}->{chr}->{$chr}};
my $start = 0;
my %tmp;
while($start <= $config->{chr_lengths}->{$chr}) {
$tmp{$start / 1e6} = $config->{stats}->{chr}->{$chr}->{$start} || 0;
$start += 1e6;
}
$config->{stats}->{chr}->{$chr} = \%tmp;
}
# convert allele changes to Ts/Tv
map {$config->{stats}->{ts_tv}->{$ts_tv{$_}} += $config->{stats}->{allele_changes}->{$_}} keys %{$config->{stats}->{allele_changes}} if defined($config->{stats}->{allele_changes});
# flesh out protein_pos
if(defined($config->{stats}->{protein_pos})) {
if(defined($config->{stats}->{protein_pos}->{10})) {
$config->{stats}->{protein_pos}->{9} += $config->{stats}->{protein_pos}->{10};
delete $config->{stats}->{protein_pos}->{10};
}
$config->{stats}->{protein_pos}->{$_} ||= 0 for (0..9);
my %tmp = map {$_.'0-'.($_+1).'0%' => $config->{stats}->{protein_pos}->{$_}} keys %{$config->{stats}->{protein_pos}};
$config->{stats}->{protein_pos} = \%tmp;
}
# coding cons
foreach my $con(qw(missense_variant synonymous_variant coding_sequence_variant stop_lost stop_gained frameshift_variant inframe_insertion inframe_deletion)) {
$config->{stats}->{coding}->{$con} = $config->{stats}->{consequences}->{$con} if defined($config->{stats}->{consequences}->{$con});
}
# get ranks to sort
my %cons_ranks = map { $_->{SO_term} => $_->{rank} } values %Bio::EnsEMBL::Variation::Utils::Constants::OVERLAP_CONSEQUENCES;
# create pie chart hashes
my @charts = (
{
id => 'var_class',
title => 'Variant classes',
header => ['Variant class', 'Count'],
data => $config->{stats}->{classes},
type => 'pie',
sort => 'value',
height => 200,
},
{
id => 'consequences',
title => 'Variant consequences',
header => ['Consequence type', 'Count'],
data => $config->{stats}->{consequences},
type => 'pie',
sort => \%cons_ranks,
colours => $colour_keys{consequences},
},
{
id => 'coding',
title => 'Coding consequences',
header => ['Consequence type', 'Count'],
data => $config->{stats}->{coding},
type => 'pie',
sort => \%cons_ranks,
colours => $colour_keys{consequences},
}
);
foreach my $tool(qw(SIFT PolyPhen)) {
my $lc_tool = lc($tool);
push @charts, {
id => $lc_tool,
title => $tool.' summary',
header => ['Prediction', 'Count'],
data => $config->{stats}->{$tool},
type => 'pie',
height => 200,
sort => 'value',
colours => $colour_keys{$lc_tool},
} if defined($config->{$lc_tool});
}
push @charts, {
id => 'chr',
title => 'Variants by chromosome',
header => ['Chromosome','Count'],
data => $config->{stats}->{chr_totals},
sort => 'chr',
type => 'bar',
options => '{legend: {position: "none"}}',
};
foreach my $chr(sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %{$config->{stats}->{chr}}) {
push @charts, {
id => 'chr_'.$chr,
title => 'Distribution of variants on chromosome '.$chr,
header => ['Position (mb)', 'Count'],
data => $config->{stats}->{chr}->{$chr},
sort => 'chr',
type => 'line',
options => '{hAxis: {title: "Position (mb)"}, legend: {position: "none"}}',
no_table => 1,
no_link => 1,
};
}
push @charts, {
id => 'protein',
title => 'Position in protein',
header => ['Position in protein (percentile)','Count'],
data => $config->{stats}->{protein_pos},
sort => 'chr',
type => 'bar',
no_table => 1,
options => '{hAxis: {title: "Position in protein (percentile)", textStyle: {fontSize: 10}}, legend: {position: "none"}}',
};
my $fh = $config->{stats_file_handle};
print $fh stats_html_head($config, \@charts);
# create menu
print $fh div(
{class => 'sidemenu'},
div(
{class => 'sidemenu_head'},
"Links"
),
div(
{class => 'sidemenu_body'},
ul(
li([
a({href => '#masthead'}, "Top of page"),
a({href => '#run_stats'}, "VEP run statistics"),
a({href => '#gen_stats'}, "General statistics"),
map {
a({href => '#'.$_->{id}}, $_->{title})
} grep { !$_->{no_link} } @charts,
])
),
)
);
print $fh "<div class='main_content'>";
# run stats
print $fh h3({id => 'run_stats'}, "VEP run statistics");
my @rows = (
['VEP version (API)', $VERSION.' ('.$config->{reg}->software_version.')'],
['Cache/Database', ($config->{cache} ? $config->{dir} : ($config->{mca} ? $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host : '?'))],
['Species', $config->{species}],
['Command line options', pre(join(" ", @{$config->{stats}->{options}}))],
['Start time', $config->{stats}->{start_time}],
['End time', $config->{stats}->{end_time}],
['Run time', $config->{stats}->{run_time}." seconds"],
['Input file (format)', $config->{input_file}.' ('.uc($config->{format}).')'],
[
'Output file',
$config->{output_file}.
(defined($config->{html}) ? ' '.a({href => $config->{output_file}.'.html'}, '[HTML]') : '').
' '.a({href => $config->{output_file}}, '[text]')
],
);
print $fh table({class => 'stats_table'}, Tr([map {td($_)} @rows]));
# vars in/out stats
print $fh h3({id => 'gen_stats'}, "General statistics");
@rows = (
['Lines of input read', $config->{line_number}],
['Variants processed', $config->{stats}->{var_count}],
['Variants remaining after filtering', $config->{stats}->{filter_count}],
[
'Novel / known variants',
defined($config->{stats}->{existing}) ?
sprintf("%s (%.1f\%) / %s (%.1f\%)",
$config->{stats}->{var_count} - $config->{stats}->{existing},
100 * (($config->{stats}->{var_count} - $config->{stats}->{existing}) / $config->{stats}->{var_count}),
$config->{stats}->{existing},
100 * ($config->{stats}->{existing} / $config->{stats}->{var_count}),
)
: '-'
],
['Overlapped genes', $config->{stats}->{genes}],
['Overlapped transcripts', $config->{stats}->{transcripts}],
['Overlapped regulatory features', $config->{stats}->{regfeats} || '-'],
);
print $fh table({class => 'stats_table'}, Tr([map {td($_)} @rows]));
foreach my $chart(@charts) {
my $height = $chart->{height} || ($chart->{type} eq 'pie' ? '400' : '200');
print $fh hr();
print $fh h3({id => $chart->{id}}, $chart->{title});
print $fh div({id => $chart->{id}."_".$chart->{type}, style => 'width: 800px; height: '.$height.'px'}, ' ');
print $fh div({id => $chart->{id}."_table", style => 'width: 800px; height: 200px'}, ' ') unless $chart->{no_table};
}
print $fh '</div>';
print $fh stats_html_tail();
$config->{stats_file_handle}->close;
}
sub stats_html_head {
my $config = shift;
my $charts = shift;
my ($js);
foreach my $chart(@$charts) {
my @keys;
# sort data
if(defined($chart->{sort})) {
if($chart->{sort} eq 'chr') {
@keys = sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %{$chart->{data}};
}
elsif($chart->{sort} eq 'value') {
@keys = sort {$chart->{data}->{$a} <=> $chart->{data}->{$b}} keys %{$chart->{data}};
}
elsif(ref($chart->{sort}) eq 'HASH') {
@keys = sort {$chart->{sort}->{$a} <=> $chart->{sort}->{$b}} keys %{$chart->{data}};
}
}
else {
@keys = keys %{$chart->{data}};
}
my $type = ucfirst($chart->{type});
# add colour
if(defined($chart->{colours})) {
my $co = 'slices: ['.join(", ", map { $chart->{colours}->{$_} ? '{color: "'.$chart->{colours}->{$_}.'"}' : '{}' } @keys).']';
if(defined($chart->{options})) {
$chart->{options} =~ s/}$/, $co}/;
}
else {
$chart->{options} = "{$co}";
}
}
# code to draw chart
$js .= sprintf(
"var %s = draw$type('%s', '%s', google.visualization.arrayToDataTable([['%s','%s'],%s]), %s);\n",
$chart->{id}.'_'.$chart->{type},
$chart->{id}.'_'.$chart->{type},
$chart->{title},
$chart->{header}->[0], $chart->{header}->[1],
join(",", map {"['".$_."',".$chart->{data}->{$_}."]"} @keys),
$chart->{options} || 'null',
);
unless($chart->{no_table}) {
# code to draw table
$js .= sprintf(
"var %s = drawTable('%s', '%s', google.visualization.arrayToDataTable([['%s','%s'],%s]));\n",
$chart->{id}.'_table',
$chart->{id}.'_table',
$chart->{title},
$chart->{header}->[0], $chart->{header}->[1],
join(",", map {"['".$_."',".$chart->{data}->{$_}."]"} @keys)
);
# interaction between table/chart
$js .= sprintf(
qq{
google.visualization.events.addListener(%s, 'select', function() {
%s.setSelection(%s.getSelection());
});
google.visualization.events.addListener(%s, 'select', function() {
%s.setSelection(%s.getSelection());
});
},
$chart->{id}.'_'.$chart->{type},
$chart->{id}.'_table',
$chart->{id}.'_'.$chart->{type},
$chart->{id}.'_table',
$chart->{id}.'_'.$chart->{type},
$chart->{id}.'_table',
);
}
}
my $html =<<SHTML;
<html>
<head>
<title>VEP summary</title>
<script type="text/javascript" src="http://www.google.com/jsapi"></script>
<script type="text/javascript">
google.load('visualization', '1', {packages: ['corechart','table']});
</script>
<script type="text/javascript">
function init() {
// charts
$js
}
function drawPie(id, title, data, options) {
var pie = new google.visualization.PieChart(document.getElementById(id));
pie.draw(data, options);
return pie;
}
function drawBar(id, title, data, options) {
var bar = new google.visualization.ColumnChart(document.getElementById(id));
bar.draw(data, options);
return bar;
}
function drawTable(id, title, data) {
var table = new google.visualization.Table(document.getElementById(id));
table.draw(data, null);
return table;
}
function drawLine(id, title, data, options) {
var line = new google.visualization.LineChart(document.getElementById(id));
line.draw(data, options);
return line;
}
google.setOnLoadCallback(init);
</script>
<style type="text/css">
body {
font-family: arial, sans-serif;
margin: 0px;
padding: 0px;
}
a {color: #36b;}
a.visited {color: #006;}
.stats_table {
margin: 5px;
}
tr:nth-child(odd) {
background-color: rgb(238, 238, 238);
}
td {
padding: 5px;
}
td:nth-child(odd) {
font-weight: bold;
}
h3 {
color: #666;
}
.masthead {
background-color: rgb(51, 51, 102);
color: rgb(204, 221, 255);
height: 80px;
width: 100%;
padding: 0px;
}
.main {
padding: 10px;
}
.gradient {
background: #333366; /* Old browsers */
background: -moz-linear-gradient(left, #333366 0%, #ffffff 100%); /* FF3.6+ */
background: -webkit-gradient(linear, left top, right top, color-stop(0%,#333366), color-stop(100%,#ffffff)); /* Chrome,Safari4+ */
background: -webkit-linear-gradient(left, #333366 0%,#ffffff 100%); /* Chrome10+,Safari5.1+ */
background: -o-linear-gradient(left, #333366 0%,#ffffff 100%); /* Opera 11.10+ */
background: -ms-linear-gradient(left, #333366 0%,#ffffff 100%); /* IE10+ */
background: linear-gradient(to right, #333366 0%,#ffffff 100%); /* W3C */
filter: progid:DXImageTransform.Microsoft.gradient( startColorstr='#333366', endColorstr='#ffffff',GradientType=1 ); /* IE6-9 */
padding: 0px;
height: 80px;
width: 500px;
float: right;
display: inline;
}
.main_content {
margin-left: 300px;
}
.sidemenu {
width: 260px;
position: fixed;
border-style: solid;
border-width: 2px;
border-color: rgb(51, 51, 102);
}
.sidemenu_head {
width: 250px;
background-color: rgb(51, 51, 102);
color: rgb(204, 221, 255);
padding: 5px;
}
.sidemenu_body {
width: 250px;
padding: 5px;
}
</style>
</head>
<body>
<div id="masthead" class="masthead">
<div style="float: left; display: inline; padding: 10px; height: 80px;">
<a href="http://www.ensembl.org/"><img src="http://static.ensembl.org/i/e-ensembl.png"></a>
</div>
<div style="float: right; display: inline; height: 80px; background: white; padding: 10px;">
<a href="http://www.ensembl.org/info/docs/variation/vep/vep_script.html"><img src="http://www.ensembl.org/img/vep_logo.png"></a>
</div>
<div class="gradient">
</div>
</div>
<div class="main">
SHTML
return $html;
}
sub stats_html_tail {
return "\n</div></body>\n</html>\n";
}
sub html_head {
my $txt_file = $config->{output_file};
my $stats_file = $config->{stats_file};
my $html =<<HTML;
<html>
<head>
<title>VEP output</title>
<script type="text/javascript" language="javascript" src="http://www.datatables.net/media/javascript/complete.min.js"></script>
<script class="jsbin" src="http://datatables.net/download/build/jquery.dataTables.nightly.js"></script>
<script type="text/javascript" language="javascript">
\$(document).ready(function() {
\$('#data').dataTable({
"sPaginationType": "full_numbers"
});
});
function fnShowHide( iCol ) {
/* Get the DataTables object again - this is not a recreation, just a get of the object */
var oTable = \$('#data').dataTable();
var bVis = oTable.fnSettings().aoColumns[iCol].bVisible;
oTable.fnSetColumnVis( iCol, bVis ? false : true );
}
function showAllCols() {
var oTable = \$('#data').dataTable();
for (var i=0;i<oTable.fnSettings().aoColumns.length;i++) {
oTable.fnSetColumnVis(i, true);
}
}
function showHide(lyr) {
var lyrobj = document.getElementById(lyr);
if(lyrobj.style.height == "0px") {
lyrobj.style.height = "";
lyrobj.style.display = "";
}
else {
lyrobj.style.height = "0px";
lyrobj.style.display = "none";
}
}
</script>
<style type="text/css">
\@import "http://www.datatables.net/release-datatables/media/css/demo_table.css";
body {
font-family: arial, sans-serif;
margin: 0px;
padding: 0px;
}
a {color: #36b;}
a.visited {color: #006;}
th {
font-size: 11px;
}
td {
font-size: 11px;
}
.masthead {
background-color: rgb(51, 51, 102);
color: rgb(204, 221, 255);
height: 80px;
width: 100%;
padding: 0px;
}
.main {
padding: 10px;
}
.gradient {
background: #333366; /* Old browsers */
background: -moz-linear-gradient(left, #333366 0%, #ffffff 100%); /* FF3.6+ */
background: -webkit-gradient(linear, left top, right top, color-stop(0%,#333366), color-stop(100%,#ffffff)); /* Chrome,Safari4+ */
background: -webkit-linear-gradient(left, #333366 0%,#ffffff 100%); /* Chrome10+,Safari5.1+ */
background: -o-linear-gradient(left, #333366 0%,#ffffff 100%); /* Opera 11.10+ */
background: -ms-linear-gradient(left, #333366 0%,#ffffff 100%); /* IE10+ */
background: linear-gradient(to right, #333366 0%,#ffffff 100%); /* W3C */
filter: progid:DXImageTransform.Microsoft.gradient( startColorstr='#333366', endColorstr='#ffffff',GradientType=1 ); /* IE6-9 */
padding: 0px;
height: 80px;
width: 500px;
float: right;
display: inline;
}
</style>
</head>
<body>
<div id="masthead" class="masthead">
<div style="float: left; display: inline; padding: 10px; height: 80px;">
<a href="http://www.ensembl.org/"><img src="http://static.ensembl.org/i/e-ensembl.png"></a>
</div>
<div style="float: right; display: inline; height: 80px; background: white; padding: 10px;">
<a href="http://www.ensembl.org/info/docs/variation/vep/vep_script.html"><img src="http://www.ensembl.org/img/vep_logo.png"></a>
</div>
<div class="gradient">
</div>
</div>
<div class="main">
<p>
View: <a href="$stats_file">Summary statistics</a> |
<a href="$txt_file">as text</a> |
<a href="javascript:void();" onclick="showHide('header')">Show/hide header</a> |
<a href="javascript:void();" onclick="showAllCols()">Restore columns</a>
</p>
<hr/>
<pre id="header" style="height:0px; display: none;">
HTML
return $html;
}
sub html_table_headers {
my $config = shift;
my $cols = shift;
my @cols_copy = @$cols;
my $html = qq{</pre><table id="data" class="display"><thead><tr>};
$config->{_th} = join("", map {
$cols_copy[$_] =~ s/\_/ /g;
'<th>'.$cols_copy[$_].' '.a({href => 'javascript:void();', onclick => "fnShowHide($_);"}, img({src => 'http://www.ensembl.org/i/16/cross.png', height => 6, width => 6, style => 'border: 1px solid gray; padding: 1px;'})).'</th>'
} (0..$#cols_copy));
$html .= $config->{_th};
$html .= qq{</thead></tr><tbody>};
return $html;
}
sub linkify {
my $config = shift;
my $string = shift;
my $species = ucfirst($config->{species});
# Ensembl genes
$string =~ s/(ENS.{0,3}G\d+|CCDS\d+\.?\d+?|N[MP]_\d+\.?\d+?)/a({href => "http:\/\/www.ensembl.org\/$species\/Gene\/Summary\?g=$1", target => "_blank"}, $1)/ge;
# Ensembl transcripts
$string =~ s/(ENS.{0,3}T\d+)/a({href => "http:\/\/www.ensembl.org\/$species\/Transcript\/Summary\?t=$1", target => "_blank"}, $1)/ge;
# Ensembl regfeats
$string =~ s/(ENS.{0,3}R\d+)/a({href => "http:\/\/www.ensembl.org\/$species\/Regulation\/Summary\?rf=$1", target => "_blank"}, $1)/ge;
# variant identifiers
$string =~ s/(rs\d+|COSM\d+|C[DMIX]\d+)/a({href => "http:\/\/www.ensembl.org\/$species\/Variation\/Summary\?v=$1", target => "_blank"}, $1)/gie;
# locations
while($string =~ m/(^[A-Z\_\d]+?:[1-9]\d+)(\-\d+)?/g) {
my $loc = $1.($2 ? $2 : '');
my ($chr, $start, $end) = split /\-|\:/, $loc;
$end ||= $start;
# adjust +/- 1kb
$start -= 1000;
$end += 1000;
my $link = a({href => "http://www.ensembl.org/$species/Location/View?r=$chr:$start\-$end", target => "_blank"}, $string);
$string =~ s/$loc/$link/;
}
# split strings
$string =~ s/([,;])/$1 /g;
return $string;
}
# outputs usage message
sub usage {
my $usage =<<END;
#----------------------------------#
# ENSEMBL VARIANT EFFECT PREDICTOR #
#----------------------------------#
version $VERSION
By Will McLaren (wm2\@ebi.ac.uk)
http://www.ensembl.org/info/docs/variation/vep/vep_script.html
Usage:
perl variant_effect_predictor.pl [arguments]
Options
=======
--help Display this message and quit
--verbose Display verbose output as the script runs [default: off]
--quiet Suppress status and warning messages [default: off]
--no_progress Suppress progress bars [default: off]
--config Load configuration from file. Any command line options
specified overwrite those in the file [default: off]
--everything Shortcut switch to turn on commonly used options. See web
documentation for details [default: off]
--fork [num_forks] Use forking to improve script runtime [default: off]
-i | --input_file Input file - if not specified, reads from STDIN. Files
may be gzip compressed.
--format Specify input file format - one of "ensembl", "pileup",
"vcf", "hgvs", "id" or "guess" to try and work out format.
-o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this
will force --quiet [default: "variant_effect_output.txt"]
--force_overwrite Force overwriting of output file [default: quit if file
exists]
--original Writes output as it was in input - must be used with --filter
since no consequence data is added [default: off]
--vcf Write output as VCF [default: off]
--gvf Write output as GVF [default: off]
--html Write output also as HTML (filename: [output_file].html)
--stats_file Specify stats summary file [default: [output_file]_summary.html]
--no_stats Don't write stats summary file
--fields [field list] Define a custom output format by specifying a comma-separated
list of field names. Field names normally present in the
"Extra" field may also be specified, including those added by
plugin modules. Can also be used to configure VCF output
columns [default: off]
--species [species] Species to use [default: "human"]
-t | --terms Type of consequence terms to output - one of "SO", "ensembl"
[default: SO]
--sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off]
--polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off]
NB: SIFT predictions are only available for some species, PolyPhen for human only
NB: Condel support has been moved to a VEP plugin module - see documentation
--regulatory Look for overlaps with regulatory regions. The script can
also call if a variant falls in a high information position
within a transcription factor binding site. Output lines have
a Feature type of RegulatoryFeature or MotifFeature
[default: off]
--cell_type [types] Report only regulatory regions that are found in the given cell
type(s). Can be a single cell type or a comma-separated list.
The functional type in each cell type is reported under
CELL_TYPE in the output. To retrieve a list of cell types, use
"--cell_type list" [default: off]
NB: Regulatory consequences are currently available for human and mouse only
--custom [file list] Add custom annotations from tabix-indexed files. See
documentation for full details [default: off]
--plugin [plugin_name] Use named plugin module [default: off]
--hgnc Add HGNC gene identifiers to output [default: off]
--hgvs Output HGVS identifiers (coding and protein). Requires database
connection or --fasta [default: off]
--ccds Output CCDS transcript identifiers [default: off]
--xref_refseq Output aligned RefSeq mRNA identifier for transcript. NB: the
RefSeq and Ensembl transcripts aligned in this way MAY NOT, AND
FREQUENTLY WILL NOT, match exactly in sequence, exon structure
and protein product [default: off]
--protein Output Ensembl protein identifer [default: off]
--biotype Output transcript biotype [default: off]
--canonical Indicate if the transcript for this consequence is the canonical
transcript for this gene [default: off]
--domains Include details of any overlapping protein domains [default: off]
--numbers Include exon & intron numbers [default: off]
--no_intergenic Excludes intergenic consequences from the output [default: off]
--coding_only Only return consequences that fall in the coding region of
transcripts [default: off]
--most_severe Ouptut only the most severe consequence per variation.
Transcript-specific columns will be left blank. [default: off]
--summary Output only a comma-separated list of all consequences per
variation. Transcript-specific columns will be left blank.
[default: off]
--per_gene Output only the most severe consequence per gene. Where more
than one transcript has the same consequence, the transcript
chosen is arbitrary. [default: off]
--check_ref If specified, checks supplied reference allele against stored
entry in Ensembl Core database [default: off]
--check_existing If specified, checks for existing co-located variations in the
Ensembl Variation database [default: off]
--failed [0|1] Include (1) or exclude (0) variants that have been flagged as
failed by Ensembl when checking for existing variants.
[default: exclude]
--check_alleles If specified, the alleles of existing co-located variations
are compared to the input; an existing variation will only
be reported if no novel allele is in the input (strand is
accounted for) [default: off]
--check_svs Report overlapping structural variants [default: off]
--filter [filters] Filter output by consequence type. Use this to output only
variants that have at least one consequence type matching the
filter. Multiple filters can be used separated by ",". By
combining this with --original it is possible to run the VEP
iteratively to progressively filter a set of variants. See
documentation for full details [default: off]
--filter_common Shortcut flag for the filters below - this will exclude
variants that have a co-located existing variant with global
MAF > 0.01 (1%). May be modified using any of the following
freq_* filters. For human, this can be used in offline mode
for the following populations: 1KG_ALL, 1KG_AFR, 1KG_AMR,
1KG_ASN, 1KG_EUR
--check_frequency Turns on frequency filtering. Use this to include or exclude
variants based on the frequency of co-located existing
variants in the Ensembl Variation database. You must also
specify all of the following --freq flags [default: off]
--freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap,
1kg_yri for YRI 1000 genomes. See documentation for more
details
--freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5
--freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or
less than (lt) --freq_freq
--freq_filter Specify whether variants that pass the above should be included
[exclude|include] or excluded from analysis
--gmaf Include global MAF of existing variant from 1000 Genomes
Phase 1 in output
--maf_1kg Include MAF from continental populations (AFR,AMR,ASN,EUR) of
1000 Genomes Phase 1 in output
--individual [id] Consider only alternate alleles present in the genotypes of the
specified individual(s). May be a single individual, a comma-
separated list or "all" to assess all individuals separately.
Each individual and variant combination is given on a separate
line of output. Only works with VCF files containing individual
genotype data; individual IDs are taken from column headers.
--allow_non_variant Prints out non-variant lines when using VCF input
--phased Force VCF individual genotypes to be interpreted as phased.
For use with plugins that depend on phased state.
--chr [list] Select a subset of chromosomes to analyse from your file. Any
data not on this chromosome in the input will be skipped. The
list can be comma separated, with "-" characters representing
a range e.g. 1-5,8,15,X [default: off]
--gp If specified, tries to read GRCh37 position from GP field in the
INFO column of a VCF file. Only applies when VCF is the input
format and human is the species [default: off]
--convert Convert the input file to the output format specified.
[ensembl|vcf|pileup] Converted output is written to the file specified in
--output_file. No consequence calculation is carried out when
doing file conversion. [default: off]
--cache Enables read-only use of cache [default: off]
--dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"]
--fasta [file|dir] Specify a FASTA file or a directory containing FASTA files
to use to look up reference sequence. The first time you
run the script with this parameter an index will be built
which can take a few minutes. This is required if
fetching HGVS annotations (--hgvs) or checking reference
sequences (--check_ref) in offline mode (--offline), and
optional with some performance increase in cache mode
(--cache). See documentation for more details
--refseq Use the otherfeatures database to retrieve transcripts - this
database contains RefSeq transcripts (as well as CCDS and
Ensembl EST alignments) [default: off]
--database Enable using databases [default: off]
--host Manually define database host [default: "ensembldb.ensembl.org"]
-u | --user Database username [default: "anonymous"]
--port Database port [default: 5306]
--password Database password [default: no password]
--genomes Sets DB connection params for Ensembl Genomes [default: off]
--registry Registry file to use defines DB connections [default: off]
Defining a registry file overrides above connection settings.
--db_version=[number] Force script to load DBs from a specific Ensembl version. Not
advised due to likely incompatibilities between API and DB
--write_cache Enable writing to cache [default: off]
--build [all|list] Build a complete cache for the selected species. Build for all
chromosomes with --build all, or a list of chromosomes (see
--chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS
VIOLATES OUR FAIR USAGE POLICY [default: off]
--compress Specify utility to decompress cache files - may be "gzcat" or
"gzip -dc" Only use if default does not work [default: zcat]
--skip_db_check ADVANCED! Force the script to use a cache built from a different
database than specified with --host. Only use this if you are
sure the hosts are compatible (e.g. ensembldb.ensembl.org and
useastdb.ensembl.org) [default: off]
--cache_region_size ADVANCED! The size in base-pairs of the region covered by one
file in the cache. [default: 1MB]
--buffer_size Sets the number of variants sent in each batch [default: 5000]
Increasing buffer size can retrieve results more quickly
but requires more memory. Only applies to whole genome mode.
--no_whole_genome Run in old-style, non-whole genome mode [default: off]
END
print $usage;
}