package NGS::VEP;
# wrapper around Ensembl Variant Effect Predictor stand-alone script;
# http://www.ensembl.org/info/docs/variation/vep/vep_script.html
use Moo;
use IO::All;
use IO::File;
use Text::CSV;
use Data::Dumper;
use FindBin qw($Bin); # warn $Bin;
use MooX::Types::MooseLike::Base qw(:all);
use NGS::MooX::Types qw(HashReference);
use NGS::DB;
# required to be passed on object contruction:
has src_data => ( is => 'ro', isa => Str, required => 1 );
has form_opts => ( is => 'ro', isa => HashReference, required => 1 );
# called internally by result():
has ref_table => ( is => 'lazy' ); # _build_ref_table()
my %text_csv_args = ( sep_char => "\t", binary => 1 ); # global args for Text::CSV
# location of variable_effect_predictor script:
my $vep_script = "$Bin/../script/variant_effect_predictor.pl";
#===============================================================================
sub result {
my $self = shift; # warn Dumper $self->database;
my $data = $self->_text_csv_slurp(); # warn Dumper $data; # arrayref
# get reference table:
my $ref_table = $self->ref_table; # warn Dumper $reference;
# create vep file for variant_effect_predictor.pl
my $vep_file = "$Bin/../results.vep";
io($vep_file)->unlink if -e ($vep_file); # clear previous
my @vep_fields = qw(chromosome start_point end_point allele strand exon);
my %vep_data; # will hold ref data for merging with VEP script output
my $i; # will hold total count of 'accepted' rows
ROW: for my $row (@$data) { # cols = ref_seq, base/variant, status, ....
next ROW unless lc $row->[2] eq 'accepted'; # 3rd col is status field
$i++; # increment 'accepted' row counter
my $exon = $row->[0]; # warn $exon;
# get data from reference table, or next record:
my $ref = $ref_table->{$exon}; # warn Dumper $ref;
if (! $ref) {
warn "no ref table entry for $exon"; next ROW;
}
my $chromosome = $ref->{chromosome};
# split 2nd col to get location & variant (eg 136:A/T, 26-27:TG/--)
my ($location, $allele) = split ':', $row->[1]; # warn Dumper [$location, $allele];
# define initial start & end point as ref table start position minus 1:
my $start_point = my $end_point = $ref->{start_base} - 1;
# if location > 1 base (eg 26-27, 134-137 , etc):
if ( $location =~ /(\d+)-(\d+)/ ) { # warn $location; # eg 26-27
my $span = $2 - $1; # eg 26-27 will give span = 1
$start_point += $1; # start location (26 in above example)
$end_point = $start_point + $span;
}
else {
$end_point = $start_point += $location;
}
{ # manipulate exon name to provide unique ID for vep input
# eg TET2 exon 4 [891], TET2 exon 4 [1031], etc:
$exon =~ s/\s/%%/g; # vep can't deal with spaces in any col
$exon .= '%%[' . $location . ']'; # exon + location gives unique ID
}
{ # data structure for vep script input file:
my %h = (
chromosome => $chromosome,
start_point => $start_point,
end_point => $end_point,
allele => $allele,
strand => '+', # required 5th col in VEP file
exon => $exon, # unique identifier eg "TET2 exon 4 [891]"
); # warn Dumper \%h;
# append to vep script input file:
my $line = join "\t", @h{@vep_fields};
io($vep_file)->append($line . "\n");
{ # data for use with vep output (use chr:start_location as unique ID):
# vep returns eg 2:12345 or 2:12345-12347 in 'Location' col
my $base_num = ( $start_point == $end_point )
? $start_point # eg 2:12345
: $start_point .'-'. $end_point; # eg 2:12345-12347
my $unique_id = join ':', $chromosome, $base_num;
$vep_data{$unique_id} = \%h;
}
}
} # warn Dumper \%vep_data;
my $result = $self->_run_vep(\%vep_data); # hashref of data & orphaned rows
$result->{row_count} = $i; # add row count of accepted rows
return $result;
}
#-------------------------------------------------------------------------------
sub _run_vep {
my ($self, $src_data) = @_; # HoH "chr:start" => (start_point, end_point, allele, etc)
my @results; # to hold tab-delimited results array from vep output
# load opts from vep.ini + user-selected options to augment:
my @opts = ( "-config=$Bin/../script/vep.ini" );
my $user_opts = $self->_get_form_options;
push @opts, @$user_opts;
# open pipe to vep script (input = $vep_file; output = stdout):
open my $fh, "-|", $vep_script, @opts;
while ( my $line = readline($fh) ) { # warn $line;
push @results, $line;
} # warn Dumper \@results;
my @input_fields = qw(exon chromosome start_point end_point allele);
my @data; # to hold composite of vep input & vep output
my %seen; # running tally of vep inputs with 1 or more results
for (@results) {
next if /^#/; # skip comment lines
my @cols = split /\t/;
my $chr_location = $cols[1]; # in same format as %$src_data key (eg 2:123456)
if ( my $ref = $src_data->{$chr_location} ) {
# capture vep cols: consequence, amino_acids, existing_variation, extra:
my @output_data = @cols[6,10,12,13];
push @data, [ @{$ref}{@input_fields}, @output_data ]; # combine data
$seen{$chr_location}++; # vep input file has at least one result
}
else { warn $_ } # no match in src file
}
# has no vep result:
my @orphans = sort by_chr_location
map $src_data->{$_}, grep { ! $seen{$_} } keys %$src_data;
return { data => \@data, orphans => \@orphans }
}
#-------------------------------------------------------------------------------
sub _get_form_options { # user submitted opts (eg sift, polyphen, etc)
my $self = shift;
my $params = $self->form_opts; # warn Dumper $form_opts;
my @opts;
# polyphen & sift take args (s, p or b)
push @opts, '--polyphen=' . $params->{polyphen} if $params->{polyphen};
push @opts, '--sift=' . $params->{sift} if $params->{sift};
# check_existing, regulatory & coding_only take no args:
for ( qw/check_existing coding_only regulatory/ ) {
push @opts, "--$_" if $params->{$_};
} # warn Dumper \@opts;
return \@opts;
}
#-------------------------------------------------------------------------------
sub by_chr_location { # alphanumeric sort on chromosome 1-22, X, and start_point
return ( # 1st sort on chromosome, then start_point:
( $a->{chromosome} !~ /^\d+/ || $b->{chromosome} !~ /^\d+/ )
? $a->{chromosome} cmp $b->{chromosome} # non-digit eg X
: $a->{chromosome} <=> $b->{chromosome} # digit eg 1 - 22
) || $a->{start_point} <=> $b->{start_point};
}
#-------------------------------------------------------------------------------
# extract tab-delimited data from csv source file:
sub _text_csv_slurp {
my $self = shift;
my $data = $self->src_data; # string
my $csv = Text::CSV->new(\%text_csv_args);
my @results;
for ( split "\n", $data ) { # warn $_;
if ( $csv->parse($_) ) {
my @row = $csv->fields();
push @results, \@row;
}
}
return \@results;
}
#-------------------------------------------------------------------------------
# get reference table as hashref of gene => { chromosome, start_base }:
sub _build_ref_table {
my $self = shift;
my $db = NGS::DB->new(); # warn $db->dbix;
my $ref_tbl = $db->dbix->select('ref_seq', '*')
->map_hashes('gene'); # warn Dumper $ref_tbl;
return $ref_tbl;
}
=begin _build_ref_table() from ref.csv
my $ref = "$Bin/../ref.csv";
my $io = new IO::File;
open( $io, '<', $ref ) || die $!;
my $csv = Text::CSV->new(\%text_csv_args);
my %h;
while ( my $row = $csv->getline( $io ) ) {
my ($ref_seq, $chr, $start) = @$row;
$h{$ref_seq} = {
chromosome => $chr,
start_base => $start,
};
} # warn Dumper \%h;
return \%h;
=cut
1;