package NGS::VEP;
# wrapper around Ensembl Variant Effect Predictor stand-alone script;
# http://www.ensembl.org/info/docs/variation/vep/vep_script.html
use Moo;
use IO::All;
use IO::File;
use Text::CSV;
use Sort::Naturally;
use Data::Dumper::Concise;
use Time::HiRes qw(gettimeofday tv_interval);
use FindBin qw($Bin); # warn $Bin;
use Modern::Perl '2012';
use NGS::MooX::Types qw(HashReference ArrayReference Object String);
use NGS::DB;
has errors => ( is => 'ro', isa => ArrayReference, default => sub { [] } );
# required to be passed on object contruction:
has form_opts => ( is => 'ro', isa => HashReference, required => 1 );
has src_data => ( is => 'ro', isa => Object, required => 1 );
has config => ( is => 'ro', isa => HashReference, required => 1 );
# called internally by result():
has ref_table => ( is => 'lazy' ); # _build_ref_table()
has vep_input => ( is => 'lazy' ); # _build_vep_input()
# result counters:
has result_counter => ( is => 'lazy' ); # _build_result_counter() returns hashref
# locations table (include/exclude known start locations):
has known_locations => ( is => 'lazy' ); # _build_known_locations() returns hashref map
my %text_csv_args = ( sep_char => "\t", binary => 1 ); # global args for Text::CSV
# location of variable_effect_predictor script:
my $vep_script = "$Bin/../script/variant_effect_predictor.pl";
my $debug = new IO::File;
open ($debug, '>', "$Bin/../debug.txt") || die $!;
$debug->autoflush(1); # to get past opening pipe to vep script
use constant DEBUG => 1; # 1 (on), 0 (off) - also uses running env (requires devel)
# list of data items required for VEP script input file:
use constant VEP_FIELDS => qw(chromosome start_point end_point allele strand);
# ==============================================================================
sub roche_454 {
my $self = shift; # warn Dumper $self->database;
# define vars ------------------------------------------------------------------
# var: $src_data
# extract tab-delimited data from NGS csv data file
my $src_data = $self->_text_csv_slurp(); # warn Dumper $src_data; # arrayref
# var: $header_row
# remove 1st row (= headers) from $src_data
my $header_row = shift @$src_data; # warn Dumper $header_row;
# var: $hmds_ref_numbers
# get hmds sample refs from header row - in cols starting at col #6
my $hmds_ref_numbers = $self->_get_hmds_refs($header_row);
# var: $ref_table
# get chromosome & start points for exons - reference table from database
my $ref_table = $self->ref_table; # warn Dumper $ref_table;
# var: @vep_input
# pre-processed data for VEP input; list of hashrefs with keys = VEP_FIELDS
# of 'accepted' rows from src_data file
my @vep_input;
# var: %exon_data_ref
# hash map; keys = unique_id (chromosome_start-base_variant, eg 2_123456_C/T);
# vals = @vep_field vals (chr, start, end, etc); acts as ref data for merging
# with VEP script output
my %exon_data_ref;
# var: %exon_names
# hash of unique exon names - checks that all are parsed correctly and exist
# in %exon_data_ref (may not if different exons produce identical unique ids)
my %exon_names;
# run --------------------------------------------------------------------------
# var: $row
# each row in NGS csv data array, cols = exon name, base/variant, status, etc
ROW: for my $row (@$src_data) { # warn Dumper $row;
$self->result_counter->{src_rows}++; # warn Dumper $self->result_counter;
# skip row unless 3rd col (status field) reads 'accepted'
next ROW unless lc $row->[2] eq 'accepted';
$self->result_counter->{accepted_src_rows}++; # warn Dumper $self->result_counter;
# var: $exon_name
# sequence name in 1st col eg ASXL exon 12.7, JAK2 exon 12.1
my $exon_name = $row->[0]; # warn $exon_name;
$exon_names{$exon_name}++;
# var: $ref_data
# reference data from database for exon name (skip row if no db entry)
my $ref_data = $ref_table->{$exon_name}; # warn Dumper $ref_data;
if (! $ref_data) {
my $str = 'no ref table entry for ' . $exon_name;
$self->_add_to_errors($str); # warn $str;
next ROW;
}
# var: $chromosome
# chromosome number for exon, derived from reference data
my $chromosome = $ref_data->{chromosome};
# var: $variant
# 2nd col is colon-separated base location & variant (eg 136:A/T, 26-27:TG/--)
my $variant = $row->[1]; # warn $variant;
# fix for FLT3-ITD 1 variant i(278.5,TGAGATCATATTCATATTTGAGATCATATTCATATT):
unless ( $variant =~ /:/ ) {
my $str = sprintf 'cannot split %s variant %s', $exon_name, $variant;
$self->_add_to_errors($str); # warn $str;
next ROW;
}
# var: $location, $allele
# split base location & variant from $variant (eg 136:A/T, 26-27:TG/--)
my ($location, $allele) = split ':', $variant; # warn Dumper [$location, $allele];
# var: $start_point, $end_point
# define initial start & end points as reference data start position minus 1
my $start_point = my $end_point = $ref_data->{start_base} - 1; # eg 125436276
# if base location > 1 base (eg 26-27, 134-137, etc):
if ( $location =~ /(\d+)-(\d+)/ ) { # warn $location; # eg 26-27
my $span = $2 - $1; # eg 26-27 will give span = 1
$start_point += $1; # ref tbl start pos + 1st capture (26 in above example)
$end_point = $start_point + $span;
}
else {
$end_point = $start_point += $location;
} # warn Dumper [ $start_point, $end_point ];
{
# var: %h
# hash data for vep script input file, keys = VEP_FIELDS
my %h = (
chromosome => $chromosome,
start_point => $start_point,
end_point => $end_point,
allele => $allele, # eg T/C, A/G, etc
strand => '+', # required 5th col in VEP file
); # warn Dumper \%h;
push @vep_input, \%h;
{ # add exon name and id unique identifier for $exon_data_ref (don't
# want this in vep_input); exon name + variant/location = unique ID
$h{exon_name} = $exon_name; # eg "TET2 exon 4"
my $exon_id = sprintf '%s [%s]', $exon_name, $variant;
$h{exon_id} = $exon_id; # eg "TET2 exon 4 [891:C/T]"
}
# var: $unique_id
# create ID matching return from VEP script, created using combination
# of chromosome, start base number & allele, eg 2_12345_C/G
my $unique_id = join '_', $chromosome, $start_point, $allele;
$exon_data_ref{$unique_id} = \%h;
{ # get patient results in cols 6 onwards:
my $i = 0; # reset counter to 0
for my $ref ( @$hmds_ref_numbers ) { # warn $ref;
# var: $result
# percentage positive result - in 1st col of each sample ref pair
my $result = $row->[5 + $i]; # warn $result; # col 6, 8, 10, etc
if ( $result > 0 ) { # skip zero's
# var: $read_no
# read number - in 2nd col of each sample ref pair
my $read_no = $row->[5 + $i + 1]; # col 7, 9, 11, etc
my %d = (
lab_no => $ref,
allele => $allele,
result => $result,
read => $read_no,
); # warn Dumper \%d;
# add sample data to $exon_data_ref{$unique_id} as array
# (to allow same location/allele to have multiple samples):
push @{ $exon_data_ref{$unique_id}{sample_data} }, \%d;
};
$i += 2; # move 2 cols along
}
}
}
} # warn Dumper \%exon_data_ref;
$self->result_counter->{vep_input_rows} = scalar @vep_input;
{ # check all exon names are included in %exon_data_ref:
my @seen = map $exon_data_ref{$_}->{exon_name}, keys %exon_data_ref;
# delete all 'seen' exon names:
delete @exon_names{ @seen }; # warn Dumper \@seen;
# add any missing exon names to errors:
$self->_add_to_errors("$_ omitted from VEP input") for keys %exon_names;
}
# sort data into chromosome order to (possibly) speed up vep processing:
my @vep_fields = VEP_FIELDS;
for ( sort by_chr_location @vep_input ) {
my $line = join "\t", @{$_}{@vep_fields}; # warn $line;
io($self->vep_input)->append($line . "\n");
}
# send to function running VEP script:
# var: $result
# hashref of output data from VEP script (consequence, amino_acids,
# sift, polyphen, etc), results of patient samples & non-cds rows
my $result = $self->_run_vep(\%exon_data_ref);
{ # add counts, errors, etc:
my %h = (
counts => $self->result_counter, # add result counters
errors => $self->errors,
);
$result->{data} = \%h;
}
io($self->vep_input)->unlink || die "cannot unlink vep_input - $!";
return $result;
}
#-------------------------------------------------------------------------------
sub miseq_vcf {
my $self = shift;
my $args = shift; # href outdir = str, files = array(ref)
my $files = $args->{files};
my $tmp_dir = $args->{outdir}; # location of extracted files in /tmp
# miseq run id = sub-directory name in /tmp dir (remove leading /tmp/):
# my $miseq_id = substr($tmp_dir, 5); # warn $miseq_id; # not using
my @vep_fields = VEP_FIELDS;
# var: %exon_data_ref
# hash map; keys = unique_id (chromosome_start-base_variant, eg 2_123456_C/T);
# vals = @vep_field vals (chr, start, end, etc); acts as ref data for merging
# with VEP script output
my %exon_data_ref = ();
my @vep = (); # reset array - to hold rows for vep input file
my %seen; # to ensure unique gene/chromosome/start_point/allele combinations in vep
FILE: for (@$files) { # warn $_;
my $io = io($_);
my $file = $io->filename; # warn $file;
next FILE unless $file =~ /vcf\Z/; # skip unless .vcf suffix
# my $sample_id = join '/', $miseq_id, $file; # warn $sample_id; # not using
my $lab_number = join '/', $file =~ /^[A-Z]?(\d+)-(\d+)/; # warn $lab_number;
my $data = $self->_parse_file($io); # warn Dumper $data; # AoH
ROW: for my $row (@$data) { # warn Dumper $row; # hashref
# create unique id:
my $unique_id = join '_', @{$row}{qw(chromosome start_point allele)};
# use gene name + unique_id for exon_id (exon_id not supplied by miseq):
my $exon_id_alias = join '_', $row->{gene}, $unique_id;
$row->{exon_id} = $exon_id_alias; # warn $exon_id_alias;
$exon_data_ref{$unique_id} ||= $row;
# add sample data to $exon_data_ref:
my %d = (
lab_no => $lab_number,
allele => $row->{allele},
result => $row->{vf},
read => $row->{depth},
); # warn Dumper \%d;
# add data as array(ref) to allow same location/allele to have multiple samples:
push @{ $exon_data_ref{$unique_id}{sample_data} }, \%d;
# only need exon_id once in vep input:
next ROW if $seen{$exon_id_alias}++;
push @vep, [ @{$row}{@vep_fields} ];
}
}
$self->result_counter->{vep_input_rows} = scalar @vep;
$self->_debug(exon_data_ref => \%exon_data_ref); # warn Dumper \%seen;
# send to function running VEP script:
# write out vars as tab-delimited row to @vep, Sort::Naturally by chr:
for my $ref ( sort by_chromosome_and_start_pos @vep ) {
my $row = join "\t", @$ref;
io($self->vep_input)->append($row . "\n");
}
# var: $result
# hashref of output data from VEP script (consequence, amino_acids,
# sift, polyphen, etc), results of patient samples & non-cds rows
my $result = $self->_run_vep(\%exon_data_ref);
{ # add counts, errors, etc:
my %h = (
variants => \%seen, # unique exon_id_aliases
total => scalar @$files,
counts => $self->result_counter, # add result counters
errors => $self->errors,
);
$result->{data} = \%h;
}
{ # delete tmp files:
io($self->vep_input)->unlink || die "cannot unlink vep_input - $!";
io($_)->unlink for @$files;
io($tmp_dir)->rmdir || die "cannot unlink $tmp_dir = $!";
}
return $result;
}
#-------------------------------------------------------------------------------
sub _parse_file {
my ($self, $f) = @_; # io object
my @rows = io($f->name)->slurp; # warn Dumper \@rows;
my @data = (); # for AoH
ROW: for (@rows) {
next ROW if /^#/; # skip comments
$self->result_counter->{src_rows}++;
my @fields = split "\t", $_;
my $vals = $self->_parse_fields(\@fields) || next ROW; # $vals = hashref
push @data, $vals;
}
$self->result_counter->{accepted_src_rows} += scalar @data;
return \@data;
}
#-------------------------------------------------------------------------------
sub _parse_fields {
my $self = shift;
my $data = shift; # arrayref of tab-separated vcf row entries
return 0 unless $data->[0] =~ /^chr/; # or rest of regexes will fail
my $chr = $data->[0]; # warn $chr; # eg chrX, chr2, etc
my $pos = $data->[1]; # warn $pos; # numerical position
my $ref = $data->[3]; # warn $ref; # str of 1 or more A,C,T,G bases
my $alt = $data->[4]; # warn $alt; # str of 1 or more A,C,T,G bases
my $info = $data->[7]; # warn $info # DP, TI, GI, FC, GT, etc values
my $results = $data->[9]; # warn $results; # colon-separated results (eg VF)
my $vf = (split ':', $results)[3] || warn 'no VF info'; # warn $vf;
my $known_locations_map = $self->known_locations;
my ($depth) = $info =~ /DP=(\d+)/; # warn $depth;
my ($genes) = $info =~ /GI=(.*?)\;/; # warn $genes; # non-greedy match
my $allele = undef; # defined in block below
# rules: exclude if start position matches exclusion list, depth < 100 or
# VF < 5% (unless start position matches inclusion list):
unless ( $known_locations_map->{include}->{$pos} ) {
return 0 if
$vf < 0.05 || $depth < 100 || $known_locations_map->{exclude}->{$pos}
}
# define initial start & end positions = vcf 'position' var (2nd col):
my $start_pos = my $end_pos = $pos;
# define allele and refine start & end positions depending on ins/del/sub:
if ( length $alt > 1 ) { # warn 'insertion'; # base insertion:
$start_pos += 1; # end pos stays same
$allele = sprintf '-/%s', substr($alt, 1); # alt minus 1st base
}
elsif ( length $ref > 1 ) { # warn 'deletion'; base deletion:
$start_pos += 1;
$end_pos += length($ref) - 1;
$allele = sprintf '%s/-', substr($ref, 1); # ref minus 1st base
}
else { # warn 'substitution'; # alt & ref both single ie base substitution:
$allele = sprintf '%s/%s', $ref, $alt; # warn $allele;
}
my @genes = do { # remove duplicate genes: eg TP53,TP53,TP53
no warnings 'uninitialized'; # eg if regex failed due to malformed str
my %seen = map { $_ => 1 } split ',', $genes;
keys %seen;
}; # warn @genes;
my $gene = join ', ', @genes; # || warn 'no GI entry';
my %h = (
chromosome => substr($chr, 3), # remove leading 'chr' chars
start_point => $start_pos,
end_point => $end_pos,
allele => $allele,
depth => $depth,
gene => $gene,
vf => $vf * 100, # convert proportion to percent
); # warn Dumper \%h;
=begin # not using gene_strand table - all strands '+'
GENE: for my $gene (@genes) {
# + or - strand from gene_strand table:
my $strand = $gene_strand->{$gene};
unless ($strand) {
warn "$gene not in gene_strand table";
next GENE;
} # warn Dumper [ $chromosome, $start_pos, $end_pos, $allele, $strand ];
$h{strand} = $strand;
}
=cut
$h{strand} = '+'; # not using gene_strand table
return \%h;
}
#-------------------------------------------------------------------------------
sub _run_vep {
my $self = shift;
# var: $exon_data_ref
# hash map; keys = unique_id (chromosome:start-base, eg 2:123456); vals =
# @vep_field vals (chr, start, end, etc); acts as ref data for merging with
# VEP script output
my $exon_data_ref = shift; $self->_debug(['exon_data_ref',$exon_data_ref]);
# var: $vep_input
# full path to vep input file
my $vep_input_filename = $self->vep_input; # warn $vep_input_filename;
# var: @results
# container to hold tab-delimited results array from vep output
my @results;
# load opts from vep.ini + user-selected options to augment:
my @opts = ( "-config=$Bin/../script/vep.ini", "-i=$vep_input_filename" );
{ # enabled forking on deployment server (actually slower on dev; see 'top'):
my $forks = $self->config->{fork}; # warn Dumper $forks;
push @opts, "--fork=$forks" if $forks; # development.ini = 0
} # warn Dumper \@opts;
my $user_opts = $self->_get_form_options;
push @opts, @$user_opts; # warn Dumper \@opts;
# warn Dumper io('/tmp/data.vep')->slurp; # from test scripts
# open pipe to vep script (input = $vep_file; output = stdout):
my $io = io->pipe("$vep_script @opts"); # use IO::Detect; warn is_filehandle($io);
# start vep input ==============================================================
my $t0 = [gettimeofday]; # set t0
while ( my $line = $io->getline ) { # warn $line;
push @results, $line;
} $self->_debug(['vep output',\@results]); # warn Dumper \@results;
$self->result_counter->{runtime} = tv_interval $t0, [gettimeofday]; # script runtime
# end vep input ================================================================
# var: @input_fields
# list of data items from $exon_data_ref which are needed for adding to VEP output
my @input_fields = qw(exon_id chromosome start_point end_point);
# var: @vep_data
# array of composite of vep input & vep output data
my @vep_data;
# var: %seen
# running tally of vep inputs with 1 or more results
my %seen;
RESULT: for (@results) { # warn Dumper $_;
next RESULT if /^#/; # skip comment lines
my @cols = split /\t/; # warn Dumper \@cols;
# var: $vep_exon_id
# unique identifier - 1st col of vep output - in same format as
# %$exon_data_ref key (eg 1_123456_C/T)
my $vep_exon_id = $cols[0]; # warn $vep_exon_id;
# var: $ref
# $exon_data_ref hashref for $vep_exon_id (start, end, chr, allele, etc)
if ( my $ref = $exon_data_ref->{$vep_exon_id} ) { # warn Dumper ['ref',$ref];
$seen{$vep_exon_id}++; # vep input file has at least one result
# capture vep cols: consequence, amino_acids, existing_variation, extra:
my $extra = $cols[13]; # contains sift, polyphen, etc results
# extract sift & polyphen into separate cols:
my $sift_and_polyphen = $self->_prediction_and_score($extra);
# merge sift & polyphen data with allele, consequence, amino_acids, s + p:
my @output_data = ( @cols[2, 3, 4, 6, 8, 9, 10, 12], @$sift_and_polyphen );
# combine vep input data (chr, start, end, exon_id) with vep output data
# (sift, polyphen data, consequence, allele, etc):
push @vep_data, [ @{$ref}{@input_fields}, @output_data ];
{
my %h = (
consequence => $cols[6],
existing => $cols[12],
polyphen => $sift_and_polyphen->[1],
exon_id => $ref->{exon_id},
sift => $sift_and_polyphen->[0],
); # warn Dumper \%h;
{ # check location infos match before merging:
my $vep_location = $cols[1]; # warn $vep_location;
my ($start, $end) = @{$ref}{ qw/start_point end_point/ };
my $loc = do {
if ( $end < $start ) { join '-', $end, $start } # insertion
elsif ( $end > $start ) { join '-', $start, $end } # deletion
else { $start } # substitution
}; # warn $loc;
my $ref_location = sprintf '%s:%s', $ref->{chromosome}, $loc;
# warn Dumper [$vep_location,$ref_location];
unless ( $vep_location eq $ref_location ) {
my $str = sprintf 'vep location (%s) != ref_location (%s)',
$vep_location, $ref_location;
$self->_add_to_errors($str); # warn $str;
}
}
# merge %h with $exon_data_ref entry for $vep_exon_id:
map $exon_data_ref->{$vep_exon_id}->{$_} = $h{$_}, keys %h;
}
# var: $sample_data
# sample data hashref from exon_data_ref for chr_location
if ( my $sample_data = $ref->{sample_data} ) { # arrayref
# examines vep consequence & existing_variation cols to determine
# whether to use vep result for analysis of sample_data
my $is_required = _want_vep_result(\@cols); # returns 1 or 0
for (@$sample_data) {
$_->{vep_row_count}++; # for vep_exon_id freq. in VEP results
# flag to indicate whether to use result (multiple overwrites OK):
$_->{use_vep_result} = $is_required; # 1 or 0
}
} # warn Dumper $ref->{sample_data};
}
else { warn "no match in src file for $_" } # no match in src file
} $self->_debug(['post VEP exon_data_ref', $exon_data_ref]);
# has no vep result:
my @orphans = sort by_chr_location
map $exon_data_ref->{$_}, grep { ! $seen{$_} } keys %$exon_data_ref;
# var: $samples
# process sample_data in $exon_data_ref
my $samples = $self->_process_sample_results($exon_data_ref); # returns hashref
return {
exon_ref => $exon_data_ref,
vep_data => \@vep_data,
non_cds => \@orphans,
samples => $samples,
}
}
#-------------------------------------------------------------------------------
sub _prediction_and_score {
my ($self, $str) = @_; # warn $str;
# regex to capture prediction and/or score eg SIFT=deleterious(0.02);
# PolyPhen=probably_damaging; SIFT=0.002: / (\w+)? ( \(? [\d\.]+ \)? )? /x
my $re = qr/
(\w+)? # optional 'prediction' (eg benign, deleterious, etc)
( # start of optional 'score'
\(? # optional open bracket (only if prediction AND score selected)
[\d\.]+ # any number of 0-9 & decimal point chars (eg 0.012)
\)? # optional close bracket (only if prediction AND score selected)
)? # end of optional 'score'
/x;
# need to return expected number of elements for tt:
my $opts = $self->form_opts; # check for sift & polyphen
my @results; # capture in order 'sift', 'polyphen' to match expected order in tt:
if ( $str =~ /sift=($re)/i ) {
push @results, $1; # warn $sift;
}
elsif ( $opts->{sift} ) { push @results, undef } # to populate td for tt
if ( $str =~ /polyphen=($re)/i ) {
push @results, $1; # warn $poly;
} # warn Dumper \@results;
elsif ( $opts->{polyphen} ) { push @results, undef } # to populate td for tt
=begin # combined, but doesn't work - gets killed
while ( $str =~ /(?:sift|polyphen)=($re)/ig ) { # don't capture words
push @results, $1; # warn $1;
}
=cut
map { # remove captured items within $re from $str:
my $quoted = quotemeta $_; # warn $quoted;
$str =~ s/(sift|polyphen)=$quoted(;?)//i; # remove trailing semi-colon
} grep $_, @results; # warn $str; # !! grep $_ or empty vals cause problems
push @results, $str; # what's left of it (non-sift, non-polyphen 'extra')
return \@results;
}
#-------------------------------------------------------------------------------
sub _process_sample_results {
my ($self, $data) = @_; # warn Dumper $data;
my %h;
while ( my($location, $d) = each %$data ) {
if ( my $sample_data = $d->{sample_data} ) { # warn Dumper $sample_data;
# $sample_data is array(ref):
for my $ref (@$sample_data) { # warn Dumper $ref;
# transform $hmds_ref '/' to '_' for automatic tt sorting:
my $lab_no = join '_', reverse split '/', $ref->{lab_no};
$ref->{hmds_ref} = $lab_no; # warn $lab_no;
if ( $ref->{use_vep_result} ) {
push @{ $h{match}{$lab_no}{$location} }, $ref;
$self->result_counter->{matches}++;
}
else {
push @{ $h{no_match}{$lab_no}{$location} }, $ref;
$self->result_counter->{no_match}++;
}
}
}
} $self->_debug('sorted_sample_results', \%h); # warn Dumper \%h;
return \%h;
}
#-------------------------------------------------------------------------------
sub by_hmds_ref {
my ($r1, $y1) = split '/', $a; # warn Dumper [$r1, $y1];
my ($r2, $y2) = split '/', $b; # warn Dumper [$r2, $y2];
return $y1 <=> $y2 || $r1 <=> $r2;
}
#-------------------------------------------------------------------------------
sub _want_vep_result { # caller wants 1 or 0 return val (not undef):
my $cols = shift; # arrayref
my $consequence = $cols->[6]; # warn Dumper [ @{$cols}[6,12] ];
my $variation = $cols->[12];
return 1 if (
$variation =~ /COSM/ # want these regardless of consequence
# reject if 'synonymous_variant', accept if variation matches /TMP|^-:
|| ( $variation =~ /TMP|^-/ && $consequence ne 'synonymous_variant' )
);
return 0; # don't want it
# return 1 if $variation =~ /COSM/; # want these regardless of consequence
# return 0 if $consequence eq 'synonymous_variant'; # don't want these
# return ( $variation =~ /TMP|^-/ );
}
#-------------------------------------------------------------------------------
sub _get_hmds_refs {
my ($self, $header) = @_; # warn Dumper $header; # arrayref of hmds refs (in pairs)
my $i = 0; # counter
my @refs;
# for cols 6 to last take the 1st col of each pair and extract lab_no:
# $#{$header} = same as (scalar @array - 1)
for ( @$header[5 .. $#{$header} - 1] ) { # warn Dumper [$_, $i];
next if $i++ % 2; # increment then test for modulus 2 - get every other col
if ( my ($ref) = $_ =~ m!(\d+/\d{2})! ) {
push @refs, $ref;
}
else {
my $str = sprintf q!cannot find lab number in col header '%s'!, $_;
$self->_add_to_errors($str);
}
} # warn Dumper \@refs;
return \@refs;
}
#-------------------------------------------------------------------------------
sub _get_form_options { # user submitted opts (eg sift, polyphen, etc)
my $self = shift;
my $params = $self->form_opts; # warn Dumper $form_opts;
# default db is 'core', refseq passed by form selection if required:
my $transcript_db = $params->{refseq} ? 'refseq' : 'core';
my @opts = ( '--dir=/home/raj/.vep/' . $transcript_db );
# polyphen & sift take args (s, p or b)
push @opts, '--polyphen=' . $params->{polyphen} if $params->{polyphen};
push @opts, '--sift=' . $params->{sift} if $params->{sift};
# check_existing, regulatory & coding_only take no args:
for ( qw/check_existing coding_only regulatory/ ) {
push @opts, "--$_" if $params->{$_};
} # warn Dumper \@opts;
return \@opts;
}
#-------------------------------------------------------------------------------
sub _add_to_errors {
my ($self, $str) = @_;
push @{ $self->errors }, $str;
}
#-------------------------------------------------------------------------------
sub by_chr_location { # alphanumeric sort on chromosome 1-22, X, and start_point
return ( # 1st sort on chromosome, then start_point:
( $a->{chromosome} !~ /^\d+/ || $b->{chromosome} !~ /^\d+/ )
? $a->{chromosome} cmp $b->{chromosome} # non-digit eg X
: $a->{chromosome} <=> $b->{chromosome} # digit eg 1 - 22
) || $a->{start_point} <=> $b->{start_point};
}
#-------------------------------------------------------------------------------
sub by_chromosome_and_start_pos { # Sort::Naturally ncmp function - same as above:
return ncmp($a->[0], $b->[0]) || ncmp($a->[1], $b->[1]);
}
#-------------------------------------------------------------------------------
# extract tab-delimited data from csv source file:
sub _text_csv_slurp {
my $self = shift;
my $data = $self->src_data->content; # warn $data; # string
my $csv = Text::CSV->new(\%text_csv_args);
my @results;
for ( split "\n", $data ) { # warn $_;
if ( $csv->parse($_) ) {
my @row = $csv->fields();
push @results, \@row;
}
} # warn Dumper \@results;
return \@results;
}
#-------------------------------------------------------------------------------
# get reference table as hashref of gene => { chromosome, start_base }:
sub _build_ref_table {
my $self = shift;
my $db = NGS::DB->new(); # warn $db->dbix;
my $ref_tbl = $db->dbix->select('ref_seq', '*')
->map_hashes('gene'); # warn Dumper $ref_tbl;
return $ref_tbl;
}
#-------------------------------------------------------------------------------
sub _build_known_locations {
my $self = shift;
my $db = NGS::DB->new(); # warn $db->dbix;
my $r = $db->dbix->select('locations', ['position', 'action']);
my %h;
while ( $r->into( my ($position, $action) ) ) {
$h{$action}{$position} = 1;
} # warn Dumper \%h;
return \%h;
}
#-------------------------------------------------------------------------------
sub _build_vep_input {
my $self = shift;
# create new temp file for vep input data:
my $filename = $self->form_opts->{data_src};
$filename =~ s/(txt|zip)\Z/vep/; # .txt -> .vep
$filename =~ s/\s/_/g; # spaces -> underscores
my $path_to_file = '/tmp/'.$filename;
# delete if already exists (eg due to aborted previous run) as using append mode:
io($path_to_file)->unlink if -e $path_to_file;
return $path_to_file;
}
#-------------------------------------------------------------------------------
sub _build_result_counter {
my $self = shift;
my %h = map +($_ => 0),
qw( matches no_match vep_input_rows src_rows accepted_src_rows runtime );
return \%h;
}
#-------------------------------------------------------------------------------
sub _debug { # only debugging dev server output (env added in post => '/vep' sub):
return 0 unless DEBUG && shift->form_opts->{environment} eq 'development';
print $debug Dumper @_;
}
=begin _build_ref_table() from ref.csv
my $ref = "$Bin/../ref.csv";
my $io = new IO::File;
open( $io, '<', $ref ) || die $!;
my $csv = Text::CSV->new(\%text_csv_args);
my %h;
while ( my $row = $csv->getline( $io ) ) {
my ($ref_seq, $chr, $start) = @$row;
$h{$ref_seq} = {
chromosome => $chr,
start_base => $start,
};
} # warn Dumper \%h;
return \%h;
=cut
1;