package NGS::VEP; # wrapper around Ensembl Variant Effect Predictor stand-alone script; # http://www.ensembl.org/info/docs/variation/vep/vep_script.html use Moo; use IO::All; use IO::File; use Text::CSV; use Data::Dumper; use FindBin qw($Bin); # warn $Bin; use MooX::Types::MooseLike::Base qw(HashRef ArrayRef Str); # Num use NGS::DB; # required to be passed on object contruction: has src_data => ( is => 'ro', isa => Str, required => 1 ); has form_opts => ( is => 'ro', isa => HashRef, required => 1 ); # called internally by result(): has ref_table => ( is => 'lazy' ); # _build_ref_table() has vep_input => ( is => 'lazy' ); # _build_vep_input() has errors => ( is => 'ro', isa => ArrayRef, default => sub { [] } ); # result counters: has result_counter => ( is => 'lazy' ); # _build_result_counter() returns hashref my %text_csv_args = ( sep_char => "\t", binary => 1 ); # global args for Text::CSV # location of variable_effect_predictor script: my $vep_script = "$Bin/../script/variant_effect_predictor.pl"; my $debug = new IO::File; open ($debug, '>', './debug.txt') || die $!; $debug->autoflush(1); # to get past opening pipe to vep script use constant DEBUG => 1; # 1 (on), 0 (off) - also uses running env (requires devel) # ============================================================================== sub result { my $self = shift; # warn Dumper $self->database; # define vars ------------------------------------------------------------------ # var: $src_data # extract tab-delimited data from NGS csv data file my $src_data = $self->_text_csv_slurp(); # warn Dumper $src_data; # arrayref # var: $header_row # remove 1st row (= headers) from $src_data my $header_row = shift @$src_data; # warn Dumper $header_row; # var: $hmds_ref_numbers # get hmds sample refs from header row - in cols starting at col #6 my $hmds_ref_numbers = _get_hmds_refs($header_row); # var: $ref_table # get chromosome & start points for exons - reference table from database my $ref_table = $self->ref_table; # warn Dumper $ref_table; # var: @vep_fields # list of data items required for VEP script input file my @vep_fields = qw(chromosome start_point end_point allele strand); # var: @vep_input # pre-processed data for VEP input; list of hashrefs with keys = @vep_fields # of 'accepted' rows from src_data file my @vep_input; # var: %exon_data_ref # hash map; keys = unique_id (chromosome_start-base_variant, eg 2_123456_C/T); # vals = @vep_field vals (chr, start, end, etc); acts as ref data for merging # with VEP script output my %exon_data_ref; # var: %exon_names # hash of unique exon names - checks that all are parsed correctly and exist # in %exon_data_ref (may not if different exons produce identical unique ids) my %exon_names; # run -------------------------------------------------------------------------- # var: $row # each row in NGS csv data array, cols = exon name, base/variant, status, etc ROW: for my $row (@$src_data) { # warn Dumper $row; # skip row unless 3rd col (status field) reads 'accepted' next ROW unless lc $row->[2] eq 'accepted'; $self->result_counter->{accepted}++; # warn Dumper $self->result_counter; # var: $exon_name # sequence name in 1st col eg ASXL exon 12.7, JAK2 exon 12.1 my $exon_name = $row->[0]; # warn $exon_name; $exon_names{$exon_name}++; # var: $ref_data # reference data from database for exon name (skip row if no db entry) my $ref_data = $ref_table->{$exon_name}; # warn Dumper $ref_data; if (! $ref_data) { my $str = 'no ref table entry for ' . $exon_name; # warn $str; push @{ $self->errors }, $str; next ROW; } # var: $chromosome # chromosome number for exon, derived from reference data my $chromosome = $ref_data->{chromosome}; # var: $variant # 2nd col is colon-separated base location & variant (eg 136:A/T, 26-27:TG/--) my $variant = $row->[1]; # warn $variant; # fix for FLT3-ITD 1 variant i(278.5,TGAGATCATATTCATATTTGAGATCATATTCATATT): unless ( $variant =~ /:/ ) { my $str = sprintf 'cannot split %s variant %s', $exon_name, $variant; push @{ $self->errors }, $str; # warn $str; next ROW; } # var: $location, $allele # split base location & variant from $variant (eg 136:A/T, 26-27:TG/--) my ($location, $allele) = split ':', $variant; # warn Dumper [$location, $allele]; # var: $start_point, $end_point # define initial start & end points as reference data start position minus 1 my $start_point = my $end_point = $ref_data->{start_base} - 1; # eg 125436276 # if base location > 1 base (eg 26-27, 134-137, etc): if ( $location =~ /(\d+)-(\d+)/ ) { # warn $location; # eg 26-27 my $span = $2 - $1; # eg 26-27 will give span = 1 $start_point += $1; # ref tbl start pos + 1st capture (26 in above example) $end_point = $start_point + $span; } else { $end_point = $start_point += $location; } # warn Dumper [ $start_point, $end_point ]; { # var: %h # hash data for vep script input file, keys = @vep_fields my %h = ( chromosome => $chromosome, start_point => $start_point, end_point => $end_point, allele => $allele, # eg T/C, A/G, etc strand => '+', # required 5th col in VEP file ); # warn Dumper \%h; push @vep_input, \%h; { # add exon name and id unique identifier for $exon_data_ref (don't # want this in vep_input); exon name + variant/location = unique ID $h{exon_name} = $exon_name; # eg "TET2 exon 4" my $exon_id = sprintf '%s [%s]', $exon_name, $variant; $h{exon_id} = $exon_id; # eg "TET2 exon 4 [891:C/T]" } # var: $unique_id # create ID matching return from VEP script, created using combination # of chromosome, start base number & allele, eg 2_12345_C/G my $unique_id = join '_', $chromosome, $start_point, $allele; $exon_data_ref{$unique_id} = \%h; { # get patient results in cols 6 onwards: my $i = 0; # reset counter to 0 for my $ref ( @$hmds_ref_numbers ) { # warn $ref; # var: $result # percentage positive result - in 1st col of each sample ref pair my $result = $row->[5 + $i]; # warn $result; # col 6, 8, 10, etc if ( $result > 0 ) { # skip zero's # var: $read_no # read number - in 2nd col of each sample ref pair my $read_no = $row->[5 + $i + 1]; # col 7, 9, 11, etc my %d = ( lab_no => $ref, allele => $allele, result => $result, read => $read_no, ); # warn Dumper \%d; # add sample data to $exon_data_ref{$unique_id} as array # (to allow same location/allele to have multiple samples): push @{ $exon_data_ref{$unique_id}{sample_data} }, \%d; }; $i += 2; # move 2 cols along } } } } # warn Dumper \%exon_data_ref; { # check all exon names are included in %exon_data_ref: my @seen = map $exon_data_ref{$_}->{exon_name}, keys %exon_data_ref; # delete all 'seen' exon names: delete @exon_names{ @seen }; # warn Dumper \@seen; # add any missing exon names to errors: push @{ $self->errors }, "$_ omitted from VEP input" for keys %exon_names; } # sort data into chromosome order to (possibly) speed up vep processing: for ( sort by_chr_location @vep_input ) { my $line = join "\t", @{$_}{@vep_fields}; # warn $line; io($self->vep_input)->append($line . "\n"); } { # send to function running VEP script: # var: $result # hashref of output data from VEP script (consequence, amino_acids, # sift, polyphen, etc), results of patient samples & non-cds rows my $result = $self->_run_vep(\%exon_data_ref); $result->{counts} = $self->result_counter; # add result counters $result->{errors} = $self->errors; io($self->vep_input)->unlink || die "cannot unlink vep_input - $!"; return $result; } } #------------------------------------------------------------------------------- sub _run_vep { my $self = shift; # var: $exon_data_ref # hash map; keys = unique_id (chromosome:start-base, eg 2:123456); vals = # @vep_field vals (chr, start, end, etc); acts as ref data for merging with # VEP script output my $exon_data_ref = shift; $self->_debug(['exon_data_ref',$exon_data_ref]); # var: $vep_input # full path to vep input file my $vep_input_filename = $self->vep_input; # warn $vep_input_filename; # var: @results # container to hold tab-delimited results array from vep output my @results; # load opts from vep.ini + user-selected options to augment: my @opts = ( "-config=$Bin/../script/vep.ini", "-i=$vep_input_filename" ); my $user_opts = $self->_get_form_options; push @opts, @$user_opts; # warn Dumper \@opts; # warn Dumper io('/tmp/data.vep')->slurp; # from test scripts # open pipe to vep script (input = $vep_file; output = stdout): my $io = io->pipe("$vep_script @opts"); # use IO::Detect; warn is_filehandle($io); while ( my $line = $io->getline ) { # warn $line; push @results, $line; } $self->_debug(['vep output',\@results]); # warn Dumper \@results; # var: @input_fields # list of data items from $exon_data_ref which are needed for adding to VEP output my @input_fields = qw(exon_id chromosome start_point end_point); # var: @vep_data # array of composite of vep input & vep output data my @vep_data; # var: %seen # running tally of vep inputs with 1 or more results my %seen; RESULT: for (@results) { # warn Dumper $_; next RESULT if /^#/; # skip comment lines my @cols = split /\t/; # warn Dumper \@cols; # var: $vep_exon_id # unique identifier - 1st col of vep output - in same format as # %$exon_data_ref key (eg 1_123456_C/T) my $vep_exon_id = $cols[0]; # warn $vep_exon_id; # var: $ref # $exon_data_ref hashref for $vep_exon_id (start, end, chr, allele, etc) if ( my $ref = $exon_data_ref->{$vep_exon_id} ) { # warn Dumper ['ref',$ref]; $seen{$vep_exon_id}++; # vep input file has at least one result # capture vep cols: consequence, amino_acids, existing_variation, extra: my $extra = $cols[13]; # contains sift, polyphen, etc results # extract sift & polyphen into separate cols: my $sift_and_polyphen = $self->_prediction_and_score($extra); # merge sift & polyphen data with allele, consequence, amino_acids, s + p: my @output_data = ( @cols[2, 3, 4, 6, 8, 9, 10, 12], @$sift_and_polyphen ); # combine vep input data (chr, start, end, exon_id) with vep output data # (sift, polyphen data, consequence, allele, etc): push @vep_data, [ @{$ref}{@input_fields}, @output_data ]; { my %h = ( consequence => $cols[6], existing => $cols[12], polyphen => $sift_and_polyphen->[1], exon_id => $ref->{exon_id}, sift => $sift_and_polyphen->[0], ); # warn Dumper \%h; { # check location infos match before merging: my $vep_location = $cols[1]; # warn $vep_location; my $ref_location = sprintf '%s:%s', $ref->{chromosome}, $ref->{start_point} != $ref->{end_point} ? $ref->{start_point}.'-'.$ref->{end_point} # bp range : $ref->{start_point}; # warn Dumper [$vep_location,$ref_location]; $vep_location eq $ref_location || die $!; } # merge %h with $exon_data_ref entry for $vep_exon_id: map $exon_data_ref->{$vep_exon_id}->{$_} = $h{$_}, keys %h; } # var: $sample_data # sample data hashref from exon_data_ref for chr_location if ( my $sample_data = $ref->{sample_data} ) { # arrayref # examines vep consequence & existing_variation cols to determine # whether to use vep result for analysis of sample_data my $is_required = _want_vep_result(\@cols); # returns 1 or 0 for (@$sample_data) { $_->{vep_row_count}++; # for vep_exon_id freq. in VEP results # flag to indicate whether to use result (multiple overwrites OK): $_->{use_vep_result} = $is_required; # 1 or 0 } } # warn Dumper $ref->{sample_data}; } else { warn "no match in src file for $_" } # no match in src file } $self->_debug(['post VEP exon_data_ref', $exon_data_ref]); # has no vep result: my @orphans = sort by_chr_location map $exon_data_ref->{$_}, grep { ! $seen{$_} } keys %$exon_data_ref; # var: $samples # process sample_data in $exon_data_ref my $samples = $self->_process_sample_results($exon_data_ref); # returns hashref return { exon_ref => $exon_data_ref, vep_data => \@vep_data, non_cds => \@orphans, samples => $samples, } } #------------------------------------------------------------------------------- sub _prediction_and_score { my ($self, $str) = @_; # warn $str; # regex to capture prediction and/or score eg SIFT=deleterious(0.02); # PolyPhen=probably_damaging; SIFT=0.002: / (\w+)? ( \(? [\d\.]+ \)? )? /x my $re = qr/ (\w+)? # optional 'prediction' (eg benign, deleterious, etc) ( # start of optional 'score' \(? # optional open bracket (only if prediction AND score selected) [\d\.]+ # any number of 0-9 & decimal point chars (eg 0.012) \)? # optional close bracket (only if prediction AND score selected) )? # end of optional 'score' /x; # need to return expected number of elements for tt: my $opts = $self->form_opts; # check for sift & polyphen my @results; # capture in order 'sift', 'polyphen' to match expected order in tt: if ( $str =~ /sift=($re)/i ) { push @results, $1; # warn $sift; } elsif ( $opts->{sift} ) { push @results, undef } # to populate td for tt if ( $str =~ /polyphen=($re)/i ) { push @results, $1; # warn $poly; } # warn Dumper \@results; elsif ( $opts->{polyphen} ) { push @results, undef } # to populate td for tt =begin # combined, but doesn't work - gets killed while ( $str =~ /(?:sift|polyphen)=($re)/ig ) { # don't capture words push @results, $1; # warn $1; } =cut map { # remove captured items within $re from $str: my $quoted = quotemeta $_; # warn $quoted; $str =~ s/(sift|polyphen)=$quoted(;?)//i; # remove trailing semi-colon } grep $_, @results; # warn $str; # !! grep $_ or empty vals cause problems push @results, $str; # what's left of it (non-sift, non-polyphen 'extra') return \@results; } #------------------------------------------------------------------------------- sub _process_sample_results { my ($self, $data) = @_; # warn Dumper $data; my %h; while ( my($location, $d) = each %$data ) { if ( my $sample_data = $d->{sample_data} ) { # warn Dumper $sample_data; # $sample_data is array(ref): for my $ref (@$sample_data) { # warn Dumper $ref; # transform $hmds_ref '/' to '_' for automatic tt sorting: my $lab_no = join '_', reverse split '/', $ref->{lab_no}; $ref->{hmds_ref} = $lab_no; # warn $lab_no; if ( $ref->{use_vep_result} ) { push @{ $h{match}{$lab_no}{$location} }, $ref; $self->result_counter->{matches}++; } else { push @{ $h{no_match}{$lab_no}{$location} }, $ref; $self->result_counter->{no_match}++; } } } } $self->_debug('sorted_sample_results', \%h); # warn Dumper \%h; return \%h; } #------------------------------------------------------------------------------- sub by_hmds_ref { my ($r1, $y1) = split '/', $a; # warn Dumper [$r1, $y1]; my ($r2, $y2) = split '/', $b; # warn Dumper [$r2, $y2]; return $y1 <=> $y2 || $r1 <=> $r2; } #------------------------------------------------------------------------------- sub _want_vep_result { # caller wants 1 or 0 return val (not undef): my $cols = shift; # arrayref my $consequence = $cols->[6]; # warn Dumper [ @{$cols}[6,12] ]; my $variation = $cols->[12]; return 1 if ( $variation =~ /COSM/ # want these regardless of consequence # reject if 'synonymous_variant', accept if variation matches /TMP|^-: || ( $variation =~ /TMP|^-/ && $consequence ne 'synonymous_variant' ) ); return 0; # don't want it # return 1 if $variation =~ /COSM/; # want these regardless of consequence # return 0 if $consequence eq 'synonymous_variant'; # don't want these # return ( $variation =~ /TMP|^-/ ); } #------------------------------------------------------------------------------- sub _get_hmds_refs { my $header = shift; # warn Dumper $header; # arrayref of hmds refs (in pairs) my $i = 0; # counter my @refs; # for cols 6 to last take the 1st col of each pair and extract lab_no: # $#{$header} = same as (scalar @array - 1) for ( @$header[5 .. $#{$header} - 1] ) { # warn Dumper [$_, $i]; next if $i++ % 2; # increment then test for modulus 2 - get every other col # warn Dumper [$_, $i]; my ($ref) = $_ =~ m!(\d+/\d{2})!; push @refs, $ref; } # warn Dumper \@refs; return \@refs; } #------------------------------------------------------------------------------- sub _get_form_options { # user submitted opts (eg sift, polyphen, etc) my $self = shift; my $params = $self->form_opts; # warn Dumper $form_opts; # default db is 'core', refseq passed by form selection if required: my $transcript_db = $params->{refseq} ? 'refseq' : 'core'; my @opts = ( '--dir=/home/raj/.vep/' . $transcript_db ); # polyphen & sift take args (s, p or b) push @opts, '--polyphen=' . $params->{polyphen} if $params->{polyphen}; push @opts, '--sift=' . $params->{sift} if $params->{sift}; # check_existing, regulatory & coding_only take no args: for ( qw/check_existing coding_only regulatory/ ) { push @opts, "--$_" if $params->{$_}; } # warn Dumper \@opts; return \@opts; } #------------------------------------------------------------------------------- sub by_chr_location { # alphanumeric sort on chromosome 1-22, X, and start_point return ( # 1st sort on chromosome, then start_point: ( $a->{chromosome} !~ /^\d+/ || $b->{chromosome} !~ /^\d+/ ) ? $a->{chromosome} cmp $b->{chromosome} # non-digit eg X : $a->{chromosome} <=> $b->{chromosome} # digit eg 1 - 22 ) || $a->{start_point} <=> $b->{start_point}; } #------------------------------------------------------------------------------- # extract tab-delimited data from csv source file: sub _text_csv_slurp { my $self = shift; my $data = $self->src_data; # warn $data; # string my $csv = Text::CSV->new(\%text_csv_args); my @results; for ( split "\n", $data ) { # warn $_; if ( $csv->parse($_) ) { my @row = $csv->fields(); push @results, \@row; } } # warn Dumper \@results; return \@results; } #------------------------------------------------------------------------------- # get reference table as hashref of gene => { chromosome, start_base }: sub _build_ref_table { my $self = shift; my $db = NGS::DB->new(); # warn $db->dbix; my $ref_tbl = $db->dbix->select('ref_seq', '*') ->map_hashes('gene'); # warn Dumper $ref_tbl; return $ref_tbl; } #------------------------------------------------------------------------------- sub _build_vep_input { my $self = shift; # create new temp file for vep input data: my $filename = $self->form_opts->{data_src}; $filename =~ s/txt\Z/vep/; # .txt -> .vep $filename =~ s/\s/_/g; # spaces -> underscores return '/tmp/'.$filename; } #------------------------------------------------------------------------------- sub _build_result_counter { my $self = shift; return { accepted => 0, matches => 0, no_match => 0, } } #------------------------------------------------------------------------------- sub _debug { # only debugging dev server output (env added in post => '/vep' sub): return 0 unless DEBUG && shift->form_opts->{environment} eq 'development'; print $debug Dumper @_; } =begin _build_ref_table() from ref.csv my $ref = "$Bin/../ref.csv"; my $io = new IO::File; open( $io, '<', $ref ) || die $!; my $csv = Text::CSV->new(\%text_csv_args); my %h; while ( my $row = $csv->getline( $io ) ) { my ($ref_seq, $chr, $start) = @$row; $h{$ref_seq} = { chromosome => $chr, start_base => $start, }; } # warn Dumper \%h; return \%h; =cut 1;