package NGS::VEP; # wrapper around Ensembl Variant Effect Predictor stand-alone script; # http://www.ensembl.org/info/docs/variation/vep/vep_script.html use Moo; use IO::All; use IO::File; use Text::CSV; use Data::Dumper; use FindBin qw($Bin); # warn $Bin; use MooX::Types::MooseLike::Base qw(:all); use NGS::MooX::Types qw(HashReference); use NGS::DB; # required to be passed on object contruction: has src_data => ( is => 'ro', isa => Str, required => 1 ); has form_opts => ( is => 'ro', isa => HashReference, required => 1 ); # called internally by result(): has ref_table => ( is => 'lazy' ); # _build_ref_table() my %text_csv_args = ( sep_char => "\t", binary => 1 ); # global args for Text::CSV # location of variable_effect_predictor script: my $vep_script = "$Bin/../script/variant_effect_predictor.pl"; #=============================================================================== sub result { my $self = shift; # warn Dumper $self->database; my $data = $self->_text_csv_slurp(); # warn Dumper $data; # arrayref # remove 1st row (headers): my $header_row = shift @$data; # warn Dumper $header_row; # get hmds refs - in cols starting at col #6: my $hmds_refs = _get_hmds_refs($header_row); # get reference table (chromosome start & end points for sequences): my $ref_table = $self->ref_table; # warn Dumper $ref_table; # create vep file for variant_effect_predictor.pl my $vep_file = "$Bin/../results.vep"; io($vep_file)->unlink if -e ($vep_file); # clear previous my @vep_fields = qw(chromosome start_point end_point allele strand exon); my @vep_input; # will hold pre-processed data = VEP script input my %vep_data; # will hold ref data for merging with VEP script output my %hmds_ref; # will hold read data (% positive and read number) for samples my $i; # will hold total count of 'accepted' rows ROW: for my $row (@$data) { # cols = exon name, base/variant, status, .... next ROW unless lc $row->[2] eq 'accepted'; # 3rd col is status field $i++; # increment 'accepted' row counter my $exon = $row->[0]; # warn $exon; # get data from reference table, or skip record: my $ref = $ref_table->{$exon}; # warn Dumper $ref; if (! $ref) { warn "no ref table entry for $exon"; next ROW; } my $chromosome = $ref->{chromosome}; # split 2nd col to get location & variant (eg 136:A/T, 26-27:TG/--) my ($location, $allele) = split ':', $row->[1]; # warn Dumper [$location, $allele]; # define initial start & end point as ref table start position minus 1: my $start_point = my $end_point = $ref->{start_base} - 1; # eg 125436276 # if location > 1 base (eg 26-27, 134-137, etc): if ( $location =~ /(\d+)-(\d+)/ ) { # warn $location; # eg 26-27 my $span = $2 - $1; # eg 26-27 will give span = 1 $start_point += $1; # ref tbl start pos + 1st capture (26 in above example) $end_point = $start_point + $span; } else { $end_point = $start_point += $location; } { # manipulate exon name to provide unique ID for vep input # eg TET2 exon 4 [891], TET2 exon 4 [1031], etc: $exon =~ s/\s/%%/g; # vep can't deal with spaces in any col $exon .= '%%[' . $location . ']'; # exon + location gives unique ID } { # data structure for vep script input file: my %h = ( chromosome => $chromosome, start_point => $start_point, end_point => $end_point, allele => $allele, # eg T/C, A/G, etc strand => '+', # required 5th col in VEP file exon => $exon, # unique identifier eg "TET2 exon 4 [891]" ); # warn Dumper \%h; push @vep_input, \%h; { # data for use with vep output (use chr:start_location as unique ID): # vep returns eg 2:12345 or 2:12345-12347 in 'Location' col my $base_num = ( $start_point == $end_point ) ? $start_point # eg 2:12345 : $start_point .'-'. $end_point; # eg 2:12345-12347 my $unique_id = join ':', $chromosome, $base_num; $vep_data{$unique_id} = \%h; # get patient results in cols 6 onwards: my $n = 0; # reset counter to 0 for my $ref ( @$hmds_refs ) { # warn $ref; # result in 1st col of pair, read number in 2nd: my $result = $row->[5 + $n]; if ( $result > 0 ) { # skip zero's my $read_no = $row->[5 + $n + 1]; # next col # add to hmds ref $hmds_ref{$ref}{$unique_id} = { result => $result, read => $read_no, }; } $n += 2; # move 2 cols along } } } } # warn Dumper \%vep_data; # warn Dumper \%hmds_ref; # sort data into chromosome order to (possibly) speed up vep processing: for ( sort by_chr_location @vep_input ) { my $line = join "\t", @{$_}{@vep_fields}; io($vep_file)->append($line . "\n"); } { # send to function running VEP script: my %data = ( vep_data => \%vep_data, sample_data => \%hmds_ref ); my $result = $self->_run_vep(\%data); # hashref of data & orphaned rows $result->{row_count} = $i; # add row count of accepted rows return $result; } } #------------------------------------------------------------------------------- sub _run_vep { my ($self, $data) = @_; my $src_data = $data->{vep_data}; # "chr:start" => (start_point, end_point, allele, etc) my $hmds_ref = $data->{sample_data}; # hmds_ref => { unique_id => { read results } } my @results; # to hold tab-delimited results array from vep output # load opts from vep.ini + user-selected options to augment: my @opts = ( "-config=$Bin/../script/vep.ini" ); my $user_opts = $self->_get_form_options; push @opts, @$user_opts; # open pipe to vep script (input = $vep_file; output = stdout): open my $fh, "-|", $vep_script, @opts; while ( my $line = readline($fh) ) { # warn $line; push @results, $line; } # warn Dumper \@results; my @input_fields = qw(exon chromosome start_point end_point allele); my @data; # to hold composite of vep input & vep output my %seen; # running tally of vep inputs with 1 or more results for (@results) { next if /^#/; # skip comment lines my @cols = split /\t/; my $chr_location = $cols[1]; # in same format as %$src_data key (eg 2:123456) if ( my $ref = $src_data->{$chr_location} ) { # warn Dumper $ref; # capture vep cols: consequence, amino_acids, existing_variation, extra: my $extra = $cols[13]; # contains sift, polyphen, etc results # extract sift & polyphen into separate cols: my $results = $self->_prediction_and_score($extra); my @output_data = ( @cols[6,10,12], @$results ); push @data, [ @{$ref}{@input_fields}, @output_data ]; # combine data $seen{$chr_location}++; # vep input file has at least one result # add vep data to $hmds_ref data if it's useful: if ( _want_vep_result(\@cols) ) { # examines consequence & existing cols REF: while ( my ($hmds, $d) = each %$hmds_ref ) { ID: while ( my ($unique_id, $result) = each %$d ) { next ID unless $unique_id eq $chr_location; my %h = ( consequence => $cols[6], existing => $cols[12], polyphen => $results->[1], sift => $results->[0], exon => $ref->{exon}, ); # merge with $result: map $result->{$_} = $h{$_}, keys %h; } } } } else { warn "no match in src file for $_" } # no match in src file } # has no vep result: my @orphans = sort by_chr_location map $src_data->{$_}, grep { ! $seen{$_} } keys %$src_data; my $samples = _sort_sample_results($hmds_ref); # hashref return { data => \@data, orphans => \@orphans, samples => $samples } } #------------------------------------------------------------------------------- sub _prediction_and_score { my ($self, $str) = @_; # warn $str; # regex to capture prediction and/or score eg SIFT=deleterious(0.02); # PolyPhen=probably_damaging; SIFT=0.002: / (\w+)? ( \(? [\d\.]+ \)? )? /x my $re = qr/ (\w+)? # optional 'prediction' (eg benign, deleterious, etc) ( # start of optional 'score' \(? # optional open bracket (only if prediction AND score selected) [\d\.]+ # any number of 0-9 & decimal point chars (eg 0.012) \)? # optional close bracket (only if prediction AND score selected) )? # end of optional 'score' /x; my @results; # capture in order 'sift', 'polyphen' to match expected order in tt: if ( $str =~ /sift=($re)/i ) { push @results, $1; # warn $sift; } if ( $str =~ /polyphen=($re)/i ) { push @results, $1; # warn $poly; } =begin # combined, but doesn't work - gets killed while ( $str =~ /(?:sift|polyphen)=($re)/ig ) { # don't capture words push @results, $1; # warn $1; } =cut map { # remove captured items within $re from $str: my $quoted = quotemeta $_; $str =~ s/(sift|polyphen)=$quoted(;?)//i; # remove trailing semi-colon } @results; # warn $str; push @results, $str; # what's left of it (non-sift, non-polyphen 'extra') return \@results; } #------------------------------------------------------------------------------- sub _sort_sample_results { my $data = shift; # hashref of hmds_ref => { chr:location => data } my %h; REF: while ( my ($hmds_ref, $d ) = each %$data ) { # warn Dumper $d; LOC: while ( my ($location, $results) = each %$d ) { my $exon_name = $results->{exon} || next LOC; # transform $hmds_ref slash to underscore for automatic tt sorting: my $lab_no = join '_', reverse split '/', $hmds_ref; # warn $lab_no; # add hmds_ref to $result: $results->{hmds_ref} = $hmds_ref; $h{$lab_no}{$exon_name} = $results; } } # warn Dumper \%h; return \%h; } #------------------------------------------------------------------------------- sub by_hmds_ref { my ($r1, $y1) = split '/', $a; # warn Dumper [$r1, $y1]; my ($r2, $y2) = split '/', $b; # warn Dumper [$r2, $y2]; return $y1 <=> $y2 || $r1 <=> $r2; } #------------------------------------------------------------------------------- sub _want_vep_result { my $cols = shift; # arrayref my $consequence = $cols->[6]; my $existing_variation = $cols->[12]; return 1 if $consequence ne 'synonymous_variant' || $existing_variation eq '-' || $existing_variation =~ /COSM|TMP/; return 0; # don't want VEP result } #------------------------------------------------------------------------------- sub _get_hmds_refs { my $header = shift; # arrayref of hmds refs (in pairs) my $i = 0; # counter my @refs; for ( @$header[5 .. $#{$header} - 1] ) { next unless $i++ % 2; # increment then test for modulus 2 - get every other col my ($ref) = $_ =~ m!(\d+/\d{2})!; push @refs, $ref; } # warn Dumper \@refs; return \@refs; } #------------------------------------------------------------------------------- sub _get_form_options { # user submitted opts (eg sift, polyphen, etc) my $self = shift; my $params = $self->form_opts; # warn Dumper $form_opts; my @opts; # polyphen & sift take args (s, p or b) push @opts, '--polyphen=' . $params->{polyphen} if $params->{polyphen}; push @opts, '--sift=' . $params->{sift} if $params->{sift}; # check_existing, regulatory & coding_only take no args: for ( qw/check_existing coding_only regulatory/ ) { push @opts, "--$_" if $params->{$_}; } # warn Dumper \@opts; return \@opts; } #------------------------------------------------------------------------------- sub by_chr_location { # alphanumeric sort on chromosome 1-22, X, and start_point return ( # 1st sort on chromosome, then start_point: ( $a->{chromosome} !~ /^\d+/ || $b->{chromosome} !~ /^\d+/ ) ? $a->{chromosome} cmp $b->{chromosome} # non-digit eg X : $a->{chromosome} <=> $b->{chromosome} # digit eg 1 - 22 ) || $a->{start_point} <=> $b->{start_point}; } #------------------------------------------------------------------------------- # extract tab-delimited data from csv source file: sub _text_csv_slurp { my $self = shift; my $data = $self->src_data; # string my $csv = Text::CSV->new(\%text_csv_args); my @results; for ( split "\n", $data ) { # warn $_; if ( $csv->parse($_) ) { my @row = $csv->fields(); push @results, \@row; } } return \@results; } #------------------------------------------------------------------------------- # get reference table as hashref of gene => { chromosome, start_base }: sub _build_ref_table { my $self = shift; my $db = NGS::DB->new(); # warn $db->dbix; my $ref_tbl = $db->dbix->select('ref_seq', '*') ->map_hashes('gene'); # warn Dumper $ref_tbl; return $ref_tbl; } =begin _build_ref_table() from ref.csv my $ref = "$Bin/../ref.csv"; my $io = new IO::File; open( $io, '<', $ref ) || die $!; my $csv = Text::CSV->new(\%text_csv_args); my %h; while ( my $row = $csv->getline( $io ) ) { my ($ref_seq, $chr, $start) = @$row; $h{$ref_seq} = { chromosome => $chr, start_base => $start, }; } # warn Dumper \%h; return \%h; =cut 1;