package NGS::VEP; # wrapper around Ensembl Variant Effect Predictor stand-alone script; # http://www.ensembl.org/info/docs/variation/vep/vep_script.html use Moo; use IO::All; use IO::File; use Text::CSV; use Path::Tiny; use Log::Report; use Data::Printer alias => 'ddp'; use Sort::Naturally; use Capture::Tiny qw(:all); use Time::HiRes qw(gettimeofday tv_interval); use FindBin qw($Bin); # warn $Bin; use Modern::Perl '2012'; use lib '/home/raj/perl-lib'; use Local::Utils; use Local::MooX::Types qw(HashReference ArrayReference Object String); use NGS::DB; has errors => ( is => 'ro', isa => ArrayReference, default => sub { [] } ); # required to be passed on object contruction: has form_opts => ( is => 'ro', isa => HashReference, required => 1 ); has data_file => ( is => 'ro', isa => Object, required => 1 ); has config => ( is => 'ro', isa => HashReference, required => 1 ); # called by class methods: has ref_table => ( is => 'lazy' ); # _build_ref_table() has vep_input => ( is => 'lazy' ); # _build_vep_input() has transcripts => ( is => 'lazy' ); # _build_transcripts() # result counters: has result_counter => ( is => 'lazy' ); # _build_result_counter() returns hashref # locations table (include/exclude known start locations): has known_locations => ( is => 'lazy' ); # _build_known_locations() returns hashref map my %text_csv_args = ( sep_char => "\t", binary => 1 ); # global args for Text::CSV # location of variable_effect_predictor script: my $vep_script = path($Bin, '..', 'script', 'variant_effect_predictor.pl'); my $debug = new IO::File; open $debug, '>', path($Bin, '..', 'debug.txt') || fault $!; $debug->autoflush(1); # to get past opening pipe to vep script use constant DEBUG => 0; # 1 (on), 0 (off) - also uses running env (requires devel) # list of data items required for VEP script input file: use constant VEP_FIELDS => qw(chromosome start_point end_point allele strand); # maximum request number allowed by SMALLINT(6) UNSIGNED: use constant MAX_REQNUM_SIZE => 32767 * 2; # list of cols and index positions returned by VEP script: my %VEP_COLS = ( exon_id => 0, location => 1, allele => 2, gene => 3, feature => 4, consequence => 6, cds => 8, protein => 9, aa => 10, variation => 12, extra => 13, ); # ============================================================================== sub roche_454 { my $self = shift; # p $self->database; # define vars ------------------------------------------------------------------ # var: $src_data # extract tab-delimited data from NGS csv data file my $src_data = $self->_text_csv_slurp(); # p $src_data; # arrayref # var: $header_row # remove 1st row (= headers) from $src_data my $header_row = shift @$src_data; # p $header_row; # var: $hmds_ref_numbers # get hmds sample refs from header row - in cols starting at col #6 my $hmds_ref_numbers = $self->_get_hmds_refs($header_row); # var: $ref_table # get chromosome & start points for exons - reference table from database my $ref_table = $self->ref_table; # p $ref_table; # var: @vep_input # pre-processed data for VEP input; list of hashrefs with keys = VEP_FIELDS # of 'accepted' rows from src_data file my @vep_input; # var: %exon_data_ref # hash map; keys = unique_id (chromosome_start-base_variant, eg 2_123456_C/T); # vals = @vep_field vals (chr, start, end, etc); acts as ref data for merging # with VEP script output my %exon_data_ref; # var: %exon_names # hash of unique exon names - checks that all are parsed correctly and exist # in %exon_data_ref (may not if different exons produce identical unique ids) my %exon_names; # run -------------------------------------------------------------------------- # var: $row # each row in NGS csv data array, cols = exon name, base/variant, status, etc ROW: for my $row (@$src_data) { # p $row; $self->result_counter->{src_rows}++; # p $self->result_counter; # skip row unless 3rd col (status field) reads 'accepted' next ROW unless lc $row->[2] eq 'accepted'; $self->result_counter->{accepted_src_rows}++; # p $self->result_counter; # var: $exon_name # sequence name in 1st col eg ASXL exon 12.7, JAK2 exon 12.1 my $exon_name = $row->[0]; # warn $exon_name; $exon_names{$exon_name}++; # var: $ref_data # reference data from database for exon name (skip row if no db entry) my $ref_data = $ref_table->{$exon_name}; # p $ref_data; if (! $ref_data) { my $str = $exon_name . ' | no ref table entry'; warning $str; # warn $str; next ROW; } # var: $chromosome # chromosome number for exon, derived from reference data my $chromosome = $ref_data->{chromosome}; # var: $variant # 2nd col is colon-separated base location & variant (eg 136:A/T, 26-27:TG/--) my $variant = $row->[1]; # warn $variant; # fix for FLT3-ITD 1 variant i(278.5,TGAGATCATATTCATATTTGAGATCATATTCATATT): unless ( $variant =~ /:/ ) { my $str = sprintf 'cannot split %s variant %s', $exon_name, $variant; warning $str; # warn $str; next ROW; } # var: $location, $allele # split base location & variant from $variant (eg 136:A/T, 26-27:TG/--) my ($location, $allele) = split ':', $variant; # p [$location, $allele]; # var: $start_point, $end_point # define initial start & end points as reference data start position minus 1 my $start_point = my $end_point = $ref_data->{start_base} - 1; # eg 125436276 # if base location > 1 base (eg 26-27, 134-137, etc): if ( $location =~ /(\d+)-(\d+)/ ) { # warn $location; # eg 26-27 my $span = $2 - $1; # eg 26-27 will give span = 1 $start_point += $1; # ref tbl start pos + 1st capture (26 in above example) $end_point = $start_point + $span; } else { $end_point = $start_point += $location; } # p [ $start_point, $end_point ]; { # var: %h # hash data for vep script input file, keys = VEP_FIELDS my %h = ( chromosome => $chromosome, start_point => $start_point, end_point => $end_point, allele => $allele, # eg T/C, A/G, etc strand => '+', # required 5th col in VEP file ); # p \%h; push @vep_input, \%h; { # add exon name and id unique identifier for $exon_data_ref (don't # want this in vep_input); exon name + variant/location = unique ID $h{exon_name} = $exon_name; # eg "TET2 exon 4" my $exon_id = sprintf '%s [%s]', $exon_name, $variant; $h{exon_id} = $exon_id; # eg "TET2 exon 4 [891:C/T]" } # var: $unique_id # create ID matching return from VEP script, created using combination # of chromosome, start base number & allele, eg 2_12345_C/G my $unique_id = join '_', $chromosome, $start_point, $allele; $exon_data_ref{$unique_id} = \%h; { # get patient results in cols 6 onwards: my $i = 0; # reset counter to 0 for my $ref ( @$hmds_ref_numbers ) { # warn $ref; # var: $result # percentage positive result - in 1st col of each sample ref pair my $result = $row->[5 + $i]; # warn $result; # col 6, 8, 10, etc if ( $result > 0 ) { # skip zero's # var: $read_no # read number - in 2nd col of each sample ref pair my $read_no = $row->[5 + $i + 1]; # col 7, 9, 11, etc my %d = ( lab_no => $ref, result => $result, read => $read_no, ); # p \%d; # add sample data to $exon_data_ref{$unique_id} as array # (to allow same location/allele to have multiple samples): push @{ $exon_data_ref{$unique_id}{sample_data} }, \%d; }; $i += 2; # move 2 cols along } } } } # p \%exon_data_ref; $self->result_counter->{vep_input_rows} = scalar @vep_input; { # check all exon names are included in %exon_data_ref: my @seen = map $exon_data_ref{$_}->{exon_name}, keys %exon_data_ref; # delete all 'seen' exon names: delete @exon_names{ @seen }; # p \@seen; # add any missing exon names to errors: warning "$_ | omitted from VEP input" for keys %exon_names; } # sort data into chromosome order to (possibly) speed up vep processing: my @vep_fields = VEP_FIELDS; for ( sort by_chr_location @vep_input ) { my $line = join "\t", @{$_}{@vep_fields}; # warn $line; io($self->vep_input)->append($line . "\n"); } # send to function running VEP script: # var: $result # hashref of output data from VEP script (consequence, amino_acids, # sift, polyphen, etc), results of patient samples & non-cds rows my $result = $self->_run_vep(\%exon_data_ref); { # add counts, errors, etc: my %h = ( counts => $self->result_counter, # add result counters errors => $self->errors, ); $result->{data} = \%h; } io($self->vep_input)->unlink || fault "cannot unlink vep_input - $!"; return $result; } #------------------------------------------------------------------------------- sub miseq_vcf { my $self = shift; my $args = shift; # href outdir = str, files = array(ref) my $files = $args->{files}; my $tmp_dir = $args->{outdir}; # location of extracted files in /tmp # miseq run id = sub-directory name in /tmp dir (remove leading /tmp/): # my $miseq_id = substr($tmp_dir, 5); # warn $miseq_id; # not using my @vep_fields = VEP_FIELDS; # var: %exon_data_ref # hash map; keys = unique_id (chromosome_start-base_variant, eg 2_123456_C/T); # vals = @vep_field vals (chr, start, end, etc); acts as ref data for merging # with VEP script output my %exon_data_ref = (); my @vep = (); # reset array - to hold rows for vep input file my %seen; # to ensure unique gene/chromosome/start_point/allele combinations in vep FILE: for (@$files) { # ddp $_; my $io = io($_); my $file = $io->filename; # warn $file; next FILE unless $file =~ /vcf\Z/; # skip unless .vcf suffix # my $sample_id = join '/', $miseq_id, $file; # warn $sample_id; # not using my $lab_number = ( join '/', $file =~ /^[A-Z]?(\d+)-(\d+)/ ); # ddp $lab_number; unless ( $lab_number ) { warning join ' | ', $file, $self->config->{messages}->{filename}; next FILE; } my $data = $self->_parse_file($io); # p $data; # AoH ROW: for my $row (@$data) { # p $row; # hashref # create unique id: my $unique_id = join '_', @{$row}{qw(chromosome start_point allele)}; # use gene name + unique_id as alias for exon_id (exon_id not supplied by miseq): my $exon_id_alias = join '_', $row->{gene}, $unique_id; # p $exon_id_alias; $row->{exon_id} = $exon_id_alias; # warn $exon_id_alias; # add vcf data to %exon_data_ref under unique_id key (unless already exists): unless ( $exon_data_ref{$unique_id} ) { my %h = map +( $_ => $row->{$_} ), qw(chromosome start_point end_point allele exon_id); $exon_data_ref{$unique_id} = \%h; } { # add sample data to $exon_data_ref{$unique_id} sample_data array; # array allows multiple samples to have same location/allele: my %d = ( lab_no => $lab_number, filter => $row->{filter}, result => $row->{vf}, read => $row->{depth}, ); # p \%d; push @{ $exon_data_ref{$unique_id}{sample_data} }, \%d; } # only need each variant once in vep input: next ROW if $seen{$exon_id_alias}++; push @vep, [ @{$row}{@vep_fields} ]; } } $self->result_counter->{vep_input_rows} = scalar @vep; $self->_debug(exon_data_ref => \%exon_data_ref); # p \%seen; # send to function running VEP script: # write out vars as tab-delimited row to vep input file, Sort::Naturally by chr: =begin # this block will split vep input into separate chromosome inputs: for my $ref ( sort by_chromosome_and_start_pos @vep ) { my $row = join "\t", @$ref; my $chr = $ref->[0]; # warn $chr; my $file = join '_', $self->vep_input, $chr; # warn $file; io($file)->append($row . "\n"); } =cut for my $ref ( sort by_chromosome_and_start_pos @vep ) { my $row = join "\t", @$ref; io($self->vep_input)->append($row . "\n"); } # var: $result # hashref of output data from VEP script (consequence, amino_acids, # sift, polyphen, etc), results of patient samples & non-cds rows my $result = $self->_run_vep(\%exon_data_ref); # p $result; # get map of transcripts table and corresponding cds/protein vars from vep: my $transcripts = $self->transcripts; # p $transcript; { # add counts, errors, etc: # count files (linux includes directory in zip): my $file_count = grep { io($_)->type eq 'file' } @$files; # ddp $file_count; my %h = ( transcripts => $transcripts, variants => \%seen, # unique exon_id_aliases counts => $self->result_counter, # add result counters errors => $self->errors, total => $file_count, ); # p \%h; $result->{data} = \%h; } # save samples, VEP & non-CDS data to db for retrieval for download: $self->save_data($result); # unless $self->config->{environment} eq 'test'; # delete tmp files: # vep input file: if ( -e $self->vep_input ) { # undef if nothing returned from _parse_fields() io($self->vep_input)->unlink || fault "cannot unlink vep_input - $!"; } # src files: for (@$files) { # warn io($_)->type; next unless io($_)->type eq 'file'; # patch 7/10/15 for linux zip (contains dir) io($_)->unlink || fault "cannot unlink $_ - $!"; } # only delete tmp_dir if it's a sub-dir of /tmp (not /tmp itself): unless ( $tmp_dir eq '/tmp' ) { # eg single file or zip of single file io($tmp_dir)->rmdir || fault "cannot unlink $tmp_dir - $!"; } return $result; } #------------------------------------------------------------------------------- sub save_data { # duplicates effort done in tt so could return data-structure to tt my ($self, $result) = @_; my $environment = $self->config->{environment}; my $db = NGS::DB->new(debug => $debug, app_env => $environment ); # initialise session_id accessor in db object: my $sess_id = $self->config->{session_id}; # p $sess_id; # session id $db->session_id($sess_id); # my $t0 = [gettimeofday]; # for timing _validate_vep_data() { # sample data: my $this_yr = Local::Utils::this_year() - 2000; # ddp $this_yr; my $exon_ref = $result->{exon_ref}; my $variants = $result->{data}->{variants}; my $samples = $result->{samples}->{match}; # p $data; my $total = $result->{data}->{total}; my $defs = $db->get_col_defs('sample_data'); # ddp $defs; my @cols = @{ $db->get_cols('sample_data') }; my @d; SAMPLE: while ( my ($lab_no, $locations) = each %$samples ) { # str, HoH # ddp $lab_no; # p $locations; while ( my ($location, $data) = each %$locations ) { # str, AoH my $vep_data = $exon_ref->{$location}; # ddp $vep_data; # ddp $location; ddp $result->{data}; my $consequence = $vep_data->{consequence}; my $variation = $vep_data->{variation}; my $polyphen = $vep_data->{polyphen}; my $protein = $vep_data->{protein}; my $exon_id = $vep_data->{exon_id}; my $sift = $vep_data->{sift}; my $cds = $vep_data->{cds}; my $vars_seen = $variants->{$exon_id}; my $freq = sprintf '%s / %s [%2d%%]', # eg 45 / 48 [93.75%] $vars_seen, $total, ( 100 * $vars_seen / $total ); # ddp $freq; for my $ref(@$data) { # ddp $ref; # ddp $location; my $lab_no = $ref->{lab_no}; # warn $lab_no; my $result = $ref->{result} / 100; my $vep_no = $ref->{vep_row_count}; my $filter = $ref->{filter}; my $depth = $ref->{read}; my ($request_no, $yr) = split '/', $lab_no; # split for order by # validate lab-number before saving data: if ( $request_no > MAX_REQNUM_SIZE or $yr > $this_yr ) { warning "$lab_no | incorrect lab number format"; next SAMPLE; } my %h; @h{@cols} = ( $sess_id, # session_id $request_no, # request_number $yr + 2000, # year $exon_id, # exon_ref $vep_no, # vep_no $freq, # freq $result, # AF $depth, # depth $filter, # filter $consequence, # consequence $cds, # cds $protein, # protein $variation, # existing_variation $sift, # sift $polyphen, # polyphen ); push @d, \%h if _validate_vep_data(\%h, $defs); } } } if (@d) { my $tbl = 'sample_data'; $db->update_vep_table($tbl, \@d) or warning $db->error->{$tbl}; } } { # VEP data: my $data = $result->{vep_data}; # p $data; my @cols = @{ $db->get_cols('vep_data') }; my $defs = $db->get_col_defs('vep_data'); # ddp $defs; my @d = (); for my $aref (@$data) { # arrayref my %h; @h{@cols} = ( $sess_id, @$aref ); # ddp %h; # validate data var lengths against col_defs: push @d, \%h if _validate_vep_data(\%h, $defs); } if (@d) { my $tbl = 'vep_data'; $db->update_vep_table($tbl, \@d) or warning $db->error->{$tbl}; } } { # non-CDS data: my $data = $result->{non_cds}; # p $data; my @cols = @{ $db->get_cols('non_cds') }; # ddp @cols; my $defs = $db->get_col_defs('non_cds'); # ddp $defs; my @d = (); for my $href (@$data) { # hashref $href->{id} = $sess_id; my %h; $h{$_} = $href->{$_} for @cols; # only want data for tbl cols push @d, \%h if _validate_vep_data(\%h, $defs); } if (@d) { my $tbl = 'non_cds'; $db->update_vep_table($tbl, \@d) or warning $db->error->{$tbl}; } } # my $runtime = tv_interval $t0, [gettimeofday]; ddp $runtime; } #------------------------------------------------------------------------------- sub _parse_file { my ($self, $f) = @_; # io object my @rows = io($f->name)->slurp; # p @rows; my @data = (); # for AoH ROW: for (@rows) { next ROW if /^#/; # skip comments $self->result_counter->{src_rows}++; my @fields = split "\t", $_; # p @fields; my $vals = $self->_parse_fields(\@fields) || next ROW; # $vals = hashref push @data, $vals; # p $vals; } $self->result_counter->{accepted_src_rows} += scalar @data; return \@data; } #------------------------------------------------------------------------------- sub _parse_fields { my $self = shift; my $data = shift; # arrayref of tab-separated vcf row entries return 0 unless $data->[0] =~ /^chr/; # or rest of regexes will fail my $chr = $data->[0]; # warn $chr; # eg chrX, chr2, etc my $pos = $data->[1]; # warn $pos; # numerical position my $ref = $data->[3]; # warn $ref; # str of 1 or more A,C,T,G bases my $alt = $data->[4]; # warn $alt; # str of 1 or more A,C,T,G bases my $filter = $data->[6]; # warn $filter; # PASS, SB, R8, LowGQ, etc my $info = $data->[7]; # warn $info # DP, TI, GI, FC, GT, etc values my $results = $data->[9]; # warn $results; # colon-separated results (eg VF) my $vf = (split ':', $results)[3] || warn 'no VF info'; # warn $vf; my $known_locations_map = $self->known_locations; my ($depth) = $info =~ /DP=(\d+)/; # warn $depth; my ($genes) = $info =~ /GI=(.*?)\;/; # warn $genes; # non-greedy match my $allele = undef; # defined in block below # rules: exclude if start position matches exclusion list, depth < 100 or # VF < 5% (unless start position matches inclusion list): unless ( $known_locations_map->{include}->{$pos} ) { return 0 if $vf < 0.05 || $depth < 100 || $known_locations_map->{exclude}->{$pos} } # p $data; # define initial start & end positions = vcf 'position' var (2nd col): my $start_pos = my $end_pos = $pos; # define allele and refine start & end positions depending on ins/del/sub: if ( length $alt > 1 ) { # warn 'insertion'; # base insertion: $start_pos += 1; # end pos stays same $allele = sprintf '-/%s', substr($alt, 1); # alt minus 1st base } elsif ( length $ref > 1 ) { # warn 'deletion'; base deletion: $start_pos += 1; $end_pos += length($ref) - 1; $allele = sprintf '%s/-', substr($ref, 1); # ref minus 1st base } else { # warn 'substitution'; # alt & ref both single ie base substitution: $allele = sprintf '%s/%s', $ref, $alt; # warn $allele; } my @genes = do { # remove duplicate genes: eg TP53,TP53,TP53 no warnings 'uninitialized'; # eg if regex failed due to malformed str my %seen = map { $_ => 1 } split ',', $genes; keys %seen; }; # warn @genes; my $gene = join ', ', @genes; # || warn 'no GI entry'; my %h = ( chromosome => substr($chr, 3), # remove leading 'chr' chars start_point => $start_pos, end_point => $end_pos, allele => $allele, filter => $filter, depth => $depth, gene => $gene, vf => $vf * 100, # convert proportion to percent ); # p \%h; =begin # not using gene_strand table - all strands '+' GENE: for my $gene (@genes) { # + or - strand from gene_strand table: my $strand = $gene_strand->{$gene}; unless ($strand) { warn "$gene not in gene_strand table"; next GENE; } # p [ $chromosome, $start_pos, $end_pos, $allele, $strand ]; $h{strand} = $strand; } =cut $h{strand} = '+'; # not using gene_strand table return \%h; } #------------------------------------------------------------------------------- sub _run_vep { my $self = shift; # var: $exon_data_ref # hash map; keys = unique_id (chromosome:start-base, eg 2:123456); vals = # @vep_field vals (chr, start, end, etc); acts as ref data for merging with # VEP script output my $exon_data_ref = shift; $self->_debug(['exon_data_ref',$exon_data_ref]); # var: $vep_input # full path to vep input file my $vep_input_filename = $self->vep_input; # warn $vep_input_filename; # var: @results # container to hold tab-delimited results array from vep output my @results; =begin # this block handles separate chromosome vep input files: my @opts = ( "-config=$Bin/../script/vep.ini" ); for my $chr( 1 .. 23, 'X') { my $src = join '_', $self->vep_input, $chr; next unless -e $src; # open pipe to vep script (input = $vep_file; output = stdout): my $io = io->pipe("$vep_script @opts -i=$src"); # use IO::Detect; warn is_filehandle($io); while ( my $line = $io->getline ) { # warn $line; push @results, $line; } $self->_debug(['vep output',\@results]); # p \@results; } =cut # load opts from vep.ini + user-selected options to augment: my @opts = ( "-config=$Bin/../script/vep.ini", "-i=$vep_input_filename" ); { # enabled forking on deployment server (actually slower on dev; see 'top'): my $forks = $self->config->{fork}; # p $forks; push @opts, "--fork=$forks" if $forks; # development.ini = 0 } # p \@opts; my $user_opts = $self->_get_form_options; push @opts, @$user_opts; # p \@opts; # p io('/tmp/data.vep')->slurp; # from test scripts # open pipe to vep script (input = $vep_file; output = stdout): my $io = io->pipe("$vep_script @opts"); # use IO::Detect; warn is_filehandle($io); my $t0 = [gettimeofday]; # set t0 # start vep input ============================================================== { # wrap loop in capture_stderr block in case vep.pl issues a 'die' statement my $err = capture_stderr { # Capture::Tiny while ( my $line = $io->getline ) { # warn $line; push @results, $line; } }; $self->_add_to_errors($err) if $err; # p $err; } $self->_debug(['vep output',\@results]); # p \@results; # end vep input ================================================================ $self->result_counter->{runtime} = tv_interval $t0, [gettimeofday]; # script runtime # var: @input_fields # list of data items from $exon_data_ref which are needed for adding to VEP output my @input_fields = qw(exon_id chromosome start_point end_point); # var: @vep_data # array of composite of vep input & vep output data my @vep_data; # var: %seen # running tally of vep inputs with 1 or more results my %seen; my $transcripts = $self->transcripts; # p $transcripts; RESULT: for (@results) { # p $_; next RESULT if /^#/; # skip comment lines my @cols = split /\t/; # p @cols; # var: $vep_exon_id # unique identifier - 1st col of vep output - in same format as # %$exon_data_ref key (eg 1_123456_C/T) my $vep_exon_id = $cols[$VEP_COLS{exon_id}]; # warn $vep_exon_id; # var: $ref # $exon_data_ref hashref for $vep_exon_id (start, end, chr, allele, etc) if ( my $ref = $exon_data_ref->{$vep_exon_id} ) { # p ['ref',$ref]; $seen{$vep_exon_id}++; # vep input file has at least one result # get exon_id, chromosome, start_point & end_point vals into array: my @vars = map $ref->{$_}, @input_fields; # add allele, gene, feature, consequence, cds, protein, aa & existing # variation vars from vep row: push @vars, $cols[$VEP_COLS{$_}] for qw(allele gene feature consequence cds protein aa variation); # extract sift, polyphen, cds & protein vals from 'extra' col data: my $extra_data = $self->_get_extra_params( $cols[$VEP_COLS{extra}] ); # add to @vars: push @vars, $extra_data->{$_} for qw(sift polyphen extra); # combine vep input data (chr, start, end, exon_id) with vep output # data (sift, polyphen data, consequence, allele, etc): push @vep_data, \@vars; { # add consequence, existing variation, sift, polyphen & exon_id to # %exon_data_ref{$vep_exon_id}: my %h = ( exon_id => $ref->{exon_id} ); { # add cds & protein if gene/feature combination in $transcripts: my ($gene, $feature) = @cols[@VEP_COLS{'gene','feature'}]; my $transcripts_gene_feature = $transcripts->{$gene} || warning $gene . ' | not in transcripts db'; no warnings 'uninitialized'; # in case ! $transcripts_gene_feature if ( $transcripts_gene_feature eq $feature ) { # add consequence, variation, gene & feature from vep data: $h{$_} = $cols[$VEP_COLS{$_}] for qw(consequence variation); # add sift, polyphen, cds & protein vars from vep 'extra' col: $h{$_} = $extra_data->{$_} for qw(sift polyphen cds protein); } # p \%h; } # p \%h; { # check location infos match before merging: my $vep_location = $cols[$VEP_COLS{location}]; # warn $vep_location; my ($start, $end) = map $ref->{$_}, qw(start_point end_point); my $loc = do { if ( $end < $start ) { join '-', $end, $start } # insertion elsif ( $end > $start ) { join '-', $start, $end } # deletion else { $start } # substitution }; # warn $loc; my $ref_location = sprintf '%s:%s', $ref->{chromosome}, $loc; # p [$vep_location,$ref_location]; unless ( $vep_location eq $ref_location ) { my $str = sprintf 'vep location (%s) != ref_location (%s)', $vep_location, $ref_location; warning $str; # warn $str; } } # merge %h with $exon_data_ref entry for $vep_exon_id: map $exon_data_ref->{$vep_exon_id}->{$_} = $h{$_}, keys %h; } # var: $sample_data # sample data hashref from exon_data_ref for chr_location if ( my $sample_data = $ref->{sample_data} ) { # arrayref # examines vep consequence & existing_variation cols to determine # whether to use vep result for analysis of sample_data my $is_required = _want_vep_result(\@cols); # returns 1 or 0 for (@$sample_data) { $_->{vep_row_count}++; # for vep_exon_id freq. in VEP results # flag to indicate whether to use result (multiple overwrites OK): $_->{use_vep_result} = $is_required; # 1 or 0 } } # p $ref->{sample_data}; } else { warn "no match in src file for $_" } # no match in src file } $self->_debug(['post VEP exon_data_ref', $exon_data_ref]); # has no vep result: my @orphans = sort by_chr_location map $exon_data_ref->{$_}, grep { ! $seen{$_} } keys %$exon_data_ref; # var: $samples # process sample_data in $exon_data_ref my $samples = $self->_process_sample_results($exon_data_ref); # returns hashref return { exon_ref => $exon_data_ref, vep_data => \@vep_data, non_cds => \@orphans, samples => $samples, } } #------------------------------------------------------------------------------- # get sift & polyphen data from 'extra' col - also gets cds & protein annotations: sub _get_extra_params { my ($self, $str) = @_; # warn $str; chomp $str; # warn $str; # regex to capture prediction and/or score eg SIFT=deleterious(0.02); # PolyPhen=probably_damaging; SIFT=0.002: / (\w+)? ( \(? [\d\.]+ \)? )? /x my $re = qr/ (\w+)? # optional 'prediction' (eg benign, deleterious, etc) ( # start of optional 'score' \(? # optional open bracket (only if prediction AND score selected) [\d\.]+ # any number of 0-9 & decimal point chars (eg 0.012) \)? # optional close bracket (only if prediction AND score selected) )? # end of optional 'score' /x; my %h; # capture sift, polyphen, cds & protein entries if ( $str =~ /sift=($re)/i ) { $h{sift} = $1; # warn $h{sift}; } if ( $str =~ /polyphen=($re)/i ) { $h{polyphen} = $1; # warn $h{polyphen}; } # p \%h; map { # remove captured items within $re from $str: my $quoted = quotemeta $_; # warn $quoted; $str =~ s/(sift|polyphen)=$quoted(;?)//i; # remove trailing semi-colon } grep $_, @h{ qw(sift polyphen) }; # warn $str; # !! grep $_ or empty vals cause problems # capture cds & protein vars - MUST happen after sift & polyphen entries removed: if ( my @parts = split ';', $str ) { # get cds & protein from $str: # STRAND=1;HGVSc=ENST00000305737.2:c.2285_2286insC;HGVSp=ENSP00000306705.2:p.Gln764ProfsTer5 my ($x,$c,$p); # will discard $x both times: if ( my $str = $parts[1] ) { # 2nd item, split on colon: ($x,$c) = split ':', $str; } if ( my $str = $parts[2] ) { # 3rd item, split on colon: ($x,$p) = split ':', $parts[2]; # 3rd item, split on colon } @h{'cds','protein'} = ($c,$p); } # p \%h; $h{extra} = $str; # p \%h; return \%h; } #------------------------------------------------------------------------------- sub _process_sample_results { my ($self, $data) = @_; # p $data; my %h; while ( my($location, $d) = each %$data ) { if ( my $sample_data = $d->{sample_data} ) { # p $sample_data; # $sample_data is array(ref): for my $ref (@$sample_data) { # p $ref; # transform $hmds_ref '/' to '_' for automatic tt sorting: my $lab_no = join '_', reverse split '/', $ref->{lab_no}; $ref->{hmds_ref} = $lab_no; # warn $lab_no; if ( $ref->{use_vep_result} ) { push @{ $h{match}{$lab_no}{$location} }, $ref; $self->result_counter->{matches}++; } else { push @{ $h{no_match}{$lab_no}{$location} }, $ref; $self->result_counter->{no_match}++; } } } } $self->_debug('sorted_sample_results', \%h); # p \%h; return \%h; } #------------------------------------------------------------------------------- sub by_hmds_ref { my ($r1, $y1) = split '/', $a; # p [$r1, $y1]; my ($r2, $y2) = split '/', $b; # p [$r2, $y2]; return $y1 <=> $y2 || $r1 <=> $r2; } #------------------------------------------------------------------------------- sub _want_vep_result { # caller wants 1 or 0 return val (not undef): my $cols = shift; # arrayref my $consequence = $cols->[6]; # p [ @{$cols}[6,12] ]; my $variation = $cols->[12]; return 1 if ( $variation =~ /COSM/ # want these regardless of consequence # reject if 'synonymous_variant', accept if variation matches /TMP|^-: || ( $variation =~ /TMP|^-/ && $consequence ne 'synonymous_variant' ) ); return 0; # don't want it # return 1 if $variation =~ /COSM/; # want these regardless of consequence # return 0 if $consequence eq 'synonymous_variant'; # don't want these # return ( $variation =~ /TMP|^-/ ); } #------------------------------------------------------------------------------- sub _get_hmds_refs { my ($self, $header) = @_; # p $header; # arrayref of hmds refs (in pairs) my $i = 0; # counter my @refs; # for cols 6 to last take the 1st col of each pair and extract lab_no: # $#{$header} = same as (scalar @array - 1) for ( @$header[5 .. $#{$header} - 1] ) { # p [$_, $i]; next if $i++ % 2; # increment then test for modulus 2 - get every other col if ( my ($ref) = $_ =~ m!(\d+/\d{2})! ) { push @refs, $ref; } else { my $str = sprintf q!cannot find lab number in col header '%s'!, $_; warning $str; } } # p \@refs; return \@refs; } #------------------------------------------------------------------------------- sub _get_form_options { # user submitted opts (eg sift, polyphen, etc) my $self = shift; my $params = $self->form_opts; # p $params; my $transcript_db = $params->{db}; # refseq, merged, core my @opts = ( '--dir=/home/raj/.vep/' . $transcript_db ); ARG1: # polyphen & sift take args (s, p or b) for my $arg( qw/polyphen sift/ ) { my $val = $params->{$arg} || next ARG1; # param has a value push @opts, sprintf '--%s=%s', $arg, $val; } ARG2: # hgvs, check_existing, regulatory & coding_only take no args: for my $arg( qw/check_existing coding_only regulatory hgvs/ ) { $params->{$arg} || next ARG2; # param = 1 or 0 push @opts, sprintf '--%s', $arg; } # p \@opts; return \@opts; } #------------------------------------------------------------------------------- # checks length of data value not > max permitted for db col: sub _validate_vep_data { # no detectable performance issue on dev for 7801 src file rows my ($data, $col_def) = @_; # ddp $col_def; COL: while ( my($col, $val) = each %$data ) { # say "$col = $val"; my $max_length = $col_def->{$col}; next COL unless $val && $max_length; # because validation not required my $val_length = length $val; # say "length: $val_length; max: $max_length"; if ( $val_length > $max_length ) { # say "$val_length > $max_length"; warning "$col | length $val_length > max allowed [$max_length]"; return 0; } } return 1; # ok validated } #------------------------------------------------------------------------------- sub _add_to_errors { my ($self, $str) = @_; push @{ $self->errors }, $str; # or could use Log::Report::warning $str; } #------------------------------------------------------------------------------- sub by_chr_location { # alphanumeric sort on chromosome 1-22, X, and start_point return ( # 1st sort on chromosome, then start_point: ( $a->{chromosome} !~ /^\d+/ || $b->{chromosome} !~ /^\d+/ ) ? $a->{chromosome} cmp $b->{chromosome} # non-digit eg X : $a->{chromosome} <=> $b->{chromosome} # digit eg 1 - 22 ) || $a->{start_point} <=> $b->{start_point}; } #------------------------------------------------------------------------------- sub by_chromosome_and_start_pos { # Sort::Naturally ncmp function - same as above: return ncmp($a->[0], $b->[0]) || ncmp($a->[1], $b->[1]); } #------------------------------------------------------------------------------- # extract tab-delimited data from csv source file: sub _text_csv_slurp { my $self = shift; my $data = $self->data_file->content; # warn $data; # string my $csv = Text::CSV->new(\%text_csv_args); my @results; for ( split "\n", $data ) { # warn $_; if ( $csv->parse($_) ) { my @row = $csv->fields(); push @results, \@row; } } # p \@results; return \@results; } #------------------------------------------------------------------------------- # get reference table as hashref of gene => { chromosome, start_base }: sub _build_ref_table { NGS::DB->new->ref_table } #------------------------------------------------------------------------------- sub _build_known_locations { NGS::DB->new->known_locations } #------------------------------------------------------------------------------- sub _build_transcripts { NGS::DB->new->transcripts } #------------------------------------------------------------------------------- sub _build_vep_input { my $self = shift; # create new temp file for vep input data: my $filename = $self->form_opts->{data_src}; # p $filename; =begin # fix for test_scripts if doing: $mech->field('data_src', $Bin .'/data/myseq.zip'); when data_src = /full/path/to/filename (expects just filename): if ($ENV{HARNESS_ACTIVE}) { # p %ENV; $filename = io($filename)->filename; # p $filename; } =cut $filename =~ s/(txt|zip|vcf)\Z/vep/; # .txt -> .vep $filename =~ s/\s/_/g; # spaces -> underscores my $path_to_file = '/tmp/'.$filename; # delete if already exists (eg due to aborted previous run) as using append mode: io($path_to_file)->unlink if -e $path_to_file; return $path_to_file; } #------------------------------------------------------------------------------- sub _build_result_counter { my $self = shift; my %h = map +($_ => 0), qw( matches no_match vep_input_rows src_rows accepted_src_rows runtime ); return \%h; } #------------------------------------------------------------------------------- sub _debug { # only debugging dev server output (env added in post => '/vep' sub): return 0 unless DEBUG && shift->config->{environment} eq 'development'; print $debug ddp @_; } =begin _build_ref_table() from ref.csv my $ref = "$Bin/../ref.csv"; my $io = new IO::File; open( $io, '<', $ref ) || die $!; my $csv = Text::CSV->new(\%text_csv_args); my %h; while ( my $row = $csv->getline( $io ) ) { my ($ref_seq, $chr, $start) = @$row; $h{$ref_seq} = { chromosome => $chr, start_base => $start, }; } # p \%h; return \%h; =cut 1;