P3X GitList

package NGS::VEP;

# wrapper around Ensembl Variant Effect Predictor stand-alone script;
# http://www.ensembl.org/info/docs/variation/vep/vep_script.html

use Moo;
use IO::All;
use IO::File;
use Text::CSV;
use Path::Tiny;
use Data::Printer alias => 'ddp';
use Sort::Naturally;
use Capture::Tiny qw(:all);
use Time::HiRes qw(gettimeofday tv_interval);

use FindBin qw($Bin); # warn $Bin;
use Modern::Perl '2012';

use lib '/home/raj/perl-lib';
use Local::MooX::Types qw(HashReference ArrayReference Object String);

use NGS::DB;

has errors => ( is => 'ro', isa => ArrayReference, default => sub { [] } );

# required to be passed on object contruction:
has form_opts => ( is  => 'ro', isa => HashReference, required => 1 );
has data_file => ( is  => 'ro', isa => Object, required => 1 );
has config    => ( is  => 'ro', isa => HashReference, required => 1 );

# called by class methods:
has ref_table   => ( is => 'lazy' ); # _build_ref_table()
has vep_input   => ( is => 'lazy' ); # _build_vep_input()
has transcripts => ( is => 'lazy' ); # _build_transcripts()

# result counters:
has result_counter => ( is => 'lazy' ); # _build_result_counter() returns hashref
# locations table (include/exclude known start locations):
has known_locations => ( is => 'lazy' ); # _build_known_locations() returns hashref map

my %text_csv_args = ( sep_char => "\t", binary => 1 ); # global args for Text::CSV

# location of variable_effect_predictor script:
my $vep_script = path($Bin, '..', 'script', 'variant_effect_predictor.pl');

my $debug = new IO::File;
open $debug, '>', path($Bin, '..', 'debug.txt') || die $!;
$debug->autoflush(1); # to get past opening pipe to vep script

use constant DEBUG => 1; # 1 (on), 0 (off) - also uses running env (requires devel)
# list of data items required for VEP script input file:
use constant VEP_FIELDS => qw(chromosome start_point end_point allele strand);

# list of cols and index positions returned by VEP script:
my %VEP_COLS = (
    exon_id     => 0,
    location    => 1,
    allele 	    => 2,
    gene        => 3,
    feature     => 4,
    consequence => 6,
    cds         => 8,
    protein     => 9,
    aa          => 10,
    variation   => 12,
    extra       => 13,
);

# ==============================================================================
sub roche_454 {
    my $self = shift; # p $self->database;

# define vars ------------------------------------------------------------------
    # var: $src_data
    # extract tab-delimited data from NGS csv data file
    my $src_data = $self->_text_csv_slurp(); # p $src_data; # arrayref

    # var: $header_row
	# remove 1st row (= headers) from $src_data
	my $header_row = shift @$src_data; # p $header_row;

	# var: $hmds_ref_numbers
    # get hmds sample refs from header row - in cols starting at col #6
	my $hmds_ref_numbers = $self->_get_hmds_refs($header_row);

    # var: $ref_table
    # get chromosome & start points for exons - reference table from database
    my $ref_table = $self->ref_table; # p $ref_table;

    # var: @vep_input
    # pre-processed data for VEP input; list of hashrefs with keys = VEP_FIELDS
    # of 'accepted' rows from src_data file
    my @vep_input;

    # var: %exon_data_ref
    # hash map; keys = unique_id (chromosome_start-base_variant, eg 2_123456_C/T);
	# vals = @vep_field vals (chr, start, end, etc); acts as ref data for merging
	# with VEP script output
    my %exon_data_ref;

	# var: %exon_names
	# hash of unique exon names - checks that all are parsed correctly and exist
	# in %exon_data_ref (may not if different exons produce identical unique ids)
	my %exon_names;

# run --------------------------------------------------------------------------
    # var: $row
    # each row in NGS csv data array, cols = exon name, base/variant, status, etc
    ROW: for my $row (@$src_data) { # p $row;
		$self->result_counter->{src_rows}++; # p $self->result_counter;
        # skip row unless 3rd col (status field) reads 'accepted'
        next ROW unless lc $row->[2] eq 'accepted';
		$self->result_counter->{accepted_src_rows}++; # p $self->result_counter;

        # var: $exon_name
        # sequence name in 1st col eg ASXL exon 12.7, JAK2 exon 12.1
        my $exon_name = $row->[0]; # warn $exon_name;
		$exon_names{$exon_name}++;

        # var: $ref_data
        # reference data from database for exon name (skip row if no db entry)
        my $ref_data = $ref_table->{$exon_name}; # p $ref_data;
        if (! $ref_data) {
			my $str = 'no ref table entry for ' . $exon_name;
			$self->_add_to_errors($str); # warn $str;
			next ROW;
        }

        # var: $chromosome
        # chromosome number for exon, derived from reference data
        my $chromosome = $ref_data->{chromosome};

        # var: $variant
        # 2nd col is colon-separated base location & variant (eg 136:A/T, 26-27:TG/--)
        my $variant = $row->[1]; # warn $variant;

        # fix for FLT3-ITD 1 variant i(278.5,TGAGATCATATTCATATTTGAGATCATATTCATATT):
        unless ( $variant =~ /:/ ) {
			my $str = sprintf 'cannot split %s variant %s', $exon_name, $variant;
			$self->_add_to_errors($str); # warn $str;
			next ROW;
        }

        # var: $location, $allele
        # split base location & variant from $variant (eg 136:A/T, 26-27:TG/--)
        my ($location, $allele) = split ':', $variant; # p [$location, $allele];

        # var: $start_point, $end_point
        # define initial start & end points as reference data start position minus 1
        my $start_point = my $end_point = $ref_data->{start_base} - 1; # eg 125436276

        # if base location > 1 base (eg 26-27, 134-137, etc):
        if ( $location =~ /(\d+)-(\d+)/ ) { # warn $location; # eg 26-27
			my $span = $2 - $1; # eg 26-27 will give span = 1
            $start_point += $1; # ref tbl start pos + 1st capture (26 in above example)
            $end_point = $start_point + $span;
        }
        else {
            $end_point = $start_point += $location;
        } # p [ $start_point, $end_point ];

        {
            # var: %h
            # hash data for vep script input file, keys = VEP_FIELDS
            my %h = (
                chromosome  => $chromosome,
                start_point => $start_point,
                end_point   => $end_point,
                allele      => $allele, # eg T/C, A/G, etc
                strand      => '+', # required 5th col in VEP file
            ); # p \%h;
            push @vep_input, \%h;

			{ # add exon name and id unique identifier for $exon_data_ref (don't
				# want this in vep_input); exon name + variant/location = unique ID
				$h{exon_name} = $exon_name; # eg "TET2 exon 4"
				my $exon_id = sprintf '%s [%s]', $exon_name, $variant;
				$h{exon_id} = $exon_id; # eg "TET2 exon 4 [891:C/T]"
			}

            # var: $unique_id
            # create ID matching return from VEP script, created using combination
			# of chromosome, start base number & allele, eg 2_12345_C/G
            my $unique_id = join '_', $chromosome, $start_point, $allele;
            $exon_data_ref{$unique_id} = \%h;

            { # get patient results in cols 6 onwards:
				my $i = 0; # reset counter to 0
				for my $ref ( @$hmds_ref_numbers ) { # warn $ref;
                    # var: $result
					# percentage positive result - in 1st col of each sample ref pair
					my $result = $row->[5 + $i]; # warn $result; # col 6, 8, 10, etc
					if ( $result > 0 ) { # skip zero's
                        # var: $read_no
                        # read number - in 2nd col of each sample ref pair
						my $read_no = $row->[5 + $i + 1]; # col 7, 9, 11, etc

                        my %d = (
                            lab_no => $ref,
                            result => $result,
                            read   => $read_no,
                        ); # p \%d;
                        # add sample data to $exon_data_ref{$unique_id} as array
                        # (to allow same location/allele to have multiple samples):
                        push @{ $exon_data_ref{$unique_id}{sample_data} }, \%d;
                    };
					$i += 2; # move 2 cols along
				}
			}
        }
    } #  p \%exon_data_ref;
    $self->result_counter->{vep_input_rows} = scalar @vep_input;

	{ # check all exon names are included in %exon_data_ref:
		my @seen = map $exon_data_ref{$_}->{exon_name}, keys %exon_data_ref;
		# delete all 'seen' exon names:
		delete @exon_names{ @seen }; # p \@seen;
		# add any missing exon names to errors:
		$self->_add_to_errors("$_ omitted from VEP input") for keys %exon_names;
	}

	# sort data into chromosome order to (possibly) speed up vep processing:
    my @vep_fields = VEP_FIELDS;
	for ( sort by_chr_location @vep_input ) {
		my $line = join "\t", @{$_}{@vep_fields}; # warn $line;
		io($self->vep_input)->append($line . "\n");
	}

	# send to function running VEP script:

    # var: $result
    # hashref of output data from VEP script (consequence, amino_acids,
    # sift, polyphen, etc), results of patient samples & non-cds rows
	my $result = $self->_run_vep(\%exon_data_ref);
    { # add counts, errors, etc:
        my %h = (
            counts => $self->result_counter, # add result counters
            errors => $self->errors,
        );
        $result->{data} = \%h;
    }

	io($self->vep_input)->unlink || die "cannot unlink vep_input - $!";
	return $result;
}

#-------------------------------------------------------------------------------
sub miseq_vcf {
    my $self = shift;
    my $args = shift; # href outdir = str, files = array(ref)

    my $files   = $args->{files};
    my $tmp_dir = $args->{outdir}; # location of extracted files in /tmp

    # miseq run id = sub-directory name in /tmp dir (remove leading /tmp/):
    # my $miseq_id = substr($tmp_dir, 5); # warn $miseq_id; # not using

    my @vep_fields = VEP_FIELDS;

    # var: %exon_data_ref
    # hash map; keys = unique_id (chromosome_start-base_variant, eg 2_123456_C/T);
	# vals = @vep_field vals (chr, start, end, etc); acts as ref data for merging
	# with VEP script output
    my %exon_data_ref = ();

    my @vep = (); # reset array - to hold rows for vep input file
    my %seen; # to ensure unique gene/chromosome/start_point/allele combinations in vep

	FILE: for (@$files) { # p $_;
        my $io = io($_);

        my $file = $io->filename; # warn $file;
		next FILE unless $file =~ /vcf\Z/; # skip unless .vcf suffix
        # my $sample_id = join '/', $miseq_id, $file; # warn $sample_id; # not using

        my $lab_number = ( join '/', $file =~ /^[A-Z]?(\d+)-(\d+)/ )
        || die $self->config->{messages}->{filename}; # ddp $lab_number;

		my $data = $self->_parse_file($io); # p $data; # AoH
        ROW: for my $row (@$data) { # p $row; # hashref
            # create unique id:
            my $unique_id = join '_', @{$row}{qw(chromosome start_point allele)};

            # use gene name + unique_id as alias for exon_id (exon_id not supplied by miseq):
            my $exon_id_alias = join '_', $row->{gene}, $unique_id; # p $exon_id_alias;
            $row->{exon_id} = $exon_id_alias; # warn $exon_id_alias;

            # add vcf data to %exon_data_ref under unique_id key (unless already exists):
            unless ( $exon_data_ref{$unique_id} ) {
                my %h = map +( $_ => $row->{$_} ),
                    qw(chromosome start_point end_point allele exon_id);
                $exon_data_ref{$unique_id} = \%h;
            }

            { # add sample data to $exon_data_ref{$unique_id} sample_data array;
              # array allows multiple samples to have same location/allele:
                my %d = (
                    lab_no => $lab_number,
                    filter => $row->{filter},
                    result => $row->{vf},
                    read   => $row->{depth},
                ); # p \%d;
                push @{ $exon_data_ref{$unique_id}{sample_data} }, \%d;
            }
            # only need each variant once in vep input:
            next ROW if $seen{$exon_id_alias}++;
            push @vep, [ @{$row}{@vep_fields} ];
        }
	}
    $self->result_counter->{vep_input_rows} = scalar @vep;
    $self->_debug(exon_data_ref => \%exon_data_ref); # p \%seen;

# send to function running VEP script:
    # write out vars as tab-delimited row to vep input file, Sort::Naturally by chr:
=begin # this block will split vep input into separate chromosome inputs:
    for my $ref ( sort by_chromosome_and_start_pos @vep ) {
        my $row = join "\t", @$ref;
        my $chr = $ref->[0]; # warn $chr;
        my $file = join '_', $self->vep_input, $chr; # warn $file;
        io($file)->append($row . "\n");
    }
=cut
    for my $ref ( sort by_chromosome_and_start_pos @vep ) {
        my $row = join "\t", @$ref;
        io($self->vep_input)->append($row . "\n");
    }

    # var: $result
    # hashref of output data from VEP script (consequence, amino_acids,
    # sift, polyphen, etc), results of patient samples & non-cds rows
	my $result = $self->_run_vep(\%exon_data_ref); # p $result;

    # get map of transcripts table and corresponding cds/protein vars from vep:
    my $transcripts = $self->transcripts; # p $transcript;

    { # add counts, errors, etc:
        my %h = (
            transcripts => $transcripts,
            variants    => \%seen, # unique exon_id_aliases
            total       => scalar @$files,
            counts      => $self->result_counter, # add result counters
            errors      => $self->errors,
        ); # p \%h;
        $result->{data} = \%h;
    }

	# save samples, VEP & non-CDS data to db for retrieval for download:
    $self->save_data($result) unless $self->config->{environment} eq 'test';

# delete tmp files:
    # vep input file:
    if ( -e $self->vep_input ) { # undef if nothing returned from _parse_fields()
        io($self->vep_input)->unlink || die "cannot unlink vep_input - $!";
    }
    # src files:
    for (@$files) { # warn $_;
        io($_)->unlink || die "cannot unlink $_ - $!";
    }
    # only delete tmp_dir if it's a sub-dir of /tmp (not /tmp itself):
    unless ( $tmp_dir eq '/tmp' ) { # eg single file or zip of single file
        io($tmp_dir)->rmdir || die "cannot unlink $tmp_dir - $!";
    }

	return $result;
}

#-------------------------------------------------------------------------------
sub save_data { # duplicates effort done in tt so could return data-structure to tt
    my ($self, $result) = @_;

    my $db = NGS::DB->new(debug => $debug); # connection to mysql ngs db

    # initialise session_id accessor in db object:
    my $sess_id = $self->config->{session_id}; # p $sess_id; # session id
    $db->session_id($sess_id);

    { # sample data:
        my $exon_ref = $result->{exon_ref};
        my $variants = $result->{data}->{variants};
        my $samples  = $result->{samples}->{match}; # p $data;
        my $total    = $result->{data}->{total};
        my @cols     = @{ $db->get_cols('sample_data') };

        my @d;
        while ( my ($lab_no, $locations) = each %$samples ) { # str, HoH
            # ddp $lab_no; # p $locations;
            while ( my ($location, $data) = each %$locations ) { # str, AoH
                my $vep_data = $exon_ref->{$location}; # ddp $vep_data;
                    # ddp $location; ddp $result->{data};

                my $consequence = $vep_data->{consequence};
                my $variation   = $vep_data->{variation};
                my $polyphen    = $vep_data->{polyphen};
                my $protein     = $vep_data->{protein};
                my $exon_id     = $vep_data->{exon_id};
                my $sift        = $vep_data->{sift};
                my $cds         = $vep_data->{cds};

                my $vars_seen = $variants->{$exon_id};

                my $freq = sprintf '%s / %s [%2d%%]', # eg 45 / 48 [93.75%]
                    $vars_seen, $total, ( 100 * $vars_seen / $total ); # ddp $freq;

                for my $ref(@$data) { # ddp $ref; # ddp $location;
                    my $lab_no = $ref->{lab_no}; # warn $lab_no;
                    my $result = $ref->{result} / 100;
                    my $vep_no = $ref->{vep_row_count};
                    my $filter = $ref->{filter};
                    my $depth  = $ref->{read};

                    my ($request_no, $yr) = split '/', $lab_no; # split for order by

                    my %h; @h{@cols} = (
                        $sess_id,       # session_id
                        $request_no,    # request_number
                        $yr + 2000,     # year
                        $exon_id,       # exon_ref
                        $vep_no,        # vep_no
                        $freq,          # freq
                        $result,        # AF
                        $depth,         # depth
                        $filter,        # filter
                        $consequence,   # consequence
                        $cds,           # cds
                        $protein,       # protein
                        $variation,     # existing_variation
                        $sift,          # sift
                        $polyphen,      # polyphen
                    );
                    push @d, \%h;
                }
            }
        }
        if (@d) {
            my $tbl = 'sample_data';
            $db->update_vep_table($tbl, \@d)
            || $self->_add_to_errors( $db->error->{$tbl} );
        }
    }
    { # VEP data:
        my $data = $result->{vep_data}; # p $data;
        my @cols = @{ $db->get_cols('vep_data') };

        my @d = ();
        for my $aref (@$data) { # arrayref
            my %h; @h{@cols} = ( $sess_id, @$aref ); # p %h;
                push @d, \%h;
        }
        if (@d) {
            my $tbl = 'vep_data';
            $db->update_vep_table($tbl, \@d)
            || $self->_add_to_errors( $db->error->{$tbl} );
        }
    }
    { # non-CDS data:
        my $data = $result->{non_cds}; # p $data;
        my @cols = @{ $db->get_cols('non_cds') };

        my @d = ();
        for my $href (@$data) { # hashref
            $href->{id} = $sess_id;
            my %h; $h{$_} = $href->{$_} for @cols; # only want data for tbl cols
                push @d, \%h;
        }
        if (@d) {
            my $tbl = 'non_cds';
            $db->update_vep_table($tbl, \@d)
            || $self->_add_to_errors( $db->error->{$tbl} );
        }
    }
}

#-------------------------------------------------------------------------------
sub _parse_file {
    my ($self, $f) = @_; # io object

	my @rows = io($f->name)->slurp; # p @rows;

	my @data = (); # for AoH
	ROW: for (@rows) {
		next ROW if /^#/; # skip comments
		$self->result_counter->{src_rows}++;

        my @fields = split "\t", $_; # p @fields;
        my $vals = $self->_parse_fields(\@fields) || next ROW; # $vals = hashref
        push @data, $vals; # p $vals;
	}
    $self->result_counter->{accepted_src_rows} += scalar @data;
    return \@data;
}

#-------------------------------------------------------------------------------
sub _parse_fields {
    my $self = shift;
    my $data = shift; # arrayref of tab-separated vcf row entries

    return 0 unless $data->[0] =~ /^chr/; # or rest of regexes will fail

	my $chr     = $data->[0]; # warn $chr; # eg chrX, chr2, etc
    my $pos     = $data->[1]; # warn $pos; # numerical position
	my $ref     = $data->[3]; # warn $ref; # str of 1 or more A,C,T,G bases
	my $alt     = $data->[4]; # warn $alt; # str of 1 or more A,C,T,G bases
	my $filter  = $data->[6]; # warn $filter; # PASS, SB, R8, LowGQ, etc
    my $info    = $data->[7]; # warn $info # DP, TI, GI, FC, GT, etc values
	my $results = $data->[9]; # warn $results; # colon-separated results (eg VF)

	my $vf = (split ':', $results)[3] || warn 'no VF info'; # warn $vf;

    my $known_locations_map = $self->known_locations;

	my ($depth) = $info =~ /DP=(\d+)/; # warn $depth;
    my ($genes) = $info =~ /GI=(.*?)\;/; # warn $genes; # non-greedy match
    my $allele  = undef; # defined in block below

    # rules: exclude if start position matches exclusion list, depth < 100 or
    # VF < 5% (unless start position matches inclusion list):
    unless ( $known_locations_map->{include}->{$pos} ) {
        return 0 if
            $vf < 0.05 || $depth < 100 || $known_locations_map->{exclude}->{$pos}
    } # p $data;

    # define initial start & end positions = vcf 'position' var (2nd col):
    my $start_pos = my $end_pos = $pos;

    # define allele and refine start & end positions depending on ins/del/sub:
    if ( length $alt > 1 ) { # warn 'insertion'; # base insertion:
        $start_pos += 1; # end pos stays same
        $allele = sprintf '-/%s', substr($alt, 1); # alt minus 1st base
    }
    elsif ( length $ref > 1 ) { # warn 'deletion'; base deletion:
        $start_pos += 1;
        $end_pos += length($ref) - 1;
        $allele = sprintf '%s/-', substr($ref, 1); # ref minus 1st base
    }
    else { # warn 'substitution'; # alt & ref both single ie base substitution:
        $allele = sprintf '%s/%s', $ref, $alt; # warn $allele;
    }

    my @genes = do { # remove duplicate genes: eg TP53,TP53,TP53
        no warnings 'uninitialized'; # eg if regex failed due to malformed str
        my %seen = map { $_ => 1 } split ',', $genes;
        keys %seen;
    }; # warn @genes;
    my $gene = join ', ', @genes; #  || warn 'no GI entry';

    my %h = (
        chromosome  => substr($chr, 3), # remove leading 'chr' chars
        start_point => $start_pos,
        end_point   => $end_pos,
        allele      => $allele,
        filter      => $filter,
        depth       => $depth,
        gene        => $gene,
        vf          => $vf * 100, # convert proportion to percent
    ); # p \%h;

=begin # not using gene_strand table - all strands '+'
    GENE: for my $gene (@genes) {
        # + or - strand from gene_strand table:
        my $strand = $gene_strand->{$gene};
        unless ($strand) {
            warn "$gene not in gene_strand table";
            next GENE;
        } # p [ $chromosome, $start_pos, $end_pos, $allele, $strand ];
        $h{strand} = $strand;
    }
=cut
    $h{strand} = '+'; # not using gene_strand table
    return \%h;
}

#-------------------------------------------------------------------------------
sub _run_vep {
    my $self = shift;

    # var: $exon_data_ref
    # hash map; keys = unique_id (chromosome:start-base, eg 2:123456); vals =
    # @vep_field vals (chr, start, end, etc); acts as ref data for merging with
    # VEP script output
	my $exon_data_ref = shift; $self->_debug(['exon_data_ref',$exon_data_ref]);

    # var: $vep_input
    # full path to vep input file
	my $vep_input_filename = $self->vep_input; # warn $vep_input_filename;

    # var: @results
    # container to hold tab-delimited results array from vep output
	my @results;

=begin # this block handles separate chromosome vep input files:
my @opts = ( "-config=$Bin/../script/vep.ini" );
for my $chr( 1 .. 23, 'X') {
    my $src = join '_', $self->vep_input, $chr;
    next unless -e $src;

    # open pipe to vep script (input = $vep_file; output = stdout):
	my $io = io->pipe("$vep_script @opts -i=$src"); # use IO::Detect; warn is_filehandle($io);
	while ( my $line = $io->getline ) { # warn $line;
		push @results, $line;
	}  $self->_debug(['vep output',\@results]); # p \@results;
}
=cut

    # load opts from vep.ini + user-selected options to augment:
    my @opts = ( "-config=$Bin/../script/vep.ini", "-i=$vep_input_filename" );
    { # enabled forking on deployment server (actually slower on dev; see 'top'):
        my $forks = $self->config->{fork}; # p $forks;
        push @opts, "--fork=$forks" if $forks; # development.ini = 0
    } # p \@opts;

    my $user_opts = $self->_get_form_options;
    push @opts, @$user_opts; # p \@opts;
#	p io('/tmp/data.vep')->slurp; # from test scripts

    # open pipe to vep script (input = $vep_file; output = stdout):
	my $io = io->pipe("$vep_script @opts"); # use IO::Detect; warn is_filehandle($io);

    my $t0 = [gettimeofday]; # set t0
# start vep input ==============================================================
    { # wrap loop in capture_stderr block in case vep.pl issues a 'die' statement
        my $err = capture_stderr { # Capture::Tiny
            while ( my $line = $io->getline ) { # warn $line;
                push @results, $line;
            }
        };
        $self->_add_to_errors($err) if $err; # p $err;
    } $self->_debug(['vep output',\@results]); # p \@results;
# end vep input ================================================================

    $self->result_counter->{runtime} = tv_interval $t0, [gettimeofday]; # script runtime

    # var: @input_fields
    # list of data items from $exon_data_ref which are needed for adding to VEP output
    my @input_fields = qw(exon_id chromosome start_point end_point);

    # var: @vep_data
    # array of composite of vep input & vep output data
    my @vep_data;

    # var: %seen
    # running tally of vep inputs with 1 or more results
    my %seen;

    my $transcripts = $self->transcripts; # p $transcripts;

    RESULT: for (@results) { # p $_;
        next RESULT if /^#/; # skip comment lines
        my @cols = split /\t/; # p @cols;

        # var: $vep_exon_id
        # unique identifier - 1st col of vep output - in same format as
        # %$exon_data_ref key (eg 1_123456_C/T)
        my $vep_exon_id = $cols[$VEP_COLS{exon_id}]; # warn $vep_exon_id;

        # var: $ref
        # $exon_data_ref hashref for $vep_exon_id (start, end, chr, allele, etc)
        if ( my $ref = $exon_data_ref->{$vep_exon_id} ) { # p ['ref',$ref];
            $seen{$vep_exon_id}++; # vep input file has at least one result

            # get exon_id, chromosome, start_point & end_point vals into array:
            my @vars = map $ref->{$_}, @input_fields;

            # add allele, gene, feature, consequence, cds, protein, aa & existing
            # variation vars from vep row:
            push @vars, $cols[$VEP_COLS{$_}] for
                qw(allele gene feature consequence cds protein aa variation);

			# extract sift, polyphen, cds & protein vals from 'extra' col data:
            my $extra_data = $self->_get_extra_params( $cols[$VEP_COLS{extra}] );
            # add to @vars:
            push @vars, $extra_data->{$_} for qw(sift polyphen extra);

            # combine vep input data (chr, start, end, exon_id) with vep output
            # data (sift, polyphen data, consequence, allele, etc):
            push @vep_data, \@vars;

			{ # add consequence, existing variation, sift, polyphen & exon_id to
                # %exon_data_ref{$vep_exon_id}:
				my %h = ( exon_id => $ref->{exon_id} );

                { # add cds & protein if gene/feature combination in $transcripts:
                    my ($gene, $feature) = @cols[@VEP_COLS{'gene','feature'}];
                    my $transcripts_gene_feature = $transcripts->{$gene}
                    || $self->_add_to_errors($gene . ' not in transcripts db');

                    no warnings 'uninitialized'; # in case ! $transcripts_gene_feature
                    if ( $transcripts_gene_feature eq $feature ) {
                        # add consequence, variation, gene & feature from vep data:
                        $h{$_} = $cols[$VEP_COLS{$_}]
                            for qw(consequence variation);
                        # add sift, polyphen, cds & protein vars from vep 'extra' col:
                        $h{$_} = $extra_data->{$_}
                            for qw(sift polyphen cds protein);
                    } # p \%h;
                } # p \%h;
				{ # check location infos match before merging:
					my $vep_location = $cols[$VEP_COLS{location}]; # warn $vep_location;
                    my ($start, $end) = map $ref->{$_}, qw(start_point end_point);
                    my $loc = do {
                           if ( $end < $start ) { join '-', $end, $start } # insertion
                        elsif ( $end > $start ) { join '-', $start, $end } # deletion
                        else  { $start } # substitution
                    }; # warn $loc;

					my $ref_location = sprintf '%s:%s', $ref->{chromosome}, $loc;
                        # p [$vep_location,$ref_location];

					unless ( $vep_location eq $ref_location ) {
                        my $str = sprintf 'vep location (%s) != ref_location (%s)',
                            $vep_location, $ref_location;
                        $self->_add_to_errors($str); # warn $str;
                    }
				}
				# merge %h with $exon_data_ref entry for $vep_exon_id:
                map $exon_data_ref->{$vep_exon_id}->{$_} = $h{$_}, keys %h;
			}

			# var: $sample_data
			# sample data hashref from exon_data_ref for chr_location
			if ( my $sample_data = $ref->{sample_data} ) { # arrayref
				# examines vep consequence & existing_variation cols to determine
				# whether to use vep result for analysis of sample_data
				my $is_required = _want_vep_result(\@cols); # returns 1 or 0

				for (@$sample_data) {
					$_->{vep_row_count}++; # for vep_exon_id freq. in VEP results
					# flag to indicate whether to use result (multiple overwrites OK):
					$_->{use_vep_result} = $is_required; # 1 or 0
				}
			} # p $ref->{sample_data};
		}
        else { warn "no match in src file for $_" } # no match in src file
    } $self->_debug(['post VEP exon_data_ref', $exon_data_ref]);

    # has no vep result:
    my @orphans = sort by_chr_location
        map $exon_data_ref->{$_}, grep { ! $seen{$_} } keys %$exon_data_ref;

	# var: $samples
	# process sample_data in $exon_data_ref
	my $samples = $self->_process_sample_results($exon_data_ref); # returns hashref

	return {
		exon_ref => $exon_data_ref,
		vep_data => \@vep_data,
		non_cds  => \@orphans,
		samples  => $samples,
	}
}

#-------------------------------------------------------------------------------
# get sift & polyphen data from 'extra' col - also gets cds & protein annotations:
sub _get_extra_params {
	my ($self, $str) = @_; # warn $str;
    chomp $str; # warn $str;

    # regex to capture prediction and/or score eg SIFT=deleterious(0.02);
    #  PolyPhen=probably_damaging; SIFT=0.002: / (\w+)? ( \(? [\d\.]+ \)? )? /x
    my $re = qr/
        (\w+)?  # optional 'prediction' (eg benign, deleterious, etc)
        (       # start of optional 'score'
            \(?     # optional open bracket (only if prediction AND score selected)
            [\d\.]+ # any number of 0-9 & decimal point chars (eg 0.012)
            \)?     # optional close bracket (only if prediction AND score selected)
        )?      # end of optional 'score'
    /x;

	my %h; # capture sift, polyphen, cds & protein entries
	if ( $str =~ /sift=($re)/i ) {
		$h{sift} = $1; # warn $h{sift};
	}
	if ( $str =~ /polyphen=($re)/i ) {
		$h{polyphen} = $1; # warn $h{polyphen};
	} # p \%h;

	map { # remove captured items within $re from $str:
		my $quoted = quotemeta $_; # warn $quoted;
		$str =~ s/(sift|polyphen)=$quoted(;?)//i; # remove trailing semi-colon
	} grep $_, @h{ qw(sift polyphen) }; # warn $str; # !! grep $_ or empty vals cause problems

    # capture cds & protein vars - MUST happen after sift & polyphen entries removed:
    if ( my @parts = split ';', $str ) { # get cds & protein from $str:
    # STRAND=1;HGVSc=ENST00000305737.2:c.2285_2286insC;HGVSp=ENSP00000306705.2:p.Gln764ProfsTer5
        my ($x,$c,$p); # will discard $x both times:
        if ( my $str = $parts[1] ) { # 2nd item, split on colon:
            ($x,$c) = split ':', $str;
        }
        if ( my $str = $parts[2] ) { # 3rd item, split on colon:
            ($x,$p) = split ':', $parts[2]; # 3rd item, split on colon
        }
        @h{'cds','protein'} = ($c,$p);
    } # p \%h;

    $h{extra} = $str; # p \%h;
	return \%h;
}


#-------------------------------------------------------------------------------
sub _process_sample_results {
	my ($self, $data) = @_; # p $data;

	my %h;
	while ( my($location, $d) = each %$data ) {
		if ( my $sample_data = $d->{sample_data} ) { # p $sample_data;
            # $sample_data is array(ref):
			for my $ref (@$sample_data) { # p $ref;
				# transform $hmds_ref '/' to '_' for automatic tt sorting:
				my $lab_no = join '_', reverse split '/', $ref->{lab_no};
				$ref->{hmds_ref} = $lab_no; # warn $lab_no;

				if ( $ref->{use_vep_result} ) {
					push @{ $h{match}{$lab_no}{$location} }, $ref;
					$self->result_counter->{matches}++;
				}
				else {
					push @{ $h{no_match}{$lab_no}{$location} }, $ref;
					$self->result_counter->{no_match}++;
				}
            }
		}
	} $self->_debug('sorted_sample_results', \%h); # p \%h;
	return \%h;
}

#-------------------------------------------------------------------------------
sub by_hmds_ref {
	my ($r1, $y1) = split '/', $a; # p [$r1, $y1];
	my ($r2, $y2) = split '/', $b; # p [$r2, $y2];

	return $y1 <=> $y2 || $r1 <=> $r2;
}

#-------------------------------------------------------------------------------
sub _want_vep_result { # caller wants 1 or 0 return val (not undef):
	my $cols = shift; # arrayref

	my $consequence = $cols->[6]; # p [ @{$cols}[6,12] ];
	my $variation   = $cols->[12];

    return 1 if (
        $variation =~ /COSM/ # want these regardless of consequence
        # reject if 'synonymous_variant', accept if variation matches /TMP|^-:
        || ( $variation =~ /TMP|^-/ && $consequence ne 'synonymous_variant' )
    );
	return 0; # don't want it

#	return 1 if $variation =~ /COSM/; # want these regardless of consequence
#	return 0 if	$consequence eq 'synonymous_variant'; # don't want these
#	return ( $variation =~ /TMP|^-/ );
}

#-------------------------------------------------------------------------------
sub _get_hmds_refs {
	my ($self, $header) = @_; # p $header; # arrayref of hmds refs (in pairs)

	my $i = 0; # counter
	my @refs;

	# for cols 6 to last take the 1st col of each pair and extract lab_no:
	# $#{$header} = same as (scalar @array - 1)
	for ( @$header[5 .. $#{$header} - 1] ) { # p [$_, $i];
		next if $i++ % 2; # increment then test for modulus 2 - get every other col
		if ( my ($ref) = $_ =~ m!(\d+/\d{2})! ) {
			push @refs, $ref;
		}
		else {
			my $str = sprintf q!cannot find lab number in col header '%s'!, $_;
			$self->_add_to_errors($str);
		}
	} # p \@refs;
	return \@refs;
}

#-------------------------------------------------------------------------------
sub _get_form_options { # user submitted opts (eg sift, polyphen, etc)
    my $self = shift;

    my $params = $self->form_opts; # p $params;

    my $transcript_db = $params->{db}; # refseq, merged, core

    my @opts = ( '--dir=/home/raj/.vep/' . $transcript_db );

    ARG1: # polyphen & sift take args (s, p or b)
    for my $arg( qw/polyphen sift/ ) {
        my $val = $params->{$arg} || next ARG1; # param has a value
        push @opts, sprintf '--%s=%s', $arg, $val;
    }

    ARG2: # hgvs, check_existing, regulatory & coding_only take no args:
    for my $arg( qw/check_existing coding_only regulatory hgvs/ ) {
        $params->{$arg} || next ARG2; # param = 1 or 0
        push @opts, sprintf '--%s', $arg;
    } # p \@opts;

    return \@opts;
}

#-------------------------------------------------------------------------------
sub _add_to_errors {
	my ($self, $str) = @_;
	push @{ $self->errors }, $str;
}

#-------------------------------------------------------------------------------
sub by_chr_location { # alphanumeric sort on chromosome 1-22, X, and start_point
    return ( # 1st sort on chromosome, then start_point:
        ( $a->{chromosome} !~ /^\d+/ || $b->{chromosome} !~ /^\d+/ )
            ? $a->{chromosome} cmp $b->{chromosome} # non-digit eg X
            : $a->{chromosome} <=> $b->{chromosome} # digit eg 1 - 22
    ) || $a->{start_point} <=> $b->{start_point};
}

#-------------------------------------------------------------------------------
sub by_chromosome_and_start_pos { # Sort::Naturally ncmp function - same as above:
    return ncmp($a->[0], $b->[0]) || ncmp($a->[1], $b->[1]);
}

#-------------------------------------------------------------------------------
# extract tab-delimited data from csv source file:
sub _text_csv_slurp {
    my $self = shift;

    my $data = $self->data_file->content; # warn $data; # string

    my $csv = Text::CSV->new(\%text_csv_args);

    my @results;
    for ( split "\n", $data ) { # warn $_;
        if ( $csv->parse($_) ) {
            my @row = $csv->fields();
            push @results, \@row;
        }
    } # p \@results;
    return \@results;
}

#-------------------------------------------------------------------------------
# get reference table as hashref of gene => { chromosome, start_base }:
sub _build_ref_table { NGS::DB->new->ref_table }

#-------------------------------------------------------------------------------
sub _build_known_locations { NGS::DB->new->known_locations }

#-------------------------------------------------------------------------------
sub _build_transcripts { NGS::DB->new->transcripts }

#-------------------------------------------------------------------------------
sub _build_vep_input {
	my $self = shift;

	# create new temp file for vep input data:
	my $filename = $self->form_opts->{data_src}; # p $filename;

=begin # fix for test_scripts if doing:
 $mech->field('data_src', $Bin .'/data/myseq.zip');
 when data_src = /full/path/to/filename (expects just filename):

    if ($ENV{HARNESS_ACTIVE}) { # p %ENV;
        $filename = io($filename)->filename; # p $filename;
    }
=cut
	$filename =~ s/(txt|zip|vcf)\Z/vep/; # .txt -> .vep
	$filename =~ s/\s/_/g; # spaces -> underscores

    my $path_to_file = '/tmp/'.$filename;

    # delete if already exists (eg due to aborted previous run) as using append mode:
    io($path_to_file)->unlink if -e $path_to_file;
	return $path_to_file;
}

#-------------------------------------------------------------------------------
sub _build_result_counter {
	my $self = shift;

    my %h = map +($_ => 0),
        qw( matches no_match vep_input_rows src_rows accepted_src_rows runtime );
	return \%h;
}

#-------------------------------------------------------------------------------
sub _debug { # only debugging dev server output (env added in post => '/vep' sub):
	return 0 unless DEBUG && shift->config->{environment} eq 'development';
	print $debug ddp @_;
}

=begin _build_ref_table() from ref.csv
    my $ref = "$Bin/../ref.csv";
    my $io = new IO::File;
    open( $io, '<', $ref ) || die $!;

    my $csv = Text::CSV->new(\%text_csv_args);

    my %h;
    while ( my $row = $csv->getline( $io ) ) {
        my ($ref_seq, $chr, $start) = @$row;
        $h{$ref_seq} = {
            chromosome => $chr,
            start_base => $start,
        };
    } # p \%h;

    return \%h;
=cut

1;