P3X GitList

package NGS::VEP;

# wrapper around Ensembl Variant Effect Predictor stand-alone script;
# http://www.ensembl.org/info/docs/variation/vep/vep_script.html

use Moo;
use IO::All;
use IO::File;
use Text::CSV;
use Data::Dumper;
use FindBin qw($Bin); # warn $Bin;

use MooX::Types::MooseLike::Base qw(:all);
use NGS::MooX::Types qw(HashReference);
use NGS::DB;

# required to be passed on object contruction:
has src_data  => ( is  => 'ro', isa => Str, required => 1 );
has form_opts => ( is  => 'ro', isa => HashReference, required => 1 );

# called internally by result():
has ref_table => ( is => 'lazy' ); # _build_ref_table()

my %text_csv_args = ( sep_char => "\t", binary => 1 ); # global args for Text::CSV

# location of variable_effect_predictor script:
my $vep_script = "$Bin/../script/variant_effect_predictor.pl";

#===============================================================================
sub result {
    my $self = shift; # warn Dumper $self->database;
    
    my $data = $self->_text_csv_slurp(); # warn Dumper $data; # arrayref

    # get reference table:
    my $ref_table = $self->ref_table; # warn Dumper $reference;
    
    # create vep file for variant_effect_predictor.pl
    my $vep_file = "$Bin/../results.vep";
    io($vep_file)->unlink if -e ($vep_file); # clear previous

    my @vep_fields = qw(chromosome start_point end_point allele strand exon);    
    my %vep_data; # will hold ref data for merging with VEP script output
    my $i; # will hold total count of 'accepted' rows

    ROW: for my $row (@$data) { # cols = ref_seq, base/variant, status, ....
        next ROW unless lc $row->[2] eq 'accepted'; # 3rd col is status field
        $i++; # increment 'accepted' row counter

        my $exon = $row->[0]; # warn $exon;

        # get data from reference table, or next record:
        my $ref = $ref_table->{$exon}; # warn Dumper $ref;
        if (! $ref) {
            warn "no ref table entry for $exon"; next ROW;
        }
        my $chromosome = $ref->{chromosome};
        
        # split 2nd col to get location & variant (eg 136:A/T, 26-27:TG/--)
        my ($location, $allele) = split ':', $row->[1]; # warn Dumper [$location, $allele];
        
        # define initial start & end point as ref table start position minus 1:
        my $start_point = my $end_point = $ref->{start_base} - 1;

        # if location > 1 base (eg 26-27, 134-137 , etc):
        if ( $location =~ /(\d+)-(\d+)/ ) { # warn $location; # eg 26-27
			my $span = $2 - $1; # eg 26-27 will give span = 1
            $start_point += $1; # start location (26 in above example)
            $end_point = $start_point + $span; 
        }
        else {
            $end_point = $start_point += $location;
        }
        
        { # manipulate exon name to provide unique ID for vep input
          # eg TET2 exon 4 [891], TET2 exon 4 [1031], etc:
            $exon =~ s/\s/%%/g; # vep can't deal with spaces in any col
            $exon .= '%%[' . $location . ']'; # exon + location gives unique ID
        }
        { # data structure for vep script input file: 
            my %h = (
                chromosome  => $chromosome,
                start_point => $start_point,
                end_point   => $end_point,
                allele      => $allele,
                strand      => '+', # required 5th col in VEP file
                exon        => $exon, # unique identifier eg "TET2 exon 4 [891]"
            ); # warn Dumper \%h;
            
            # append to vep script input file:
            my $line = join "\t", @h{@vep_fields};
            io($vep_file)->append($line . "\n");
            
            { # data for use with vep output (use chr:start_location as unique ID):
              # vep returns eg 2:12345 or 2:12345-12347 in 'Location' col
                my $base_num = ( $start_point == $end_point )
                    ? $start_point # eg 2:12345
                    : $start_point .'-'. $end_point; # eg 2:12345-12347    
                my $unique_id = join ':', $chromosome, $base_num;
                $vep_data{$unique_id} = \%h;
            }
        }
    } # warn Dumper \%vep_data;
   
    my $result = $self->_run_vep(\%vep_data); # hashref of data & orphaned rows    
    $result->{row_count} = $i; # add row count of accepted rows 
    return $result;
}

#-------------------------------------------------------------------------------
sub _run_vep {
    my ($self, $src_data) = @_; # HoH "chr:start" => (start_point, end_point, allele, etc)
    
	my @results; # to hold tab-delimited results array from vep output

    # load opts from vep.ini + user-selected options to augment:
    my @opts = ( "-config=$Bin/../script/vep.ini" );
    my $user_opts = $self->_get_form_options;
    push @opts, @$user_opts;
    
    # open pipe to vep script (input = $vep_file; output = stdout):
	open my $fh, "-|", $vep_script, @opts;
	while ( my $line = readline($fh) ) { # warn $line; 
		push @results, $line;
	} # warn Dumper \@results;
    
    my @input_fields = qw(exon chromosome start_point end_point allele);
    
    my @data; # to hold composite of vep input & vep output
    my %seen; # running tally of vep inputs with 1 or more results

    for (@results) {
        next if /^#/; # skip comment lines
        my @cols = split /\t/;
        
        my $chr_location = $cols[1]; # in same format as %$src_data key (eg 2:123456)
        
        if ( my $ref = $src_data->{$chr_location} ) {
            # capture vep cols: consequence, amino_acids, existing_variation, extra:
            my @output_data = @cols[6,10,12,13]; 
            push @data, [ @{$ref}{@input_fields}, @output_data ]; # combine data
            $seen{$chr_location}++; # vep input file has at least one result
        }
        else { warn $_ } # no match in src file
    }
    
    # has no vep result:
    my @orphans = sort by_chr_location
        map $src_data->{$_}, grep { ! $seen{$_} } keys %$src_data;
    
    return { data => \@data, orphans => \@orphans }
}

#-------------------------------------------------------------------------------
sub _get_form_options { # user submitted opts (eg sift, polyphen, etc)
    my $self = shift;

    my $params = $self->form_opts; # warn Dumper $form_opts;
    
    my @opts;
    
    # polyphen & sift take args (s, p or b)
    push @opts, '--polyphen=' . $params->{polyphen} if $params->{polyphen};
    push @opts, '--sift='     . $params->{sift} if $params->{sift};
    # check_existing, regulatory & coding_only take no args:
    for ( qw/check_existing coding_only regulatory/ ) {
        push @opts, "--$_" if $params->{$_};
    } # warn Dumper \@opts;
    
    return \@opts;
}

#-------------------------------------------------------------------------------
sub by_chr_location { # alphanumeric sort on chromosome 1-22, X, and start_point
    return ( # 1st sort on chromosome, then start_point:
        ( $a->{chromosome} !~ /^\d+/ || $b->{chromosome} !~ /^\d+/ )
            ? $a->{chromosome} cmp $b->{chromosome} # non-digit eg X
            : $a->{chromosome} <=> $b->{chromosome} # digit eg 1 - 22
    ) || $a->{start_point} <=> $b->{start_point};
}

#-------------------------------------------------------------------------------
# extract tab-delimited data from csv source file:
sub _text_csv_slurp {
    my $self = shift;
    
    my $data = $self->src_data; # string

    my $csv = Text::CSV->new(\%text_csv_args);
    
    my @results;
    for ( split "\n", $data ) { # warn $_;
        if ( $csv->parse($_) ) {
            my @row = $csv->fields();
            push @results, \@row;
        }
    }
    return \@results;
}

#-------------------------------------------------------------------------------
# get reference table as hashref of gene => { chromosome, start_base }:
sub _build_ref_table {
    my $self = shift;
    
    my $db = NGS::DB->new(); # warn $db->dbix;
    my $ref_tbl = $db->dbix->select('ref_seq', '*')
        ->map_hashes('gene'); # warn Dumper $ref_tbl;
    return $ref_tbl;
}
 
=begin _build_ref_table() from ref.csv
    my $ref = "$Bin/../ref.csv";
    my $io = new IO::File;
    open( $io, '<', $ref ) || die $!;

    my $csv = Text::CSV->new(\%text_csv_args);

    my %h;
    while ( my $row = $csv->getline( $io ) ) {
        my ($ref_seq, $chr, $start) = @$row;
        $h{$ref_seq} = {
            chromosome => $chr,
            start_base => $start,
        };
    } # warn Dumper \%h;
    
    return \%h;
=cut

1;