P3X GitList

package NGS::VEP;

# wrapper around Ensembl Variant Effect Predictor stand-alone script;
# http://www.ensembl.org/info/docs/variation/vep/vep_script.html

use Moo;
use IO::All;
use IO::File;
use Text::CSV;
use Data::Dumper;
use FindBin qw($Bin); # warn $Bin;

use MooX::Types::MooseLike::Base qw(:all);
use NGS::MooX::Types qw(HashReference);
use NGS::DB;

# required to be passed on object contruction:
has src_data  => ( is  => 'ro', isa => Str, required => 1 );
has form_opts => ( is  => 'ro', isa => HashReference, required => 1 );

# called internally by result():
has ref_table => ( is => 'lazy' ); # _build_ref_table()

my %text_csv_args = ( sep_char => "\t", binary => 1 ); # global args for Text::CSV

# location of variable_effect_predictor script:
my $vep_script = "$Bin/../script/variant_effect_predictor.pl";

#===============================================================================
sub result {
    my $self = shift; # warn Dumper $self->database;
    
    my $data = $self->_text_csv_slurp(); # warn Dumper $data; # arrayref

	# remove 1st row (headers):
	my $header_row = shift @$data; # warn Dumper $header_row;
	
	# get hmds refs - in cols starting at col #6:
	my $hmds_refs = _get_hmds_refs($header_row);
	
    # get reference table (chromosome start & end points for sequences):
    my $ref_table = $self->ref_table; # warn Dumper $ref_table;
    
    # create vep file for variant_effect_predictor.pl
    my $vep_file = "$Bin/../results.vep";
    io($vep_file)->unlink if -e ($vep_file); # clear previous

    my @vep_fields = qw(chromosome start_point end_point allele strand exon);
    my @vep_input; # will hold pre-processed data = VEP script input
    my %vep_data;  # will hold ref data for merging with VEP script output
	my %hmds_ref;  # will hold read data (% positive and read number) for samples
    my $i;         # will hold total count of 'accepted' rows

    ROW: for my $row (@$data) { # cols = exon name, base/variant, status, ....
        next ROW unless lc $row->[2] eq 'accepted'; # 3rd col is status field
        $i++; # increment 'accepted' row counter

        my $exon = $row->[0]; # warn $exon;

        # get data from reference table, or skip record:
        my $ref = $ref_table->{$exon}; # warn Dumper $ref;
        if (! $ref) {
            warn "no ref table entry for $exon"; next ROW;
        }
        my $chromosome = $ref->{chromosome};
        
        # split 2nd col to get location & variant (eg 136:A/T, 26-27:TG/--)
        my ($location, $allele) = split ':', $row->[1]; # warn Dumper [$location, $allele];
        
        # define initial start & end point as ref table start position minus 1:
        my $start_point = my $end_point = $ref->{start_base} - 1; # eg 125436276

        # if location > 1 base (eg 26-27, 134-137, etc):
        if ( $location =~ /(\d+)-(\d+)/ ) { # warn $location; # eg 26-27
			my $span = $2 - $1; # eg 26-27 will give span = 1
            $start_point += $1; # ref tbl start pos + 1st capture (26 in above example)
            $end_point = $start_point + $span; 
        }
        else {
            $end_point = $start_point += $location;
        }
        
        { # manipulate exon name to provide unique ID for vep input
          # eg TET2 exon 4 [891], TET2 exon 4 [1031], etc:
            $exon =~ s/\s/%%/g; # vep can't deal with spaces in any col
            $exon .= '%%[' . $location . ']'; # exon + location gives unique ID
        }
        { # data structure for vep script input file: 
            my %h = (
                chromosome  => $chromosome,
                start_point => $start_point,
                end_point   => $end_point,
                allele      => $allele, # eg T/C, A/G, etc
                strand      => '+', # required 5th col in VEP file
                exon        => $exon, # unique identifier eg "TET2 exon 4 [891]"
            ); # warn Dumper \%h;
            push @vep_input, \%h;
            
            { # data for use with vep output (use chr:start_location as unique ID):
              # vep returns eg 2:12345 or 2:12345-12347 in 'Location' col
                my $base_num = ( $start_point == $end_point )
                    ? $start_point # eg 2:12345
                    : $start_point .'-'. $end_point; # eg 2:12345-12347    
                my $unique_id = join ':', $chromosome, $base_num;
                $vep_data{$unique_id} = \%h;
				
				# get patient results in cols 6 onwards:
				my $n = 0; # reset counter to 0
				for my $ref ( @$hmds_refs ) { # warn $ref;
					# result in 1st col of pair, read number in 2nd:
					my $result = $row->[5 + $n];
					if ( $result > 0 ) { # skip zero's
						my $read_no = $row->[5 + $n + 1]; # next col
						# add to hmds ref
						$hmds_ref{$ref}{$unique_id} = {
							result => $result,
							read   => $read_no,
						};
					}
					$n += 2; # move 2 cols along
				}
            }
        }
    } # warn Dumper \%vep_data; # warn Dumper \%hmds_ref;

	# sort data into chromosome order to (possibly) speed up vep processing:
	for ( sort by_chr_location @vep_input ) {
        my $line = join "\t", @{$_}{@vep_fields};
        io($vep_file)->append($line . "\n");
	}
	
	{ # send to function running VEP script:
		my %data = ( vep_data => \%vep_data, sample_data => \%hmds_ref );
		my $result = $self->_run_vep(\%data); # hashref of data & orphaned rows    
		$result->{row_count} = $i; # add row count of accepted rows 
		return $result;
	}
}

#-------------------------------------------------------------------------------
sub _run_vep {
    my ($self, $data) = @_; 
    
	my $src_data = $data->{vep_data}; # "chr:start" => (start_point, end_point, allele, etc)
	my $hmds_ref = $data->{sample_data}; # hmds_ref => { unique_id => { read results } }

	my @results; # to hold tab-delimited results array from vep output

    # load opts from vep.ini + user-selected options to augment:
    my @opts = ( "-config=$Bin/../script/vep.ini" );
    my $user_opts = $self->_get_form_options;
    push @opts, @$user_opts;
    
    # open pipe to vep script (input = $vep_file; output = stdout):
	open my $fh, "-|", $vep_script, @opts;
	while ( my $line = readline($fh) ) { # warn $line; 
		push @results, $line;
	} # warn Dumper \@results;
    
    my @input_fields = qw(exon chromosome start_point end_point allele);
    
    my @data; # to hold composite of vep input & vep output
    my %seen; # running tally of vep inputs with 1 or more results

    for (@results) {
        next if /^#/; # skip comment lines
        my @cols = split /\t/;
        
        my $chr_location = $cols[1]; # in same format as %$src_data key (eg 2:123456)
        
        if ( my $ref = $src_data->{$chr_location} ) { # warn Dumper $ref;
            # capture vep cols: consequence, amino_acids, existing_variation, extra:
			my $extra = $cols[13]; # contains sift, polyphen, etc results
			# extract sift & polyphen into separate cols:
			my $results = $self->_prediction_and_score($extra);

            my @output_data = ( @cols[6,10,12], @$results ); 
            push @data, [ @{$ref}{@input_fields}, @output_data ]; # combine data
            $seen{$chr_location}++; # vep input file has at least one result
			
			# add vep data to $hmds_ref data if it's useful:
			if ( _want_vep_result(\@cols) ) { # examines consequence & existing cols
				REF: while ( my ($hmds, $d) = each %$hmds_ref ) {
					ID: while ( my ($unique_id, $result) = each %$d ) {
						next ID unless $unique_id eq $chr_location;

						my %h = (
							consequence => $cols[6],
							existing    => $cols[12],
							polyphen    => $results->[1],
							sift 		=> $results->[0],
							exon 		=> $ref->{exon},
						);
						
						# merge with $result:
						map $result->{$_} = $h{$_}, keys %h;
					}
				}				
			}
		}
        else { warn "no match in src file for $_" } # no match in src file
    }
    
    # has no vep result:
    my @orphans = sort by_chr_location
        map $src_data->{$_}, grep { ! $seen{$_} } keys %$src_data;
    
	my $samples = _sort_sample_results($hmds_ref); # hashref

    return { data => \@data, orphans => \@orphans, samples => $samples }
}

#-------------------------------------------------------------------------------
sub _prediction_and_score {
	my ($self, $str) = @_; # warn $str;
	
    # regex to capture prediction and/or score eg SIFT=deleterious(0.02);
    #  PolyPhen=probably_damaging; SIFT=0.002: / (\w+)? ( \(? [\d\.]+ \)? )? /x
    my $re = qr/ 
        (\w+)?  # optional 'prediction' (eg benign, deleterious, etc)
        (       # start of optional 'score'
            \(?     # optional open bracket (only if prediction AND score selected)
            [\d\.]+ # any number of 0-9 & decimal point chars (eg 0.012)
            \)?     # optional close bracket (only if prediction AND score selected) 
        )?      # end of optional 'score'
    /x;

	my @results; # capture in order 'sift', 'polyphen' to match expected order in tt:
	if ( $str =~ /sift=($re)/i ) {
		push @results, $1; # warn $sift;
	}
	if ( $str =~ /polyphen=($re)/i ) { 
		push @results, $1; # warn $poly;
	}

=begin # combined, but doesn't work - gets killed	
	while ( $str =~ /(?:sift|polyphen)=($re)/ig ) { # don't capture words
		push @results, $1; # warn $1;
	}
=cut
	map { # remove captured items within $re from $str:
		my $quoted = quotemeta $_;
		$str =~ s/(sift|polyphen)=$quoted(;?)//i; # remove trailing semi-colon
	} @results; # warn $str;

	push @results, $str; # what's left of it (non-sift, non-polyphen 'extra')
	return \@results;
}

#-------------------------------------------------------------------------------
sub _sort_sample_results {
	my $data = shift; # hashref of hmds_ref => { chr:location => data }
	
	my %h;
	
	REF: while ( my ($hmds_ref, $d ) = each %$data ) { # warn Dumper $d;
		LOC: while ( my ($location, $results) = each %$d ) {
			my $exon_name = $results->{exon} || next LOC;
			# transform $hmds_ref slash to underscore for automatic tt sorting:
			my $lab_no = join '_', reverse split '/', $hmds_ref; # warn $lab_no;
			# add hmds_ref to $result:
			$results->{hmds_ref} = $hmds_ref;
			$h{$lab_no}{$exon_name} = $results;
		}
	} # warn Dumper \%h;
	return \%h;
}

#-------------------------------------------------------------------------------
sub by_hmds_ref {
	my ($r1, $y1) = split '/', $a; # warn Dumper [$r1, $y1];
	my ($r2, $y2) = split '/', $b; # warn Dumper [$r2, $y2];
	
	return $y1 <=> $y2 || $r1 <=> $r2;
}

#-------------------------------------------------------------------------------
sub _want_vep_result {
	my $cols = shift; # arrayref
	
	my $consequence = $cols->[6];
	my $existing_variation = $cols->[12];
	
	return 1 if
		$consequence ne 'synonymous_variant' ||
		$existing_variation eq '-' 			 ||
		$existing_variation =~ /COSM|TMP/;

	return 0; # don't want VEP result
}

#-------------------------------------------------------------------------------
sub _get_hmds_refs {
	my $header = shift; # arrayref of hmds refs (in pairs)
	
	my $i = 0; # counter
	my @refs;

	for ( @$header[5 .. $#{$header} - 1] ) {
		next unless $i++ % 2; # increment then test for modulus 2 - get every other col

		my ($ref) = $_ =~ m!(\d+/\d{2})!;
		push @refs, $ref;
	} # warn Dumper \@refs;
	return \@refs;
}

#-------------------------------------------------------------------------------
sub _get_form_options { # user submitted opts (eg sift, polyphen, etc)
    my $self = shift;

    my $params = $self->form_opts; # warn Dumper $form_opts;
    
    my @opts;
    
    # polyphen & sift take args (s, p or b)
    push @opts, '--polyphen=' . $params->{polyphen} if $params->{polyphen};
    push @opts, '--sift='     . $params->{sift} if $params->{sift};
    # check_existing, regulatory & coding_only take no args:
    for ( qw/check_existing coding_only regulatory/ ) {
        push @opts, "--$_" if $params->{$_};
    } # warn Dumper \@opts;
    
    return \@opts;
}

#-------------------------------------------------------------------------------
sub by_chr_location { # alphanumeric sort on chromosome 1-22, X, and start_point
    return ( # 1st sort on chromosome, then start_point:
        ( $a->{chromosome} !~ /^\d+/ || $b->{chromosome} !~ /^\d+/ )
            ? $a->{chromosome} cmp $b->{chromosome} # non-digit eg X
            : $a->{chromosome} <=> $b->{chromosome} # digit eg 1 - 22
    ) || $a->{start_point} <=> $b->{start_point};
}

#-------------------------------------------------------------------------------
# extract tab-delimited data from csv source file:
sub _text_csv_slurp {
    my $self = shift;
    
    my $data = $self->src_data; # string

    my $csv = Text::CSV->new(\%text_csv_args);
    
    my @results;
    for ( split "\n", $data ) { # warn $_;
        if ( $csv->parse($_) ) {
            my @row = $csv->fields();
            push @results, \@row;
        }
    }
    return \@results;
}

#-------------------------------------------------------------------------------
# get reference table as hashref of gene => { chromosome, start_base }:
sub _build_ref_table {
    my $self = shift;
    
    my $db = NGS::DB->new(); # warn $db->dbix;
    my $ref_tbl = $db->dbix->select('ref_seq', '*')
        ->map_hashes('gene'); # warn Dumper $ref_tbl;
    return $ref_tbl;
}
 
=begin _build_ref_table() from ref.csv
    my $ref = "$Bin/../ref.csv";
    my $io = new IO::File;
    open( $io, '<', $ref ) || die $!;

    my $csv = Text::CSV->new(\%text_csv_args);

    my %h;
    while ( my $row = $csv->getline( $io ) ) {
        my ($ref_seq, $chr, $start) = @$row;
        $h{$ref_seq} = {
            chromosome => $chr,
            start_base => $start,
        };
    } # warn Dumper \%h;
    
    return \%h;
=cut

1;