#!/usr/bin/env perl
=begin
generates vep input file from MiSeq vcf src files for single MiSeq run
=cut
BEGIN {
use lib '/home/raj/perl5/lib/perl5'; # ensure FindBin from local-lib
use FindBin; # warn $FindBin::Bin; exit;
use lib (
$FindBin::Bin . '/../../lib',
$FindBin::Bin . '/../..', # so NGS::DB can find ngs.sqlite db
);
}
use Data::Dumper::Concise;
use Modern::Perl '2012';
use IO::All;
use NGS::DB;
my $db = NGS::DB->new(); # warn $db->dbix;
my $hotspots = $db->dbix->select('hotspots', ['position', 1])->map;
# my $gene_strand = $db->dbix->select('gene_strand', [qw/gene_name strand/])->map;
# warn Dumper [$hotspots, $gene_strand]; exit;
my $src_dir = $FindBin::Bin . '/src';
my @dirs = io($src_dir)->all; # print "$_ is a " . $_->type . "\n" for @dirs;
# cols for vep input file:
my @cols = qw(chromosome start_pos end_pos allele strand sample_id);
# create new vep input file for each MiSeq run (ie dir of vcf files):
for my $d (@dirs) { # warn $d->dir; next;
next unless $d->type eq 'dir'; # skip any files
my @files = io($d)->all; # Dumper \@files;
# extract dir name and path from $d io object:
my ( $miseq_path, $miseq_run ) = ( $d->dir, $d->filename );
my @vep = (); # reset array - to hold rows for vep input file
FILE: for my $f (@files) { # warn $f->filename;
my $vcf_file = $f->filename;
my $sample_id = join '/', $miseq_run, $vcf_file;
next FILE unless $vcf_file =~ /vcf\Z/; # skip unless .vcf suffix
my $data = _parse_file($f); # warn Dumper $data; # AoH
for my $row (@$data) { # warn Dumper $row; # hashref
# add sample ID (needs to be done in this block to get $miseq_run var):
$row->{sample_id} = $sample_id;
# write out vars as tab-delimited row to @vep array:
my $vars = join "\t", @{$row}{@cols};
push @vep, $vars;
}
}
{ # create vep input file in MiSeq src data root dir:
my $content = join "\n", @vep;
my $vep_file = sprintf '%s/../%s.vep', $miseq_path, $miseq_run;
$content > io($vep_file); # warn $vep_file;
}
}
#===============================================================================
sub _parse_file {
my $f = shift; # io object
my @rows = io($f->name)->slurp; # warn Dumper \@rows;
my @data = (); # for AoH
ROW: for (@rows) {
next ROW if /^#/; # skip comments
my @fields = split "\t", $_;
my $vals = _parse_fields(\@fields) || next ROW; # $vals = hashref
push @data, $vals;
}
return \@data;
}
sub _parse_fields {
my $data = shift; # arrayref of tab-separated vcf row entries
return 0 unless $data->[0] =~ /^chr/; # or rest of regexes will fail
my $chr = $data->[0]; # warn $chr; # eg chrX, chr2, etc
my $pos = $data->[1]; # warn $pos; # numerical
my $ref = $data->[3]; # warn $ref; # str of 1 or more A,C,T,G bases
my $alt = $data->[4]; # warn $alt; # str of 1 or more A,C,T,G bases
my $info = $data->[7]; # warn $info # DP, TI, GI, FC, GT, etc values
my $results = $data->[9]; # warn $results; # colon-separated results (eg VF)
my $vf = (split ':', $results)[3] || warn 'no VF info'; # warn $vf;
# skip unless VF > 5% cutoff or start position is a known hotspot:
return 0 unless $vf > 0.05 || $hotspots->{$pos};
my ($depth) = $info =~ /DP=(\d+)/; # warn $depth;
my ($genes) = $info =~ /GI=(.*?)\;/; # warn $genes; # non-greedy match
my $allele = undef; # defined in block below
# define initial start & end positions = vcf 'position' var (2nd col):
my $start_pos = my $end_pos = $pos;
# define allele and refine start & end positions depending on ins/del/sub:
if ( length $alt > 1 ) { # base insertion:
$start_pos += 1; # end pos stays same
$allele = sprintf '-/%s', substr($alt, 1); # alt minus 1st base
}
elsif ( length $ref > 1 ) { # base deletion:
$start_pos += 1;
$end_pos += length($ref) - 1;
$allele = sprintf '%s/-', substr($ref, 1); # ref minus 1st base
}
else { # alt & ref both single ie base substitution:
$allele = sprintf '%s/%s', $ref, $alt; # warn $allele;
}
my %h = (
chromosome => substr($chr, 3), # remove leading 'chr' chars
start_pos => $start_pos,
end_pos => $end_pos,
allele => $allele,
); # warn Dumper \%h;
=begin # not splitting genes, not using gene_strand table
my @genes = do { # remove duplicate genes: eg TP53,TP53,TP53
no warnings 'uninitialized'; # eg if regex failed due to malformed str
my %seen = map { $_ => 1 } split ',', $genes;
keys %seen;
} || warn 'no GI entry'; # warn @genes;
GENE: for my $gene (@genes) {
# + or - strand from gene_strand table:
my $strand = $gene_strand->{$gene};
unless ($strand) {
warn "$gene not in gene_strand table";
next GENE;
} # warn Dumper [ $chromosome, $start_pos, $end_pos, $allele, $strand ];
$h{strand} = $strand;
}
=cut
$h{strand} = '+'; # not using gene_strand table
return \%h;
}