#!/usr/bin/env perl
=begin
generates vep input file from myseq vcf src files for single myseq run
=cut
BEGIN {
use lib '/home/raj/perl5/lib/perl5'; # ensure FindBin from local-lib
use FindBin; # warn $FindBin::Bin; exit;
use lib (
$FindBin::Bin . '/../../lib',
$FindBin::Bin . '/../..', # for ngs.sqlite db
);
}
use Data::Dumper::Concise;
use Modern::Perl '2012';
use IO::All;
use NGS::DB;
my $db = NGS::DB->new(); # warn $db->dbix;
my $gene_strand = $db->dbix->select('gene_strand', [qw/gene_name strand/])->map;
my $hotspots = $db->dbix->select('hotspots', ['position', 1])->map;
# warn Dumper [$hotspots, $gene_strand]; exit;
my $src_dir = $FindBin::Bin . '/src';
my @dirs = io($src_dir)->all; # print "$_ is a " . $_->type . "\n" for @dirs;
my $a = my $b = 0;
my %samples;
# collect filenames from each dir, add to %samples hash:
for my $d (@dirs) { # warn Dumper $_;
next unless $d->type eq 'dir'; # skip eg png's
my @files = io($d)->all; # Dumper \@files;
for my $f (@files) { # warn $f->filename;
my $filename = $f->filename;
next unless $filename =~ /vcf\Z/; # warn $f->filename;
push @{ $samples{$filename} }, $f; # io object
}
} # warn Dumper keys %samples; exit;
my @vep;
FILE:
while ( my($sample, $o) = each %samples ) {
# parse file content in each directory:
for my $f(@$o) { # warn $f->name;
my $data = parse($f); # warn Dumper $data; # arrayref
push @vep, @$data;
} warn Dumper @vep;
}
warn $a;
warn $b;
sub parse {
my $f = shift; # io object
my @folders = $f->splitdir; # warn $folders[-2]; # array of dirs in full path
my @lines = io($f->name)->slurp; # warn Dumper \@lines;
my @data = ();
LINE: for (@lines) {
next LINE if /^#/; # skip comments
my @fields = split "\t", $_;
my $start_pos = $fields[1] || next LINE; # warn $start_pos;
{
my $vals = _parse_fields(\@fields) || next LINE; # hashref
# add sample name:
$vals->{sample} = $folders[-1]; # warn $folders[-1];
push @data, $vals;
}
}
return \@data;
}
sub _parse_fields {
my $data = shift; # arrayref of tab-separated vcf row entries
my $chr = $data->[0]; # warn $chr;
my $pos = $data->[1]; # warn $pos;
my $ref = $data->[3]; # warn $ref;
my $alt = $data->[4]; # warn $alt;
my $filter = $data->[6]; # warn $filter;
my $info = $data->[7]; # warn $info
my $results = $data->[9]; # warn $results;
my $vf = (split ':', $results)[3]; # warn $vf;
my ($depth) = $info =~ /DP=(\d+)/; # warn $depth;
my ($genes) = $info =~ /GI=(.*?)\;/; # warn $genes; # non-greedy match
my ($chromosome) = $chr =~ /chr([\w\d]+)/; # warn Dumper [$chr, $chromosome];
my @genes = do { # remove duplicate genes: eg TP53,TP53,TP53
no warnings 'uninitialized'; # eg if regex failed due to malformed str
my %seen = map { $_ => 1 } split ',', $genes;
keys %seen;
} || warn 'no GI entry'; # warn @genes;
#$a++;
# skip unless VF > 5% cutoff or start position is a known hotspot:
return 0 unless $vf > 0.05 || $hotspots->{$pos};
#$b++;
my $start_pos = my $end_pos = $pos;
my $allele;
# insertion:
if ( length $alt > 1 ) {
$start_pos += 1;
my @bases = split //, $alt; # warn Dumper \@bases;
shift @bases;
$allele = sprintf '-/%s', join '', @bases; # warn Dumper [$alt,$allele];
}
# deletion:
elsif ( length $ref > 1 ) {
$start_pos += 1;
$end_pos += ( (length $ref) - 1 );
my @bases = split //, $ref; # warn Dumper \@bases;
shift @bases;
$allele = sprintf '%s/-', join '', @bases; # warn Dumper [$ref,$allele];
} # warn Dumper [$pos,$alt,$ref,$start_pos,$end_pos];
# substitution:
$allele ||= sprintf '%s/%s', $ref, $alt; # warn $allele;
my %h = (
chromosome => $chromosome,
start_pos => $start_pos,
end_pos => $end_pos,
allele => $allele,
strand => '+', # don't need to use gene_strand table
);
=begin # not splitting genes, not using gene_strand table
GENE:
for my $gene (@genes) {
# + or - strand from gene_strand table:
my $strand = $gene_strand->{$gene};
unless ($strand) {
warn "$gene not in gene_strand table";
next GENE;
} # warn Dumper [ $chromosome, $start_pos, $end_pos, $allele, $strand ];
$h{strand} = $strand;
}
=cut
return \%h;
}