#!/usr/bin/env perl
# analyses individual samples supplied by argv[0] across multiple directories
use lib '/home/raj/perl5/lib/perl5';
use Modern::Perl '2012';
use Data::Dumper;
use FindBin; # warn $FindBin::Bin;
use IO::All;
use lib $FindBin::Bin . '/../../lib';
use NGS::ChartMaker;
my $src = $ARGV[0];
unless ($src) {
print "need a vcf file identifier !!\n\n"; exit;
}
my $src_dir = $FindBin::Bin . '/src';
my @dirs = io($src_dir)->all; # print "$_ is a " . $_->type . "\n" for @dirs;
my %h; # start-position indexer
# data structure for chartdirector:
my %chart = ( data_title => $src );
# extract files from each dir, skip all except matching supplied filename:
for my $d (@dirs) { # warn Dumper $_;
next unless $d->type eq 'dir'; # skip eg png's
my @files = io($d)->all; # Dumper \@files;
for my $f (@files) { # warn $f->filename;
next unless $f->filename =~ /^$src/; # warn $f->filename;
parse($f->name);
}
} # warn Dumper \%h; exit;
# printf "| %9s | %s |\n", $_, $h{$_}{count} for sort by_frequency keys %h;
# map print Dumper $h{$_}, grep { $h{$_}{count} == 1 } sort by_start_pos keys %h;
my $format = "| %9s | %6s | %8s | %10s | %4s | %6s |\n";
my @dividr = ( '=' x 9, '=' x 6, '=' x 8, '=' x 10, '=' x 4, '=' x 6 );
printf $format, qw( START REF ALT FILTER DP VF );
printf $format, @dividr;
for my $location ( sort by_start_pos keys %h ) { # warn $location;
next unless $h{$location}{count} == 1;
my $data = $h{$location}{data}; # warn Dumper $data;
for (@$data) {
push @{ $chart{data}{x} }, $_->{depth};
push @{ $chart{data}{y} }, $_->{vf};
}
printf $format, $location, @{$_}{ qw/ref alt filter depth vf/ } for @$data;
printf $format, @dividr;
}
# generate chart of depth (x) vs variable frequency (y):
@chart{ qw/x_title y_title chart_title/ } = ( 'Depth', 'VF', 'VCF analysis' );
my $c = NGS::ChartMaker->new(%chart);
$c->make_chart();
exit 0;
#===============================================================================
sub parse {
my $filename = shift;
my @lines = io($filename)->slurp; # warn Dumper \@lines;
my %local = (); # indexer localised to individual file
LINE: for (@lines) {
next LINE if /^#/; # skip comments
my @fields = split "\t", $_;
my $start_pos = $fields[1] || next LINE; # warn $start_pos;
# initialise intra-file start position indexer:
$local{$start_pos} ||= 1; # only need true/false, not item frequency
{ # add data as array(ref) - can have multiple start pos entries per file:
my $data = _parse_fields(\@fields);
push @{ $h{$start_pos}{data} }, $data;
}
}
# auto-increment %h start position key for each start position key in %i:
for my $start_position (keys %local) { $h{$start_position}{count}++ }
}
sub by_frequency { # 1st by frequency, 2nd by start position:
return $h{$b}{count} <=> $h{$a}{count} || $a <=> $b;
}
sub by_start_pos {
return $a <=> $b;
}
sub _parse_fields {
my $data = shift; # arrayref of tab-separated vcf row entries
my ($ref,$alt) = @{$data}[3,4]; # warn Dumper [$ref,$alt];
my $filter = $data->[6]; # warn $filter;
my ($depth) = $data->[7] =~ /DP=(\d+)/; # warn $depth;
my $results = $data->[-1]; # warn $results;
my $vf = (split ':', $results)[3]; # warn $vf;
my %h = (
filter => $filter,
depth => $depth,
ref => $ref,
alt => $alt,
vf => $vf,
);
return \%h;
}