#!/usr/bin/env perl
# analyses multiple directories of patient samples - produces 1 table + 1 graph for each
use lib '/home/raj/perl5/lib/perl5';
use Modern::Perl '2012';
use Data::Dumper::Concise;
use File::Path qw(make_path remove_tree);
use FindBin; # warn $FindBin::Bin;
use IO::All;
use CGI;
use lib $FindBin::Bin . '/../../lib';
use NGS::ChartMaker;
my $src_dir = $FindBin::Bin . '/src';
my @dirs = io($src_dir)->all; # print "$_ is a " . $_->type . "\n" for @dirs;
my %samples;
# collect filenames from each dir, add to %samples hash:
for my $d (@dirs) { # warn Dumper $_;
next unless $d->type eq 'dir'; # skip eg png's
my @files = io($d)->all; # Dumper \@files;
for my $f (@files) { # warn $f->filename;
my $filename = $f->filename;
next unless $filename =~ /vcf\Z/; # warn $f->filename;
push @{ $samples{$filename} }, $f; # io object
}
} # warn Dumper \%samples; exit;
# delete and recreate analysis output dirs, one for each src dir:
for ( my $i = 1; $i <= @dirs; $i++ ) { # warn $i;
my $dir = $FindBin::Bin . '/analysis/' . $i;
remove_tree($dir) if -e $dir; # not worth the hassle using IO::All
make_path($dir); # ditto
}
my $q = CGI->new;
my %h = (); # start-position indexer (global, used in parse(), reset in FILE loop)
my %chart = (
x_title => 'Depth',
y_title => 'VF',
);
# params to collect from vcf file:
my @params = qw( position vf alt ref gene filter depth );
FILE: # check each file is represented in all dirs:
while ( my($sample, $o) = each %samples ) {
unless ( scalar @$o == scalar @dirs ) { # one object per file per directory
printf "%s seen only %s times\n", $sample, scalar @$o;
next FILE;
}
%h = (); # reset
# parse file content in each directory, %h holds array of data
for my $f(@$o) { # warn $f->name;
parse($f->name);
} # warn Dumper \%h; next;
my %data; # array of all results (href) for this sample:
while ( my($position, $d) = each %h ) {
my ($n, $results ) = ( $d->{count}, $d->{data} );
for my $result(@$results) {
# add position to $result href:
$result->{position} = $position;
push @{ $data{$n} }, $result;
}
}
$sample =~ s/\.vcf//; # warn $sample;
# now have hash of all results from all dirs for this sample, keys = $n (1,2,3,etc)
while ( my($n, $result) = each %data ) {
my @rows = (); # reset
$chart{data} = {}; # reset
# sort each row by variation frequency (highest -> lowest):
for my $row ( sort by_var_freq @$result ) {
push @rows, $q->Tr( $q->td( [ @{$row}{@params} ] ) );
push @{ $chart{data}{x} }, $row->{depth};
push @{ $chart{data}{y} }, $row->{vf};
} # warn Dumper \@rows; exit;
my $file = sprintf './analysis/%s/%s.html', $n, $sample;
my $content = _template($sample, \@rows);
$content > io($file);
$chart{chart_title} = $sample;
$chart{data_title} = sprintf '%s/%s', $n, $sample;
NGS::ChartMaker->new(%chart)->make_chart();
}
}
exit 0;
#===============================================================================
sub parse {
my $filename = shift;
my @lines = io($filename)->slurp; # warn Dumper \@lines;
my %local = (); # indexer localised to individual file
LINE: for (@lines) {
next LINE if /^#/; # skip comments
my @fields = split "\t", $_;
my $start_pos = $fields[1] || next LINE; # warn $start_pos;
# initialise intra-file start position indexer:
$local{$start_pos} ||= 1; # only need true/false, not item frequency
{ # add data as array(ref) - can have multiple start pos entries per file:
my $data = _parse_fields(\@fields);
push @{ $h{$start_pos}{data} }, $data;
}
}
# auto-increment %h start position key for each start position key in %i:
for my $start_position (keys %local) { $h{$start_position}{count}++ }
}
sub _template {
my ($sample, $rows) = @_; # str, arrayref
return sprintf <<"HTML", join "\n", @$rows;
<!DOCTYPE html>
<html lang="en">
<head>
<style>
html { font-family: verdana, sans-serif; }
table { border-collapse: collapse; margin-left: 1em }
td, th { border: solid 1px #c0c0c0; padding: 2px; font-size: 9pt; }
#head { background-color: ActiveCaption; color: CaptionText; }
h2 { color: #800000; }
</style>
</head>
<body>
<h2>${sample}.vcf</h2>
<table>
<tr id="head">
<th>position</th>
<th>vf</th>
<th>alt</th>
<th>ref</th>
<th>gene</th>
<th>filter</th>
<th>depth</th>
<tr>
<tbody>%s</tbody>
</table>
</body>
</html>
HTML
}
sub by_frequency { # 1st by frequency, 2nd by start position:
return $h{$b}{count} <=> $h{$a}{count} || $a <=> $b;
}
sub by_var_freq {
return $b->{vf} <=> $a->{vf} || $a->{depth} <=> $b->{depth};
}
sub by_start_pos {
return $a <=> $b;
}
sub _parse_fields {
my $data = shift; # arrayref of tab-separated vcf row entries
my $ref = $data->[3]; # warn $ref;
my $alt = $data->[4]; # warn $alt;
my $filter = $data->[6]; # warn $filter;
my $info = $data->[7]; # warn $info
my $results = $data->[9]; # warn $results;
my $vf = (split ':', $results)[3]; # warn $vf;
my ($depth) = $info =~ /DP=(\d+)/; # warn $depth;
my ($gene) = $info =~ /GI=(.*?)\;/; # warn $gene # non-greedy match
my %h = (
filter => $filter,
depth => $depth,
gene => $gene,
ref => $ref,
alt => $alt,
vf => $vf,
);
return \%h;
}