RSS Git Download  Clone
Raw Blame History
#!/usr/bin/perl

# analyses multiple directories of patient samples - produces 1 table + 1 graph for each 

use lib '/home/raj/perl5/lib/perl5';

use Modern::Perl '2012';
use Data::Dumper::Concise;
use File::Path qw(make_path remove_tree);
use FindBin; # warn $FindBin::Bin;
use IO::All;
use CGI;

use lib $FindBin::Bin . '/../../lib';
use NGS::ChartMaker;

my $src_dir = $FindBin::Bin . '/src';
my @dirs = io($src_dir)->all; # print "$_ is a " . $_->type . "\n" for @dirs;
    
my %samples;
# collect filenames from each dir, add to %samples hash:
for my $d (@dirs) { # warn Dumper $_;
    next unless $d->type eq 'dir'; # skip eg png's
	my @files = io($d)->all; # Dumper \@files;  
	for my $f (@files) { # warn $f->filename;
		my $filename = $f->filename;
		next unless $filename =~ /vcf\Z/; # warn $f->filename;
		push @{ $samples{$filename} }, $f; # io object
	}
} # warn Dumper \%samples; exit;

# delete and recreate output dirs, one for each src dir:
for ( my $i = 1; $i <= @dirs; $i++ ) { # warn $i;
    my $dir = $FindBin::Bin . '/analysis/' . $i;
    remove_tree($dir);  # not worth the hassle using IO::All   
    make_path($dir); # ditto
}

my %h = (); # start-position indexer
my $q = CGI->new;

my %chart = (
    x_title => 'Depth',
    y_title => 'VF',
);

FILE: # check each file is represented in all dirs:
while ( my($sample, $o) = each %samples ) {
	unless ( scalar @$o == scalar @dirs ) { # one object per file per directory
		printf "%s seen only %s times\n", $sample, scalar @$o;
		next FILE;
	}
	%h = (); # reset
	
	# parse file content in each directory, %h holds array of data
	for my $f(@$o) { # warn $f->name;
		parse($f->name);
	} # warn Dumper \%h;	next;
	
	my %data; # array of all results (href) for this sample:
	while ( my($position, $d) = each %h ) {
		my ($n, $results ) = ( $d->{count}, $d->{data} );

		for my $result(@$results) { 
			# add position to $result href:
			$result->{position} = $position;
			push @{ $data{$n} }, $result;
		}
	}
		
    my $filename = $sample; $filename =~ s/\.vcf//; # warn $filename;
    
    # now have hash of all results from all dirs for this sample, keys = $n (1,2,3, etc)
    while ( my($n, $result) = each %data ) {
        my @rows = ();
        $chart{data} = {}; # reset
        $chart{chart_title} = $filename;
        $chart{data_title}  = sprintf '%s/%s', $n, $filename;

        for my $row ( sort by_var_freq @$result ) {
            push @rows, $q->Tr(
                $q->td( [ @{$row}{ qw /position vf alt ref gene filter depth/ } ] )
            );
            push @{ $chart{data}{x} }, $row->{depth};
            push @{ $chart{data}{y} }, $row->{vf};		

        } # warn Dumper \@rows; exit;
        my $tmpl = _get_template();
        my $content = sprintf $tmpl, $sample, join "\n", @rows;
       
        my $file = sprintf './analysis/%s/%s.html', $n, $filename;
        $content > io($file);        
        
        my $c = NGS::ChartMaker->new(%chart);
        $c->make_chart();
    }
}

exit 0;

#===============================================================================

sub parse {
	my $filename = shift;
	my @lines = io($filename)->slurp; # warn Dumper \@lines;
	
	my %local = (); # indexer localised to individual file
	LINE: for (@lines) {
		next LINE if /^#/; # skip comments
		my @fields = split "\t", $_;
		
		my $start_pos = $fields[1] || next LINE; # warn $start_pos;
        # initialise intra-file start position indexer:
		$local{$start_pos} ||= 1; # only need true/false, not item frequency
		
        { # add data as array(ref) - can have multiple start pos entries per file:
            my $data = _parse_fields(\@fields);            
            push @{ $h{$start_pos}{data} }, $data;
        }
	}
	# auto-increment %h start position key for each start position key in %i:
	for my $start_position (keys %local) { $h{$start_position}{count}++ }
}

sub by_frequency { # 1st by frequency, 2nd by start position:
	return $h{$b}{count} <=> $h{$a}{count} || $a <=> $b;
}

sub by_var_freq {
	return $b->{vf} <=> $a->{vf} || $a->{depth} <=> $b->{depth};
}

sub by_start_pos {
	return $a <=> $b;
}

sub _get_template {
    return qq!
        <html>
        <head>
            <style>
                table { border-collapse: collapse }
                td { border: solid 1px #c0c0c0; padding: 2px }
                thead tr { background-color: ActiveCaption; color: CaptionText; }
            </style>
        </head>
        <body>
            <h2>%s</h2>
            <table>
                <thead>
                    <tr>
                        <th>position</th>
                        <th>vf</th>
                        <th>alt</th>
                        <th>ref</th>
                        <th>gene</th>
                        <th>filter</th>
                        <th>depth</th>
                    <tr>
                </thead>
                %s
            </table>
        </body>
        </html>!;
}

sub _parse_fields {
    my $data = shift; # arrayref of tab-separated vcf row entries
	
	my ($ref,$alt) = @{$data}[3,4]; # warn Dumper [$ref,$alt];
	my $filter     = $data->[6]; # warn $filter;
	my ($depth)    = $data->[7] =~ /DP=(\d+)/; # warn $depth;
    my ($gene)     = $data->[7] =~ /GI=(.*)\;/; # warn $gene
	my $results    = $data->[-1]; # warn $results;		
	my $vf = (split ':', $results)[3]; # warn $vf;

    my %h = (
		filter => $filter,
		depth  => $depth,
        gene   => $gene,
		ref    => $ref,
		alt    => $alt,
		vf     => $vf,
	);
    return \%h;
}