P3X GitList

#!/usr/bin/env perl

=begin
analyses patient sample vcf files for variant frequencies across multiple runs.
Produces 1 table + 1 graph for each sample in each of n = 1, 2, 3, etc directories.
n values correspond to frequency each variant found across multiple runs, so in
n = 1, all html tables list variants found only once for that sample across all
runs analysed. In n = 2, tables list variants found in 2 out of total number of
runs, etc. Each variant can appear >= once per vcf file (html table).
=cut

use lib '/home/raj/perl5/lib/perl5';

use Modern::Perl '2012';
use Data::Dumper::Concise;
use File::Path qw(make_path remove_tree); # IO::All PITA for dir manipulation
use Path::Tiny;
use FindBin; # warn $FindBin::Bin;
use IO::All;
use CGI;

use lib $FindBin::RealBin . '/../../lib';
use NGS::ChartMaker;
use DBIx::Simple;

my $src_dir = $FindBin::Bin . '/src/analyse';
my @dirs = io($src_dir)->all; # print "$_ is a " . $_->type . "\n" for @dirs; exit;
    
my %samples;
# collect filenames from each dir, add to %samples hash:
for my $d (@dirs) { # warn Dumper $_;
    next unless $d->type eq 'dir'; # skip eg png's
	my @files = io($d)->all; # Dumper \@files;  
	for my $f (@files) { # warn $f->filename;
		my $filename = $f->filename;
		next unless $filename =~ /vcf\Z/; # warn $f->filename;
		push @{ $samples{$filename} }, $f; # io object
	}
} # warn Dumper \%samples; exit;

# delete and recreate analysis output dirs, one for each src dir:
for ( my $i = 1; $i <= @dirs; $i++ ) { # warn $i;
    my $dir = $FindBin::Bin . '/analysis/' . $i;
    remove_tree($dir) if -e $dir;  # not worth the hassle using IO::All   
    make_path($dir); # ditto
}

my $dbix = _build_dbix(); # warn Dumper $dbix;
my $EXCLUDES = _get_excludes(); # warn Dumper $EXCLUDES;

my $q = CGI->new;
my %h = (); # start-position indexer (global, used in parse(), reset in FILE loop)

my %chart = (
    x_title => 'Depth',
    y_title => 'VF',
);
my %all_data;

# params to collect from vcf file:
my @params = qw( position vf alt ref gene filter depth runId);

FILE: # check each file is represented in all dirs:
while ( my($sample, $o) = each %samples ) {
	unless ( scalar @$o == scalar @dirs ) { # one object per file per directory
		printf "%s seen only %s times\n", $sample, scalar @$o;
		next FILE;
	}
    
	%h = (); # reset
	# parse file content in each directory, %h holds array of data
	for my $f(@$o) { # warn $f->name;
		parse($f);
	} # warn Dumper \%h;
	
	my %data; # array of all results (href) for this sample:
	while ( my($position, $d) = each %h ) { # warn Dumper $d;
		my ( $n, $results ) = ( $d->{count}, $d->{data} );

		for my $result(@$results) { 
			# add position to $result href:
			$result->{position} = $position;
			push @{ $data{$n} }, $result;
		}
	}

    $sample =~ s/\.vcf//; # warn $sample;
    
    # now have hash of all results from all dirs for this sample, keys = $n (1,2,3,etc)
    while ( my($n, $result) = each %data ) {
        my @rows = (); # reset
        $chart{data} = {}; # reset
        # sort each row by variation frequency (highest -> lowest):
        for my $row ( sort by_var_freq @$result ) {
            my $position = $row->{position};
            
            push @rows, $q->Tr( $q->td( [ @{$row}{@params} ] ) );
            
            push @{ $chart{data}{x} }, $row->{depth};
			push @{ $all_data{$n}{x} }, $row->{depth};
			
            push @{ $chart{data}{y} }, $row->{vf};
			push @{ $all_data{$n}{y} }, $row->{vf};
        } # warn Dumper \@rows; exit;

        my $total_variants = do {
            my %h; $h{ $_->{position} }++ for @$result;
            scalar keys %h;
        };
        my $content = do {
            my %d = (
                sample => $sample,
                total  => $total_variants,
                rows   => \@rows,
            );
            _template(\%d);
        };

        my $file = sprintf './analysis/%s/%s.html', $n, $sample;
        $content > io($file);        
        
        $chart{chart_title} = $sample; # warn Dumper scalar @{ $chart{data}{x} };
        $chart{data_title}  = sprintf '%s/%s', $n, $sample;
        NGS::ChartMaker->new(%chart)->make_chart();
    }
}

# output combined graphs, one each for n=1, n=2:
for (1,2) {
	my %h = (
		x_title => 'Depth',
		y_title => 'VF',
		chart_title => "All data $_",
		data_title => 'all_data_'.$_,
		data => $all_data{$_},
	); # warn Dumper scalar @{ $h{data}{y} };
	NGS::ChartMaker->new(%h)->make_chart();
}

# expand x-axis scale (0-4000) on n=1:
{
	my %h = (
		x_title => 'Depth',
		y_title => 'VF',
		max_x_val => 4000,
		chart_title => 'All data #1 expanded',
		data_title => 'all_data_1_expanded',
		data => $all_data{1},
	);
	NGS::ChartMaker->new(%h)->make_chart();
}

exit 0;

#===============================================================================

sub parse {
	my $f = shift; # io object
    
	my @folders = $f->splitdir; # warn $folders[-2]; # array of dirs in full path
	my @lines   = io($f->name)->slurp; # warn Dumper \@lines;
    
	my %local = (); # indexer localised to individual file

	LINE: for (@lines) {
		next LINE if /^#/; # skip comments
		my @fields = split "\t", $_;
		
		my $start_pos = $fields[1]; # warn $start_pos;
			# warn $start_pos if $EXCLUDES->{$start_pos};
		next LINE if (! $start_pos) || $EXCLUDES->{$start_pos};  
			# warn $start_pos if $EXCLUDES->{$start_pos};

        # initialise intra-file start position indexer:
		$local{$start_pos} ||= 1; # only need true/false, not item frequency
		
        { # add data as array(ref) - can have multiple start pos entries per file:
            my $data = _parse_fields(\@fields);
            # add run directory name:
            $data->{runId} = $folders[-2]; # warn $data->{runId}; # 2nd to last
            push @{ $h{$start_pos}{data} }, $data;
        }
	}
	# auto-increment %h start position key for each start position key in %i:
	for my $start_position (keys %local) { $h{$start_position}{count}++ }
}

sub by_var_freq {
	return $b->{vf} <=> $a->{vf} || $a->{depth} <=> $b->{depth};
}

sub _template {
    my $args = shift; # sample name (str), total (int), rows (arrayref)
    
    my ($sample, $total, $rows) = @{$args}{qw/sample total rows/};
    
    return sprintf <<"HTML", join "\n", @$rows;
    <!DOCTYPE html>
    <html lang="en">
      <head>
        <style>
            html { font-family: verdana, sans-serif; }
            h2 { color: #800000; }
            h3 { }
            table { border-collapse: collapse; margin-left: 1em }
            td, th { border: solid 1px #c0c0c0; padding: 2px; font-size: 9pt; }
            #head { background-color: ActiveCaption; color: CaptionText; }
        </style>
      </head>
      <body>
        <h2>${sample}.vcf</h2>
        <h3>Total variants: $total</h3>
        <table>
          <tr id="head">
            <th>position</th>
            <th>vf</th>
            <th>alt</th>
            <th>ref</th>
            <th>gene</th>
            <th>filter</th>
            <th>depth</th>
            <th>run Id</th>
          <tr>
          <tbody>%s</tbody>
        </table>
      </body>
    </html>
HTML
}

sub _build_dbix {
    my $db = path($FindBin::RealBin, '..', '..', 'ngs.sqlite')->realpath;
	my $dsn = "dbi:SQLite:dbname=$db"; # warn 'DB:'. $db;

	my $dbix = DBIx::Simple->connect($dsn); # warn Dumper $dbix;
    return $dbix;
}

sub _get_excludes {
	$dbix->select('locations', ['position', 1], {action => 'exclude'})->map;
}

sub _parse_fields {
    my $data = shift; # arrayref of tab-separated vcf row entries

	my $ref     = $data->[3]; # warn $ref;
	my $alt     = $data->[4]; # warn $alt;
	my $filter  = $data->[6]; # warn $filter;
    my $info    = $data->[7]; # warn $info
	my $results = $data->[9]; # warn $results;
    
	my $vf = (split ':', $results)[3]; # warn $vf;
	my ($depth) = $info =~ /DP=(\d+)/; # warn $depth;
    my ($genes) = $info =~ /GI=(.*?)\;/; # warn $genes; # non-greedy match

    my $gene = do { # remove duplicate genes: eg TP53,TP53,TP53
        no warnings 'uninitialized'; # eg if regex failed due to malformed str
        my %seen = map { $_ => 1 } split ',', $genes;    
        join ',', keys %seen;
    } || '[UNDEF]';
    my %h = (
		filter => $filter,
		depth  => $depth,
        gene   => $gene,
		ref    => $ref,
		alt    => $alt,
		vf     => $vf,
	);
    return \%h;
}