#!/usr/bin/env perl =begin analyses patient sample vcf files for variant frequencies across multiple runs. Produces 1 table + 1 graph for each sample in each of n = 1, 2, 3, etc directories. n values correspond to frequency each variant found across multiple runs, so in n = 1, all html tables list variants found only once for that sample across all runs analysed. In n = 2, tables list variants found in 2 out of total number of runs, etc. Each variant can appear >= once per vcf file (html table). =cut use lib '/home/raj/perl5/lib/perl5'; use Modern::Perl '2012'; use Data::Dumper::Concise; use File::Path qw(make_path remove_tree); # IO::All PITA for dir manipulation use Path::Tiny; use FindBin; # warn $FindBin::Bin; use IO::All; use CGI; use lib $FindBin::RealBin . '/../../lib'; use NGS::ChartMaker; use DBIx::Simple; my $src_dir = $FindBin::Bin . '/src/analyse'; my @dirs = io($src_dir)->all; # print "$_ is a " . $_->type . "\n" for @dirs; exit; my %samples; # collect filenames from each dir, add to %samples hash: for my $d (@dirs) { # warn Dumper $_; next unless $d->type eq 'dir'; # skip eg png's my @files = io($d)->all; # Dumper \@files; for my $f (@files) { # warn $f->filename; my $filename = $f->filename; next unless $filename =~ /vcf\Z/; # warn $f->filename; push @{ $samples{$filename} }, $f; # io object } } # warn Dumper \%samples; exit; # delete and recreate analysis output dirs, one for each src dir: for ( my $i = 1; $i <= @dirs; $i++ ) { # warn $i; my $dir = $FindBin::Bin . '/analysis/' . $i; remove_tree($dir) if -e $dir; # not worth the hassle using IO::All make_path($dir); # ditto } my $dbix = _build_dbix(); # warn Dumper $dbix; my $EXCLUDES = _get_excludes(); # warn Dumper $EXCLUDES; my $q = CGI->new; my %h = (); # start-position indexer (global, used in parse(), reset in FILE loop) my %chart = ( x_title => 'Depth', y_title => 'VF', ); my %all_data; # params to collect from vcf file: my @params = qw( position vf alt ref gene filter depth runId); FILE: # check each file is represented in all dirs: while ( my($sample, $o) = each %samples ) { unless ( scalar @$o == scalar @dirs ) { # one object per file per directory printf "%s seen only %s times\n", $sample, scalar @$o; next FILE; } %h = (); # reset # parse file content in each directory, %h holds array of data for my $f(@$o) { # warn $f->name; parse($f); } # warn Dumper \%h; my %data; # array of all results (href) for this sample: while ( my($position, $d) = each %h ) { # warn Dumper $d; my ( $n, $results ) = ( $d->{count}, $d->{data} ); for my $result(@$results) { # add position to $result href: $result->{position} = $position; push @{ $data{$n} }, $result; } } $sample =~ s/\.vcf//; # warn $sample; # now have hash of all results from all dirs for this sample, keys = $n (1,2,3,etc) while ( my($n, $result) = each %data ) { my @rows = (); # reset $chart{data} = {}; # reset # sort each row by variation frequency (highest -> lowest): for my $row ( sort by_var_freq @$result ) { my $position = $row->{position}; push @rows, $q->Tr( $q->td( [ @{$row}{@params} ] ) ); push @{ $chart{data}{x} }, $row->{depth}; push @{ $all_data{$n}{x} }, $row->{depth}; push @{ $chart{data}{y} }, $row->{vf}; push @{ $all_data{$n}{y} }, $row->{vf}; } # warn Dumper \@rows; exit; my $total_variants = do { my %h; $h{ $_->{position} }++ for @$result; scalar keys %h; }; my $content = do { my %d = ( sample => $sample, total => $total_variants, rows => \@rows, ); _template(\%d); }; my $file = sprintf './analysis/%s/%s.html', $n, $sample; $content > io($file); $chart{chart_title} = $sample; # warn Dumper scalar @{ $chart{data}{x} }; $chart{data_title} = sprintf '%s/%s', $n, $sample; NGS::ChartMaker->new(%chart)->make_chart(); } } # output combined graphs, one each for n=1, n=2: for (1,2) { my %h = ( x_title => 'Depth', y_title => 'VF', chart_title => "All data $_", data_title => 'all_data_'.$_, data => $all_data{$_}, ); # warn Dumper scalar @{ $h{data}{y} }; NGS::ChartMaker->new(%h)->make_chart(); } # expand x-axis scale (0-4000) on n=1: { my %h = ( x_title => 'Depth', y_title => 'VF', max_x_val => 4000, chart_title => 'All data #1 expanded', data_title => 'all_data_1_expanded', data => $all_data{1}, ); NGS::ChartMaker->new(%h)->make_chart(); } exit 0; #=============================================================================== sub parse { my $f = shift; # io object my @folders = $f->splitdir; # warn $folders[-2]; # array of dirs in full path my @lines = io($f->name)->slurp; # warn Dumper \@lines; my %local = (); # indexer localised to individual file LINE: for (@lines) { next LINE if /^#/; # skip comments my @fields = split "\t", $_; my $start_pos = $fields[1]; # warn $start_pos; # warn $start_pos if $EXCLUDES->{$start_pos}; next LINE if (! $start_pos) || $EXCLUDES->{$start_pos}; # warn $start_pos if $EXCLUDES->{$start_pos}; # initialise intra-file start position indexer: $local{$start_pos} ||= 1; # only need true/false, not item frequency { # add data as array(ref) - can have multiple start pos entries per file: my $data = _parse_fields(\@fields); # add run directory name: $data->{runId} = $folders[-2]; # warn $data->{runId}; # 2nd to last push @{ $h{$start_pos}{data} }, $data; } } # auto-increment %h start position key for each start position key in %i: for my $start_position (keys %local) { $h{$start_position}{count}++ } } sub by_var_freq { return $b->{vf} <=> $a->{vf} || $a->{depth} <=> $b->{depth}; } sub _template { my $args = shift; # sample name (str), total (int), rows (arrayref) my ($sample, $total, $rows) = @{$args}{qw/sample total rows/}; return sprintf <<"HTML", join "\n", @$rows;

${sample}.vcf

Total variants: $total

%s
HTML } sub _build_dbix { my $db = path($FindBin::RealBin, '..', '..', 'ngs.sqlite')->realpath; my $dsn = "dbi:SQLite:dbname=$db"; # warn 'DB:'. $db; my $dbix = DBIx::Simple->connect($dsn); # warn Dumper $dbix; return $dbix; } sub _get_excludes { $dbix->select('locations', ['position', 1], {action => 'exclude'})->map; } sub _parse_fields { my $data = shift; # arrayref of tab-separated vcf row entries my $ref = $data->[3]; # warn $ref; my $alt = $data->[4]; # warn $alt; my $filter = $data->[6]; # warn $filter; my $info = $data->[7]; # warn $info my $results = $data->[9]; # warn $results; my $vf = (split ':', $results)[3]; # warn $vf; my ($depth) = $info =~ /DP=(\d+)/; # warn $depth; my ($genes) = $info =~ /GI=(.*?)\;/; # warn $genes; # non-greedy match my $gene = do { # remove duplicate genes: eg TP53,TP53,TP53 no warnings 'uninitialized'; # eg if regex failed due to malformed str my %seen = map { $_ => 1 } split ',', $genes; join ',', keys %seen; } || '[UNDEF]'; my %h = ( filter => $filter, depth => $depth, gene => $gene, ref => $ref, alt => $alt, vf => $vf, ); return \%h; }