#!/usr/bin/env perl # analyses individual samples supplied by argv[0] across multiple directories use lib '/home/raj/perl5/lib/perl5'; use Modern::Perl '2012'; use Data::Dumper; use FindBin; # warn $FindBin::Bin; use IO::All; use lib $FindBin::Bin . '/../../lib'; use NGS::ChartMaker; my $src = $ARGV[0]; unless ($src) { print "need a vcf file identifier !!\n\n"; exit; } my $src_dir = $FindBin::Bin . '/src'; my @dirs = io($src_dir)->all; # print "$_ is a " . $_->type . "\n" for @dirs; my %h; # start-position indexer # data structure for chartdirector: my %chart = ( data_title => $src ); # extract files from each dir, skip all except matching supplied filename: for my $d (@dirs) { # warn Dumper $_; next unless $d->type eq 'dir'; # skip eg png's my @files = io($d)->all; # Dumper \@files; for my $f (@files) { # warn $f->filename; next unless $f->filename =~ /^$src/; # warn $f->filename; parse($f->name); } } # warn Dumper \%h; exit; # printf "| %9s | %s |\n", $_, $h{$_}{count} for sort by_frequency keys %h; # map print Dumper $h{$_}, grep { $h{$_}{count} == 1 } sort by_start_pos keys %h; my $format = "| %9s | %6s | %8s | %10s | %4s | %6s |\n"; my @dividr = ( '=' x 9, '=' x 6, '=' x 8, '=' x 10, '=' x 4, '=' x 6 ); printf $format, qw( START REF ALT FILTER DP VF ); printf $format, @dividr; for my $location ( sort by_start_pos keys %h ) { # warn $location; next unless $h{$location}{count} == 1; my $data = $h{$location}{data}; # warn Dumper $data; for (@$data) { push @{ $chart{data}{x} }, $_->{depth}; push @{ $chart{data}{y} }, $_->{vf}; } printf $format, $location, @{$_}{ qw/ref alt filter depth vf/ } for @$data; printf $format, @dividr; } # generate chart of depth (x) vs variable frequency (y): @chart{ qw/x_title y_title chart_title/ } = ( 'Depth', 'VF', 'VCF analysis' ); my $c = NGS::ChartMaker->new(%chart); $c->make_chart(); exit 0; #=============================================================================== sub parse { my $filename = shift; my @lines = io($filename)->slurp; # warn Dumper \@lines; my %local = (); # indexer localised to individual file LINE: for (@lines) { next LINE if /^#/; # skip comments my @fields = split "\t", $_; my $start_pos = $fields[1] || next LINE; # warn $start_pos; # initialise intra-file start position indexer: $local{$start_pos} ||= 1; # only need true/false, not item frequency { # add data as array(ref) - can have multiple start pos entries per file: my $data = _parse_fields(\@fields); push @{ $h{$start_pos}{data} }, $data; } } # auto-increment %h start position key for each start position key in %i: for my $start_position (keys %local) { $h{$start_position}{count}++ } } sub by_frequency { # 1st by frequency, 2nd by start position: return $h{$b}{count} <=> $h{$a}{count} || $a <=> $b; } sub by_start_pos { return $a <=> $b; } sub _parse_fields { my $data = shift; # arrayref of tab-separated vcf row entries my ($ref,$alt) = @{$data}[3,4]; # warn Dumper [$ref,$alt]; my $filter = $data->[6]; # warn $filter; my ($depth) = $data->[7] =~ /DP=(\d+)/; # warn $depth; my $results = $data->[-1]; # warn $results; my $vf = (split ':', $results)[3]; # warn $vf; my %h = ( filter => $filter, depth => $depth, ref => $ref, alt => $alt, vf => $vf, ); return \%h; }