#!/usr/bin/env perl =begin generates vep input file from MiSeq vcf src files for single MiSeq run =cut BEGIN { use lib '/home/raj/perl5/lib/perl5'; # ensure FindBin from local-lib use FindBin; # warn $FindBin::Bin; exit; use lib ( $FindBin::Bin . '/../../lib', $FindBin::Bin . '/../..', # so NGS::DB can find ngs.sqlite db ); } use Data::Dumper::Concise; use Modern::Perl '2012'; use IO::All; use NGS::DB; my $db = NGS::DB->new(); # warn $db->dbix; my $hotspots = $db->dbix->select('hotspots', ['position', 1])->map; # my $gene_strand = $db->dbix->select('gene_strand', [qw/gene_name strand/])->map; # warn Dumper [$hotspots, $gene_strand]; exit; my $src_dir = $FindBin::Bin . '/src'; my @dirs = io($src_dir)->all; # print "$_ is a " . $_->type . "\n" for @dirs; # cols for vep input file: my @cols = qw(chromosome start_pos end_pos allele strand sample_id); # create new vep input file for each MiSeq run (ie dir of vcf files): for my $d (@dirs) { # warn $d->dir; next; next unless $d->type eq 'dir'; # skip any files my @files = io($d)->all; # Dumper \@files; # extract dir name and path from $d io object: my ( $miseq_path, $miseq_run ) = ( $d->dir, $d->filename ); my @vep = (); # reset array - to hold rows for vep input file FILE: for my $f (@files) { # warn $f->filename; my $vcf_file = $f->filename; my $sample_id = join '/', $miseq_run, $vcf_file; next FILE unless $vcf_file =~ /vcf\Z/; # skip unless .vcf suffix my $data = _parse_file($f); # warn Dumper $data; # AoH for my $row (@$data) { # warn Dumper $row; # hashref # add sample ID (needs to be done in this block to get $miseq_run var): $row->{sample_id} = $sample_id; # write out vars as tab-delimited row to @vep array: my $vars = join "\t", @{$row}{@cols}; push @vep, $vars; } } { # create vep input file in MiSeq src data root dir: my $content = join "\n", @vep; my $vep_file = sprintf '%s/../%s.vep', $miseq_path, $miseq_run; $content > io($vep_file); # warn $vep_file; } } #=============================================================================== sub _parse_file { my $f = shift; # io object my @rows = io($f->name)->slurp; # warn Dumper \@rows; my @data = (); # for AoH ROW: for (@rows) { next ROW if /^#/; # skip comments my @fields = split "\t", $_; my $vals = _parse_fields(\@fields) || next ROW; # $vals = hashref push @data, $vals; } return \@data; } sub _parse_fields { my $data = shift; # arrayref of tab-separated vcf row entries return 0 unless $data->[0] =~ /^chr/; # or rest of regexes will fail my $chr = $data->[0]; # warn $chr; # eg chrX, chr2, etc my $pos = $data->[1]; # warn $pos; # numerical my $ref = $data->[3]; # warn $ref; # str of 1 or more A,C,T,G bases my $alt = $data->[4]; # warn $alt; # str of 1 or more A,C,T,G bases my $info = $data->[7]; # warn $info # DP, TI, GI, FC, GT, etc values my $results = $data->[9]; # warn $results; # colon-separated results (eg VF) my $vf = (split ':', $results)[3] || warn 'no VF info'; # warn $vf; # skip unless VF > 5% cutoff or start position is a known hotspot: return 0 unless $vf > 0.05 || $hotspots->{$pos}; my ($depth) = $info =~ /DP=(\d+)/; # warn $depth; my ($genes) = $info =~ /GI=(.*?)\;/; # warn $genes; # non-greedy match my $allele = undef; # defined in block below # define initial start & end positions = vcf 'position' var (2nd col): my $start_pos = my $end_pos = $pos; # define allele and refine start & end positions depending on ins/del/sub: if ( length $alt > 1 ) { # base insertion: $start_pos += 1; # end pos stays same $allele = sprintf '-/%s', substr($alt, 1); # alt minus 1st base } elsif ( length $ref > 1 ) { # base deletion: $start_pos += 1; $end_pos += length($ref) - 1; $allele = sprintf '%s/-', substr($ref, 1); # ref minus 1st base } else { # alt & ref both single ie base substitution: $allele = sprintf '%s/%s', $ref, $alt; # warn $allele; } my %h = ( chromosome => substr($chr, 3), # remove leading 'chr' chars start_pos => $start_pos, end_pos => $end_pos, allele => $allele, ); # warn Dumper \%h; =begin # not splitting genes, not using gene_strand table my @genes = do { # remove duplicate genes: eg TP53,TP53,TP53 no warnings 'uninitialized'; # eg if regex failed due to malformed str my %seen = map { $_ => 1 } split ',', $genes; keys %seen; } || warn 'no GI entry'; # warn @genes; GENE: for my $gene (@genes) { # + or - strand from gene_strand table: my $strand = $gene_strand->{$gene}; unless ($strand) { warn "$gene not in gene_strand table"; next GENE; } # warn Dumper [ $chromosome, $start_pos, $end_pos, $allele, $strand ]; $h{strand} = $strand; } =cut $h{strand} = '+'; # not using gene_strand table return \%h; }