#!/usr/bin/env perl =begin generates vep input file from myseq vcf src files for single myseq run =cut BEGIN { use lib '/home/raj/perl5/lib/perl5'; # ensure FindBin from local-lib use FindBin; # warn $FindBin::Bin; exit; use lib ( $FindBin::Bin . '/../../lib', $FindBin::Bin . '/../..', # for ngs.sqlite db ); } use Data::Dumper::Concise; use Modern::Perl '2012'; use IO::All; use NGS::DB; my $db = NGS::DB->new(); # warn $db->dbix; my $gene_strand = $db->dbix->select('gene_strand', [qw/gene_name strand/])->map; my $hotspots = $db->dbix->select('hotspots', ['position', 1])->map; # warn Dumper [$hotspots, $gene_strand]; exit; my $src_dir = $FindBin::Bin . '/src'; my @dirs = io($src_dir)->all; # print "$_ is a " . $_->type . "\n" for @dirs; my $a = my $b = 0; my %samples; # collect filenames from each dir, add to %samples hash: for my $d (@dirs) { # warn Dumper $_; next unless $d->type eq 'dir'; # skip eg png's my @files = io($d)->all; # Dumper \@files; for my $f (@files) { # warn $f->filename; my $filename = $f->filename; next unless $filename =~ /vcf\Z/; # warn $f->filename; push @{ $samples{$filename} }, $f; # io object } } # warn Dumper keys %samples; exit; my @vep; FILE: while ( my($sample, $o) = each %samples ) { # parse file content in each directory: for my $f(@$o) { # warn $f->name; my $data = parse($f); # warn Dumper $data; # arrayref push @vep, @$data; } warn Dumper @vep; } warn $a; warn $b; sub parse { my $f = shift; # io object my @folders = $f->splitdir; # warn $folders[-2]; # array of dirs in full path my @lines = io($f->name)->slurp; # warn Dumper \@lines; my @data = (); LINE: for (@lines) { next LINE if /^#/; # skip comments my @fields = split "\t", $_; my $start_pos = $fields[1] || next LINE; # warn $start_pos; { my $vals = _parse_fields(\@fields) || next LINE; # hashref # add sample name: $vals->{sample} = $folders[-1]; # warn $folders[-1]; push @data, $vals; } } return \@data; } sub _parse_fields { my $data = shift; # arrayref of tab-separated vcf row entries my $chr = $data->[0]; # warn $chr; my $pos = $data->[1]; # warn $pos; my $ref = $data->[3]; # warn $ref; my $alt = $data->[4]; # warn $alt; my $filter = $data->[6]; # warn $filter; my $info = $data->[7]; # warn $info my $results = $data->[9]; # warn $results; my $vf = (split ':', $results)[3]; # warn $vf; my ($depth) = $info =~ /DP=(\d+)/; # warn $depth; my ($genes) = $info =~ /GI=(.*?)\;/; # warn $genes; # non-greedy match my ($chromosome) = $chr =~ /chr([\w\d]+)/; # warn Dumper [$chr, $chromosome]; my @genes = do { # remove duplicate genes: eg TP53,TP53,TP53 no warnings 'uninitialized'; # eg if regex failed due to malformed str my %seen = map { $_ => 1 } split ',', $genes; keys %seen; } || warn 'no GI entry'; # warn @genes; #$a++; # skip unless VF > 5% cutoff or start position is a known hotspot: return 0 unless $vf > 0.05 || $hotspots->{$pos}; #$b++; my $start_pos = my $end_pos = $pos; my $allele; # insertion: if ( length $alt > 1 ) { $start_pos += 1; my @bases = split //, $alt; # warn Dumper \@bases; shift @bases; $allele = sprintf '-/%s', join '', @bases; # warn Dumper [$alt,$allele]; } # deletion: elsif ( length $ref > 1 ) { $start_pos += 1; $end_pos += ( (length $ref) - 1 ); my @bases = split //, $ref; # warn Dumper \@bases; shift @bases; $allele = sprintf '%s/-', join '', @bases; # warn Dumper [$ref,$allele]; } # warn Dumper [$pos,$alt,$ref,$start_pos,$end_pos]; # substitution: $allele ||= sprintf '%s/%s', $ref, $alt; # warn $allele; my %h = ( chromosome => $chromosome, start_pos => $start_pos, end_pos => $end_pos, allele => $allele, strand => '+', # don't need to use gene_strand table ); =begin # not splitting genes, not using gene_strand table GENE: for my $gene (@genes) { # + or - strand from gene_strand table: my $strand = $gene_strand->{$gene}; unless ($strand) { warn "$gene not in gene_strand table"; next GENE; } # warn Dumper [ $chromosome, $start_pos, $end_pos, $allele, $strand ]; $h{strand} = $strand; } =cut return \%h; }