#!/usr/bin/env perl
=begin -------------------------------------------------------------------------
molecular data extract for NCRAS (PHE) - compliments ncrs_cosd.pl with individual
test results; gets HTS data from Hydra HTS db
=cut
#-------------------------------------------------------------------------------
my $query_output = 0; # --query|q - output sql queries to console
my $cumulative = 0; # --cumulative - all requests since 1st of month of ref_date
my $time_this = 0; # --time_this (Time::HiRes)
my $duration = 1; # month; default unless specified in command-line opts
my $testing = 0; # --testing|t - nothing yet
use Getopt::Long;
GetOptions (
"cumulative" => \$cumulative, # flag
"months|m=i" => \$duration, # int
"testing|t" => \$testing, # flag
"time_this" => \$time_this, # flag
"query|q" => \$query_output, # flag
); # warn $duration; warn $cumulative; exit;
use strict;
use warnings;
use feature 'say';
use lib '/home/raj/perl5/lib/perl5';
use FindBin qw($Bin); # warn $Bin;
use Clone 'clone';
use Config::JSON;
use Data::Printer;
use SQL::Abstract::More;
use Spreadsheet::WriteExcel::Simple;
use Time::HiRes qw(gettimeofday tv_interval);
use lib $Bin . '/../../../lib';
use LIMS::Local::ScriptHelpers;
use lib '/home/raj/perl-lib';
use Local::DB;
#===============================================================================
my @recipients = (
'phe.ecricin.secure',
'paul.glover.secure',
'raj.secure',
);
#===============================================================================
# check HTS db config file exists (not git controlled) or die:
my $hts_db_config = $Bin . '/hts.cfg';
-e $hts_db_config or die 'require HTS db connection params';
my $tools = LIMS::Local::ScriptHelpers->new();
$tools->test_only($testing); # defaults to 0
my $config = $tools->config();
my $today = $tools->time_now;
my $dbix = Local::DB->dbix({ dbname => 'hilis4' });
$dbix->lc_columns = 0; # preserve mixed case on col names
# remote HTS database connection:
my $dbix_hts = do {
my $cfg = Config::JSON->new($hts_db_config); # p $cfg->config; # auto-dies on failure to load
my $ref = $cfg->get('params'); # p $ref;
Local::DB->dbix($ref);
}; # p $dbix_hts;
# need to supply our own injection_guard to override built-in sql injection
# attack prevention which detects ';' in GROUP_CONCAT, but we don't need one
# here as we have no user input:
my $sqla = SQL::Abstract::More->new( injection_guard => qr/^$/ );
$Local::DBIx::Simple::Result::NO_AUTO_DATE_INFLATION = 1; # or will defeat DATE(col)
my $ref_date = $today->clone->subtract( months => $duration ); # warn $ref_date;
my $subject = sprintf 'HMDS molecular data (%s)', $ref_date->strftime('%b_%Y');
my $filename = sprintf 'hmds_molecular_data_%s.xls',
lc $ref_date->strftime('%b %Y'); # warn $filename;
my @lab_sections = (
'FISH', 'Molecular', 'Micro-array', 'High-throughput sequencing',
'Multiplex Ligation-dependent Probe Amplification'
);
my $excluded_test_name_re = join '|', qw( extraction selection quantification
control h_and_e refer_material store_[dr]na ); # p $excluded_test_name_re;
# get lab-tests auto-requested at registration:
my $registration_lab_tests = registration_lab_test_requests(); # p $registration_lab_tests;
my ($sql, @bind) = _get_main_query(); # p $sql; p @bind;
my $t0 = [gettimeofday];
my $query = $dbix->query( $sql, @bind );
# get cols from query, except 'private' ones used for evaluations (eg _specimen):
my @headers = grep $_ !~ /^_/, $query->columns; # p @headers;
# my @data = $query->hashes; runtimer('query runtime') if $time_this;
my $xl = Spreadsheet::WriteExcel::Simple->new;
$xl->write_bold_row(\@headers);
my @rows; # global for processed data
runtimer('commencing data processing') if $time_this;
while ( my $ref = $query->hash ) { # p $ref->{_request_id};
process_data($ref); # pushes processed data onto @rows
}
runtimer('completed data processing') if $time_this;
$xl->write_row($_) for @rows;
#===============================================================================
if ($testing) {
$xl->save($Bin . '/' . $filename);
}
else {
my %mail = (
config => $config,
subject => $subject,
filename => $filename,
attachment => $xl->data,
);
$tools->send_mail(\%mail, \@recipients);
}
#===============================================================================
sub process_data {
my $data = shift;
$data->{test_requested} = test_request_date($data);
{ # remove single char clinical details & trailing '.', undef -> 'not recorded':
no warnings 'uninitialized'; # some legacy cases (before March 2017)
$data->{clinical_indication} =~ s/\.$//; # so dot-only becomes undef
$data->{clinical_indication} ||= '[not recorded]';
$data->{clinical_indication} = '[not recorded]' unless
length $data->{clinical_indication} > 1;
}
my $specimen = $data->{_specimen}; # p $sample_type;
$data->{sample_type} = sample_type($specimen);
$data->{sample_prep} = sample_prep($specimen);
# test_method expects a technology:
$data->{test_method} =~ s{Molecular}{PCR/Sanger}; # others OK (eg FISH, HTS)
# get HTS params (cdna_change, protein_impact, refseq_transcript, cosmic_id)
# can be >1 row per gene/probe:
if ( my @data = retrieve_sequencing_data($data) ) { # AoH, or undef if not HTS
# p { $data->{_request_number} => \@data };
for my $ref (@data) { # href
my @col_names = keys %{$ref};
my @results = values %{$ref};
# clone $data & merge HTS results:
my $c = clone $data;
@{$c}{@col_names} = @results; # $c->{$_} = $ref->{$_} for keys %$ref;
add_to_rows($c);
}
}
else { # no HTS data:
add_to_rows($data);
}
}
sub add_to_rows { my $href = shift; push @rows, [ @{$href}{@headers} ] }
sub retrieve_sequencing_data { # from HTS db:
my $data = shift; # p $data;
return undef unless $data->{test_method} =~ 'sequencing';
my $request_number = $data->{_request_number};
my $test_name = $data->{test_name};
my $year = $data->{_year};
$test_name =~ s/(FLT3)-TKD/$1/; # HTS db uses FLT3 not FLT3-TDK
$test_name =~ s/c-(KIT)/$1/; # HTS db uses KIT not c-KIT
my @cols = ( # ** must match HTS cols in _get_main_query() **
'gt.transcript_name|refseq_transcript',
'v.cNomen|cdna_change',
'v.pNomen|protein_impact',
'v.consequence',
'va.cosmic_id',
'vasr.vaf|vaf',
);
my @rels = ( 'clinically_reported_variant|crv' ,
'crv.clinically_reported_sample_id=crs.id'
=> 'clinically_reported_sample|crs' ,
'crv.final_vasr_id=vasr.id' => 'variant_annotation_sample_run|vasr' ,
'vasr.variant_annotation_id=va.id'
=> 'variant_annotation|va' ,
'va.variant_id=v.id' => 'variant|v' ,
'v.gene_transcript_id=gt.id' => 'gene_transcript|gt' ,
'gt.gene_id=g.id' => 'gene|g' ,
'crs.sample_id=s.id' => 'sample|s' ,
);
my %where = (
'crv.final_reported' => 1,
's.request_number' => $request_number,
'g.gene_name' => $test_name,
's.year' => $year,
);
my @args = (
-columns => \@cols,
-from => [ -join => @rels ],
-where => \%where,
); # p @args;
my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind;
$dbix->dump_query($sql, @bind) if $query_output; # exit;
my @results = $dbix_hts->query($sql, @bind)->hashes;
return wantarray ? @results : \@results;
}
sub sample_type { # record multiple sample types if provided:
my $specimen = shift;
my @specimens;
if ( $specimen =~ /blood/ ) { # p $specimen;
push @specimens, 'Blood';
}
if ( $specimen =~ /bone marrow/ ) { # p $specimen;
push @specimens, 'Bone marrow';
}
if ( $specimen =~ /CSF/ ) { # p $specimen;
push @specimens, 'CSF';
}
push @specimens, 'Other' if ! @specimens; # default
return join ', ', @specimens;
}
sub sample_prep { # just use 1st specimen type - too complex to consider multiple
my $specimen = shift; # p $specimen if $specimen =~ /fixed/;
if ( $specimen =~ /blood|unfixed|scroll/ ) { # p $specimen;
return 'Fresh tissue';
}
elsif ( $specimen =~ /fixed/ ) { # p $specimen; # must be AFTER 'unfixed' !!
return 'Fixed tissue';
}
elsif ( $specimen =~ /effusion/ ) { # p $specimen;
return 'Effusion';
}
elsif ( $specimen =~ /marrow/ ) { # p $specimen;
return 'Bone marrow';
}
elsif ( $specimen =~ /block/ ) { # p $specimen;
return 'FFPE tissue';
}
elsif ( $specimen =~ /slide/ ) { # p $specimen;
return 'Archived sample';
} # p $specimen;
# default:
return 'Other';
}
sub registration_lab_test_requests {
my @cols = ( 'DISTINCT t2.field_label', 1 );
my @rels = ('specimen_lab_test|t1', 't1.lab_test_id=t2.id', 'lab_tests|t2');
my @args = (
-columns => \@cols,
-from => [ -join => @rels ],
); # p @args;
my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind;
$dbix->dump_query($sql, @bind) if $query_output; # exit;
return $dbix->query($sql, @bind)->map;
}
sub test_request_date {
my $ref = shift; # p $ref;
my $request_id = $ref->{_request_id};
my $registered = $ref->{sample_received};
my $test_name = $ref->{test_name};
my $screened = $ref->{_screen_date};
# use manual request date if exists, or auto-request date, or date of screening:
my $manual_request_date = manual_test_request($request_id, $test_name);
# auto-requested by sample-type, therefore date = registration:
my $auto_request_date = $registration_lab_tests->{$test_name}
? $registered : 0; # zero OK as will be tested for truth below
# p [$manual_request_date, $auto_request_date, $screened];
return $manual_request_date || $auto_request_date || $screened;
}
sub manual_test_request {
my ($request_id, $test_name) = @_;
my @args = (
-columns => [ 'DATE(time)' ],
-from => [ 'request_lab_test_history' ],
-where => {
request_id => $request_id,
action => {
-rlike => '^(auto-)?requested( linked test)? ' . $test_name
},
},
-order_by => [ '-time' ], # ORDER BY time DESC
-limit => 1, # returns most recent only if >1
); # p @args;
my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind;
# $dbix->dump_query($sql, @bind) if $query_output; # exit;
return $dbix->query($sql, @bind)->value;
}
sub _get_main_query {
my $begin_date = $ref_date->clone->set_day(1); # first day of $ref_date month
my %date_restriction = $cumulative # if --cumulative Getopt:
? ( '>=', $begin_date->ymd ) # all since ref_date
: ( -between => [ # first & last day of ref_date month:
$begin_date->ymd, # 1st day
$begin_date->add(months => 1)->subtract(days => 1)->ymd, # last day
] );
# any change to HTS cols need updating in retrieve_sequencing_data():
my @cols = (
'p.nhs_number',
'p.first_name',
'p.last_name',
'p.dob',
'pd.post_code',
'p.gender',
'rs.display_name|referral_source',
'ref.name|referrer',
'CONCAT_WS( "/", r.request_number, LPAD(r.year - 2000, 2, 0) )
AS local_sample_id',
q!MAX( CASE WHEN ao.option_name = 'private' THEN 'Private'
ELSE 'NHS' END ) AS service_level!,
'DATE(rsd.specimen_date)|specimen_collection_date',
'NULL|test_requested', # defined in get_test_request_date()
'DATE(r.created_at)|sample_received',
q!MAX( CASE WHEN rsv.action = 'authorised' THEN DATE(rsv.time) END )
AS report_date!,
'rrd.clinical_details|clinical_indication',
'NULL|sample_type', # defined in sample_type()
'rsd.biopsy_site|sample_origin',
# 'NULL|percentage_tumour', # not supported
'NULL|sample_prep', # defined in sample_prep()
'ls.section_name|test_method',
# 'NULL|test_scope', # not supported
'tr.result|test_result',
'lt.field_label|test_name', # gene/protein
'NULL|cdna_change', # defined in retrieve_sequencing_data()
'NULL|protein_impact', # defined in retrieve_sequencing_data()
# 'NULL|karyotype', # not supported
'NULL|consequence', # defined in retrieve_sequencing_data()
'NULL|refseq_transcript', # defined in retrieve_sequencing_data()
'NULL|cosmic_id', # defined in retrieve_sequencing_data()
'NULL|vaf', # defined in retrieve_sequencing_data()
'rrs.results_summary',
# private (underscored) fields for data processing only:
'r.id|_request_id',
'r.request_number|_request_number',
'r.year|_year',
q!GROUP_CONCAT(DISTINCT s.description SEPARATOR '; ')|_specimen!,
q!MAX( CASE WHEN rsv.action = 'screened' THEN DATE(rsv.time) END )
AS _screen_date!,
);
my @rels = ( 'requests|r' ,
q{r.patient_case_id=pc.id} => 'patient_case|pc' ,
q{pc.patient_id=p.id} => 'patients|p' ,
q{pc.referral_source_id=rs.id} => 'referral_sources|rs' ,
q{rs.parent_organisation_id=po.id} => 'parent_organisations|po' ,
q{r.referrer_department_id=rd.id} => 'referrer_department|rd' ,
q{rd.referrer_id=ref.id} => 'referrers|ref' ,
q{rsp.request_id=r.id} => 'request_specimen|rsp' ,
q{rsp.specimen_id=s.id} => 'specimens|s' ,
q{rsd.request_id=r.id} => 'request_specimen_detail|rsd' ,
q{rrd.request_id=r.id} => 'request_report_detail|rrd' ,
q{tr.request_id=r.id} => 'request_lab_test_results|tr' ,
q{tr.lab_test_id=lt.id} => 'lab_tests|lt' ,
q{lt.lab_section_id=ls.id} => 'lab_sections|ls' ,
q{rsv.request_id=r.id} => 'request_status_view|rsv' ,
# left joins:
q{=>rt.request_id=r.id} => 'request_trial|rt' ,
q{=>ro.request_id=r.id} => 'request_option|ro' ,
q{=>ro.option_id=ao.id} => 'additional_options|ao' ,
q{=>pd.patient_id=p.id} => 'patient_demographics|pd' ,
q{=>rrs.request_id=r.id,rrs.lab_section_id=ls.id}
=> 'request_result_summaries|rrs' ,
);
my %where = (
'rt.request_id' => undef, # not in clinical trial
'DATE(tr.time)' => \%date_restriction, # test result date
'ls.section_name' => { -in => \@lab_sections },
'lt.test_name' => { -not_rlike => $excluded_test_name_re },
# to restrict to specific request id's:
# 'r.id' => { -in => [369247,369248] },
);
my @args = (
-columns => \@cols,
-from => [ -join => @rels ],
-where => \%where,
-group_by => ['r.id', 'lt.id'],
-order_by => 'r.id',
#-limit => 100,
#-offset => 100,
); # p @args;
my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind;
$dbix->dump_query($sql, @bind) if $query_output; # exit;
return ($sql, @bind);
}
sub runtimer { say sprintf "$_[0]: %.2f sec", tv_interval $t0, [gettimeofday] }