#!/usr/bin/env perl =begin ------------------------------------------------------------------------- molecular data extract for NCRAS (PHE) - compliments ncrs_cosd.pl with individual test results; gets HTS data from Hydra HTS db excludes lab-tests matching: extraction, selection, quantification, control, h_and_e, refer_material, store_[dr]na excludes FISH controls probes (CENxx) with "normal" results (or failed) =cut #------------------------------------------------------------------------------- my @normal_fish_control_results = # may need updating with any new options: ('not required', 'normal signal pattern', 'fail'); #------------------------------------------------------------------------------- my $query_output = 0; # --query|q - output sql queries to console my $cumulative = 0; # --cumulative - all requests since 1st of month of ref_date my $time_this = 0; # --time_this (Time::HiRes) my $duration = 1; # month; default unless specified in command-line opts my $testing = 0; # --testing|t - save file locally, no email use Getopt::Long; GetOptions ( "cumulative" => \$cumulative, # flag "months|m=i" => \$duration, # int "testing|t" => \$testing, # flag "time_this" => \$time_this, # flag "query|q" => \$query_output, # flag ); # warn $duration; warn $cumulative; exit; use strict; use warnings; use feature 'say'; use lib '/home/raj/perl5/lib/perl5'; use FindBin qw($Bin); # warn $Bin; use Clone 'clone'; use Config::JSON; use Data::Printer; use SQL::Abstract::More; use Spreadsheet::WriteExcel::Simple; use Time::HiRes qw(gettimeofday tv_interval); use lib $Bin . '/../../../lib'; use LIMS::Local::ScriptHelpers; use lib '/home/raj/perl-lib'; use Local::DB; #=============================================================================== my @recipients = ( 'phe.ecricin.secure', 'paul.glover.secure', 'raj.secure', ); #=============================================================================== # check HTS db config file exists (not git controlled) or die: my $hts_db_config = $Bin . '/hts.cfg'; -e $hts_db_config or die 'require HTS db connection params'; my $tools = LIMS::Local::ScriptHelpers->new(); $tools->test_only($testing); # defaults to 0 my $config = $tools->config(); my $today = $tools->time_now; my $dbix = Local::DB->dbix({ dbname => 'hilis4' }); $dbix->lc_columns = 0; # preserve mixed case on col names # remote HTS database connection: my $dbix_hts = do { my $cfg = Config::JSON->new($hts_db_config); # p $cfg->config; # auto-dies on failure to load my $ref = $cfg->get('params'); # p $ref; Local::DB->dbix($ref); }; # p $dbix_hts; # need to supply our own injection_guard to override built-in sql injection # attack prevention which detects ';' in GROUP_CONCAT, but we don't need one # here as we have no user input: my $sqla = SQL::Abstract::More->new( injection_guard => qr/^$/ ); $Local::DBIx::Simple::Result::NO_AUTO_DATE_INFLATION = 1; # or will defeat DATE(col) my $ref_date = $today->clone->subtract( months => $duration ); # warn $ref_date; my $subject = sprintf 'HMDS molecular data (%s)', $ref_date->strftime('%b_%Y'); my $filename = sprintf 'hmds_molecular_data_%s.xls', lc $ref_date->strftime('%b %Y'); # warn $filename; my @lab_sections = ( 'FISH', 'Molecular', 'Micro-array', 'High-throughput sequencing', 'Multiplex Ligation-dependent Probe Amplification' ); my $excluded_test_name_re = join '|', qw( extraction selection quantification control h_and_e refer_material store_[dr]na ); # p $excluded_test_name_re; # get lab-tests auto-requested at registration: my $registration_lab_tests = registration_lab_test_requests(); # p $registration_lab_tests; my ($sql, @bind) = _get_main_query(); # p $sql; p @bind; my $t0 = [gettimeofday]; my $query = $dbix->query( $sql, @bind ); # get cols from query, except 'private' ones used for evaluations (eg _specimen): my @headers = grep $_ !~ /^_/, $query->columns; # p @headers; # my @data = $query->hashes; runtimer('query runtime') if $time_this; my $xl = Spreadsheet::WriteExcel::Simple->new; $xl->write_bold_row(\@headers); my @rows; # global for processed data runtimer('commencing data processing') if $time_this; while ( my $ref = $query->hash ) { # p $ref->{_request_id}; process_data($ref); # pushes processed data onto @rows } runtimer('completed data processing') if $time_this; $xl->write_row($_) for @rows; #=============================================================================== if ($testing) { $xl->save($Bin . '/' . $filename); } else { my %mail = ( config => $config, subject => $subject, filename => $filename, attachment => $xl->data, ); $tools->send_mail(\%mail, \@recipients); } #=============================================================================== sub process_data { my $data = shift; # skip FISH control probes with "normal" result (can't do this in query): return if $data->{test_method} eq 'FISH' && $data->{test_name} =~ /control|CEN[1-9]/ # test_name = field_label && ( ! $data->{test_result} || grep $data->{test_result} eq $_, @normal_fish_control_results ); $data->{test_requested} = test_request_date($data); { # remove single char clinical details & trailing '.', undef -> 'not recorded': no warnings 'uninitialized'; # some legacy cases (before March 2017) $data->{clinical_indication} =~ s/\.$//; # so dot-only becomes undef $data->{clinical_indication} ||= '[not recorded]'; $data->{clinical_indication} = '[not recorded]' unless length $data->{clinical_indication} > 1; } my $specimen = $data->{_specimen}; # p $sample_type; $data->{sample_type} = sample_type($specimen); $data->{sample_prep} = sample_prep($specimen); # test_method expects a technology: $data->{test_method} =~ s{Molecular}{PCR/Sanger}; # others OK (eg FISH, HTS) # get HTS params (cdna_change, protein_impact, refseq_transcript, cosmic_id) # can be >1 row per gene/probe: if ( my @data = retrieve_sequencing_data($data) ) { # AoH, or undef if not HTS # p { $data->{_request_number} => \@data }; for my $ref (@data) { # href my @col_names = keys %{$ref}; my @results = values %{$ref}; # clone $data & merge HTS results: my $c = clone $data; @{$c}{@col_names} = @results; # $c->{$_} = $ref->{$_} for keys %$ref; add_to_rows($c); } } else { # no HTS data: add_to_rows($data); } } sub add_to_rows { my $href = shift; push @rows, [ @{$href}{@headers} ] } sub retrieve_sequencing_data { # from HTS db: my $data = shift; # p $data; # skip db call unless it's an HTS test with a result: return undef unless $data->{test_method} =~ 'sequencing' && defined $data->{test_result}; my $request_number = $data->{_request_number}; my $test_name = $data->{test_name}; my $year = $data->{_year}; $test_name =~ s/(FLT3)-TKD/$1/; # HTS db uses FLT3 not FLT3-TDK $test_name =~ s/c-(KIT)/$1/; # HTS db uses KIT not c-KIT my @cols = ( # ** must match HTS cols in _get_main_query() ** 'gt.transcript_name|refseq_transcript', 'v.cNomen|cdna_change', 'v.pNomen|protein_impact', 'v.consequence', 'va.cosmic_id', 'vasr.vaf|vaf', ); my @rels = ( 'clinically_reported_variant|crv' , 'crv.clinically_reported_sample_id=crs.id' => 'clinically_reported_sample|crs' , 'crv.final_vasr_id=vasr.id' => 'variant_annotation_sample_run|vasr' , 'vasr.variant_annotation_id=va.id' => 'variant_annotation|va' , 'va.variant_id=v.id' => 'variant|v' , 'v.gene_transcript_id=gt.id' => 'gene_transcript|gt' , 'gt.gene_id=g.id' => 'gene|g' , 'crs.sample_id=s.id' => 'sample|s' , ); my %where = ( 'crv.final_reported' => 1, 's.request_number' => $request_number, 'g.gene_name' => $test_name, 's.year' => $year, ); my @args = ( -columns => \@cols, -from => [ -join => @rels ], -where => \%where, ); # p @args; my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind; $dbix->dump_query($sql, @bind) if $query_output; # exit; my @results = $dbix_hts->query($sql, @bind)->hashes; return wantarray ? @results : \@results; } sub sample_type { # record multiple sample types if provided: my $specimen = shift; my @specimens; if ( $specimen =~ /blood/ ) { # p $specimen; push @specimens, 'Blood'; } if ( $specimen =~ /bone marrow/ ) { # p $specimen; push @specimens, 'Bone marrow'; } if ( $specimen =~ /CSF/ ) { # p $specimen; push @specimens, 'CSF'; } push @specimens, 'Other' if ! @specimens; # default return join ', ', @specimens; } sub sample_prep { my $specimen = shift; # p $specimen if $specimen =~ /fixed/; my @specimens = split '; ', $specimen; my %preps; for (@specimens) { # p $_; # do 'marrow' 1st to avoid 'BM trephine, fixed' matching 'fixed': if ( /marrow/ ) { $preps{'Bone marrow'}++; } # can be fixed, unfixed, both, or 'BM trephine, fixed' (already matched): elsif ( /fixed/ ) { $preps{'Fresh tissue'}++ if /unfixed/; $preps{'Fixed tissue'}++ if /\bfixed/; } elsif ( /blood|scroll/ ) { $preps{'Fresh tissue'}++; } elsif ( /effusion/ ) { $preps{Effusion}++; } elsif ( /CSF/ ) { $preps{CSF}++; } elsif ( /block/ ) { $preps{'FFPE tissue'}++; } elsif ( /slide/ ) { $preps{'Archived sample'}++; } } my $str = join ', ', keys %preps; # say "$specimen => $str"; $str ||= 'Other'; # default if nothing matches return $str; } sub registration_lab_test_requests { my @cols = ( 'DISTINCT t2.field_label', 1 ); my @rels = ('specimen_lab_test|t1', 't1.lab_test_id=t2.id', 'lab_tests|t2'); my @args = ( -columns => \@cols, -from => [ -join => @rels ], ); # p @args; my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind; $dbix->dump_query($sql, @bind) if $query_output; # exit; return $dbix->query($sql, @bind)->map; } sub test_request_date { my $ref = shift; # p $ref; my $request_id = $ref->{_request_id}; my $registered = $ref->{sample_received}; my $test_name = $ref->{test_name}; my $screened = $ref->{_screen_date}; # use manual request date if exists, or auto-request date, or date of screening: my $manual_request_date = manual_test_request($request_id, $test_name); # auto-requested by sample-type, therefore date = registration: my $auto_request_date = $registration_lab_tests->{$test_name} ? $registered : 0; # zero OK as will be tested for truth below # p [$manual_request_date, $auto_request_date, $screened]; return $manual_request_date || $auto_request_date || $screened; } sub manual_test_request { my ($request_id, $test_name) = @_; my @args = ( -columns => [ 'DATE(time)' ], -from => [ 'request_lab_test_history' ], -where => { request_id => $request_id, action => { -rlike => '^(auto-)?requested( linked test)? ' . $test_name }, }, -order_by => [ '-time' ], # ORDER BY time DESC -limit => 1, # returns most recent only if >1 ); # p @args; my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind; # $dbix->dump_query($sql, @bind) if $query_output; # exit; return $dbix->query($sql, @bind)->value; } sub _date_restriction { my $begin_date = $ref_date->clone->set_day(1); # first day of $ref_date month my %date_restriction = $cumulative # if --cumulative Getopt: ? ( '>=', $begin_date->ymd ) # all since ref_date : ( -between => [ # first & last day of ref_date month: $begin_date->ymd, # 1st day $begin_date->add(months => 1)->subtract(days => 1)->ymd, # last day ] ); return \%date_restriction; } sub _get_requests { # with test result date within required date restriction: my $date_restriction = shift; my @rels = ( 'requests|r' , q{tr.request_id=r.id} => 'request_lab_test_results|tr' , q{tr.lab_test_id=lt.id} => 'lab_tests|lt' , q{lt.lab_section_id=ls.id} => 'lab_sections|ls' , # left joins: q{=>rt.request_id=r.id} => 'request_trial|rt' , ); my %where = ( 'rt.request_id' => undef, # not in clinical trial 'DATE(tr.time)' => $date_restriction, # test result date 'ls.section_name' => { -in => \@lab_sections }, 'lt.test_name' => { -not_rlike => $excluded_test_name_re }, # to restrict to specific request id's: # 'r.id' => { -in => [369247,369248] }, ); my @args = ( -columns => [ 'DISTINCT(r.id)' ], -from => [ -join => @rels ], -where => \%where, -group_by => ['r.id', 'lt.id'], -order_by => 'r.id', #-limit => 100, #-offset => 100, ); # p @args; my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind; $dbix->dump_query($sql, @bind) if $query_output; # exit; my $requests = $dbix->query($sql, @bind)->column; # p $requests; return $requests; } sub _get_main_query { # any change to HTS cols need updating in retrieve_sequencing_data(): my @cols = ( 'p.nhs_number', 'p.first_name', 'p.last_name', 'p.dob', 'pd.post_code', 'p.gender', 'rs.display_name|referral_source', 'ref.name|referrer', 'CONCAT_WS( "/", r.request_number, LPAD(r.year - 2000, 2, 0) ) AS local_sample_id', q!MAX( CASE WHEN ao.option_name = 'private' THEN 'Private' ELSE 'NHS' END ) AS service_level!, 'DATE(rsd.specimen_date)|specimen_collection_date', 'NULL|test_requested', # defined in get_test_request_date() 'DATE(r.created_at)|sample_received', q!MAX( CASE WHEN rsv.action = 'authorised' THEN DATE(rsv.time) END ) AS report_date!, 'rrd.clinical_details|clinical_indication', 'NULL|sample_type', # defined in sample_type() 'rsd.biopsy_site|sample_origin', # 'NULL|percentage_tumour', # not supported 'NULL|sample_prep', # defined in sample_prep() 'ls.section_name|test_method', # 'NULL|test_scope', # not supported 'tr.result|test_result', 'DATE(tr.time)|result_date', q!CASE WHEN lt2.id IS NOT NULL THEN lt2.field_label ELSE lt.field_label END as test_name!, # gene/protein 'NULL|cdna_change', # defined in retrieve_sequencing_data() 'NULL|protein_impact', # defined in retrieve_sequencing_data() # 'NULL|karyotype', # not supported 'NULL|consequence', # defined in retrieve_sequencing_data() 'NULL|refseq_transcript', # defined in retrieve_sequencing_data() 'NULL|cosmic_id', # defined in retrieve_sequencing_data() 'NULL|vaf', # defined in retrieve_sequencing_data() 'rrs.results_summary', # private (underscored) fields for data processing only: 'r.id|_request_id', 'r.request_number|_request_number', 'r.year|_year', 'lt.field_label|_original_test_name', 'DATE(tr.time)|_result_date', q!CASE WHEN lt2.id IS NOT NULL THEN 'yes' ELSE 'no' END as _expansion!, q!GROUP_CONCAT(DISTINCT s.description SEPARATOR '; ')|_specimen!, q!MAX( CASE WHEN rsv.action = 'screened' THEN DATE(rsv.time) END ) AS _screen_date!, ); my $tr_join = 'CASE WHEN lt2.id IS NOT NULL THEN lt2.id ELSE lt.id END'; my @rels = ( 'requests|r' , q{r.patient_case_id=pc.id} => 'patient_case|pc' , q{pc.patient_id=p.id} => 'patients|p' , q{pc.referral_source_id=rs.id} => 'referral_sources|rs' , q{rs.parent_organisation_id=po.id} => 'parent_organisations|po' , q{r.referrer_department_id=rd.id} => 'referrer_department|rd' , q{rd.referrer_id=ref.id} => 'referrers|ref' , q{rsp.request_id=r.id} => 'request_specimen|rsp' , q{rsp.specimen_id=s.id} => 'specimens|s' , q{rsd.request_id=r.id} => 'request_specimen_detail|rsd' , q{rrd.request_id=r.id} => 'request_report_detail|rrd' , q{ts.request_id=r.id} => 'request_lab_test_status|ts' , q{ts.lab_test_id=lt.id} => 'lab_tests|lt' , q{lt.lab_section_id=ls.id} => 'lab_sections|ls' , q{rsv.request_id=r.id} => 'request_status_view|rsv' , # left joins: q{=>ro.request_id=r.id} => 'request_option|ro' , q{=>ro.option_id=ao.id} => 'additional_options|ao' , q{=>pd.patient_id=p.id} => 'patient_demographics|pd' , q{=>plt.panel_test_id=lt.id} => 'panel_lab_test|plt' , q{=>plt.lab_test_id=lt2.id} => 'lab_tests|lt2' , qq{=>tr.request_id=r.id,tr.lab_test_id=$tr_join} => 'request_lab_test_results|tr' , q{=>rrs.request_id=r.id,rrs.lab_section_id=ls.id} => 'request_result_summaries|rrs' , ); my $date_restriction = _date_restriction(); my $request_ids = _get_requests($date_restriction); # aref # [369247,369248]; # to restrict to specific request id's my %where = ( 'r.id' => { -in => $request_ids }, -or => { 'DATE(tr.time)' => $date_restriction, # test result date 'tr.id' => undef, # no lab-test result }, 'ls.section_name' => { -in => \@lab_sections }, 'lt.test_name' => { -not_rlike => $excluded_test_name_re }, # 'rs.organisation_code' => { -not_like => 'GENEQ%' }, # is now required # ? can't do this in SQLAM (not appropriate now, need to use result also): # CASE WHEN lt2.id IS NOT NULL THEN lt2.field_label NOT RLIKE 'CEN[1-9]' # ELSE lt.field_label NOT RLIKE 'CEN[1-9]' END ); my @args = ( -columns => \@cols, -from => [ -join => @rels ], -where => \%where, -group_by => [ 'r.id', 'CASE WHEN lt2.id IS NOT NULL THEN lt2.id ELSE lt.id END' ], -order_by => 'r.id', #-limit => 100, #-offset => 100, ); # p @args; my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind; $dbix->dump_query($sql, @bind) if $query_output; # exit; return ($sql, @bind); } sub runtimer { say sprintf "$_[0]: %.2f sec", tv_interval $t0, [gettimeofday] } __END__ DROP TABLE IF EXISTS _request_ids; CREATE TEMPORARY TABLE _request_ids (id INT); INSERT INTO _request_ids SELECT DISTINCT(r.id) FROM requests AS r JOIN request_lab_test_results AS tr ON ( tr.request_id = r.id ) JOIN lab_tests AS lt ON ( tr.lab_test_id = lt.id ) JOIN lab_sections AS ls ON ( lt.lab_section_id = ls.id ) LEFT JOIN request_trial AS rt ON ( rt.request_id = r.id ) WHERE ( ( ( DATE(tr.time) BETWEEN '2019-06-01' AND '2019-06-30' ) AND ls.section_name IN ( 'FISH', 'Molecular', 'Micro-array', 'High-throughput sequencing', 'Multiplex Ligation-dependent Probe Amplification' ) AND lt.test_name NOT RLIKE 'extraction|selection|quantification|control|h_and_e|refer_material|store_[dr]na' AND rt.request_id IS NULL ) ); # SELECT * FROM _request_ids; SELECT p.nhs_number, p.first_name, p.last_name, p.dob, pd.post_code, p.gender, rs.display_name AS referral_source, ref.name AS referrer, CONCAT_WS( "/", r.request_number, LPAD(r.year - 2000, 2, 0) ) AS local_sample_id, MAX( CASE WHEN ao.option_name = 'private' THEN 'Private' ELSE 'NHS' END ) AS service_level, DATE(rsd.specimen_date) AS specimen_collection_date, NULL AS test_requested, DATE(r.created_at) AS sample_received, MAX( CASE WHEN rsv.action = 'authorised' THEN DATE(rsv.time) END ) AS report_date, rrd.clinical_details AS clinical_indication, NULL AS sample_type, rsd.biopsy_site AS sample_origin, NULL AS sample_prep, lt.field_label AS _original_test_name, CASE WHEN lt2.id IS NOT NULL THEN 'yes' ELSE 'no' END as '_expansion', CASE WHEN lt2.id IS NOT NULL THEN lt2.field_label ELSE lt.field_label END as test_name, ls.section_name AS test_method, tr.result AS test_result, NULL AS cdna_change, NULL AS protein_impact, NULL AS consequence, NULL AS refseq_transcript, NULL AS cosmic_id, NULL AS vaf, rrs.results_summary, r.id AS _request_id, r.request_number AS _request_number, r.year AS _year, GROUP_CONCAT(DISTINCT s.description SEPARATOR '; ') AS _specimen, MAX( CASE WHEN rsv.action = 'screened' THEN DATE(rsv.time) END ) AS _screen_date FROM _request_ids JOIN requests r ON _request_ids.id = r.id JOIN patient_case AS pc ON ( r.patient_case_id = pc.id ) JOIN patients AS p ON ( pc.patient_id = p.id ) JOIN referral_sources AS rs ON ( pc.referral_source_id = rs.id ) JOIN parent_organisations AS po ON ( rs.parent_organisation_id = po.id ) JOIN referrer_department AS rd ON ( r.referrer_department_id = rd.id ) JOIN referrers AS ref ON ( rd.referrer_id = ref.id ) JOIN request_specimen AS rsp ON ( rsp.request_id = r.id ) JOIN specimens AS s ON ( rsp.specimen_id = s.id ) JOIN request_specimen_detail AS rsd ON ( rsd.request_id = r.id ) JOIN request_report_detail AS rrd ON ( rrd.request_id = r.id ) JOIN request_lab_test_status ts on ts.request_id = r.id JOIN lab_tests lt on ts.lab_test_id = lt.id JOIN lab_sections ls on lt.lab_section_id = ls.id JOIN request_status_view AS rsv ON ( rsv.request_id = r.id ) LEFT JOIN request_option AS ro ON ( ro.request_id = r.id ) LEFT JOIN additional_options AS ao ON ( ro.option_id = ao.id ) LEFT JOIN patient_demographics AS pd ON ( pd.patient_id = p.id ) LEFT JOIN request_result_summaries AS rrs ON ( ( rrs.lab_section_id = ls.id AND rrs.request_id = r.id ) ) LEFT JOIN panel_lab_test plt on plt.panel_test_id = lt.id LEFT JOIN lab_tests lt2 on plt.lab_test_id = lt2.id LEFT JOIN request_lab_test_results tr ON tr.request_id = r.id AND tr.lab_test_id = CASE WHEN lt2.field_label IS NOT NULL THEN lt2.id ELSE lt.id END WHERE ls.section_name IN ( 'FISH', 'Molecular', 'Micro-array', 'High-throughput sequencing', 'Multiplex Ligation-dependent Probe Amplification' ) AND lt.test_name NOT RLIKE 'extraction|selection|quantification|control|h_and_e|refer_material|store_[dr]na' /* throw away any results not between original selection dates: */ AND ( tr.time IS NULL OR DATE(tr.time) BETWEEN '2019-06-01' AND '2019-06-30' ) GROUP BY r.id, CASE WHEN lt2.id IS NOT NULL THEN lt2.id ELSE lt.id END ORDER BY r.id;