#!/usr/bin/env perl =begin ------------------------------------------------------------------------- NGIS data extraction; check ngis_* tables up-to-date runs at 6am on 1st & 3rd Friday of month via cron: 0 6 1-7 * * [ "$(date '+\%a')" = "Fri" ] && $MULTI/$0 ... 0 6 15-21 * * [ "$(date '+\%a')" = "Fri" ] && $MULTI/$0 ... manual run: $0 options: -m = months -t = testing: saves files locally, doesn't email -q = query output --cumulative = all requests since 1st of month of ref date --console = verbose log to console --verbose = verbose log to file (overrides console) --reqId 10 --reqId 100 --reqId 10000, etc maps hilis4 lab tests to an NGIS test code using hilis4.ngis_lab_test table: if lab_test_id entry(s) exist in ngis_lab_test table: * if single row then use ngis_test_code value (diagnosis doesn't matter so request doesn't need to be reported) * if lab_test has multiple ngis_test_codes, the diagnosis clinical indication code (ngis_diagnosis.ngis_indication_id) is used to determine which (if any) ngis_test_code to use, where the diagnosis clinical indication must match one of the available ngis test id's eg HTS myeloid test has matching codes for a diagnosis of MDS, MPN, etc but not for CML * if NGIS test code cannot be found on diagnosis, a loose matching on presentation is attempted, including previous screen terms used, eg 'MDS' matches 'Suspected MDS', 'Follow-up MDS' * last resort is to force NGIS test code onto some test names (eg HTS myeloid) * recipient does not want technology (FISH, PCR, HTS, etc) so ETV6/RUNX1 (FISH) & ETV6-RUNX1 (PCR) potentially ambiguous * molecular tests are reassembled into panels (if panel has NGIS code) * FISH panels without NGIS codes are expanded to individual tests TODO: for molecular panels when 1 or more tests set complete in different months, panel not recognised and separate test codes returned eg 20536/19 complications (see geneq_data_extract.log): * hilis4 lab-tests/panels without ngis test id's - require default codes * lab tests/panels without a clinical indication mapped to diagnosis or screen * diagnostic terms without clinical indication (eg 'see comments', 'lab tests only', etc) * requests not reported when data collected =cut #------------------------------------------------------------------------------- #=============================================================================== my @data_recipients = qw( paul.mcintosh john.fraser raj ); my @log_recipients = qw( turner talley raj ); #=============================================================================== # new template wants some cols temporarily NULL (commented out in query now): my @null_fields = qw//; my $query_output = 0; # --query|q - output sql queries to console my $log_console = 0; # --console - verbose log to console my $verbose_log = 0; # --verbose - verbose log to file (overrides --console) my $cumulative = 0; # --cumulative - all requests since 1st of month of ref_date my $duration = 1; # --month|m - default unless specified in command-line opts my $testing = 0; # --testing|t - saves file locally, doesn't email my @REQUEST_IDS = (); # --request_id 1 --request_id 2, etc use Getopt::Long; GetOptions ( "months|m=i" => \$duration, # int "testing|t" => \$testing, # flag "query|q" => \$query_output, # flag "console" => \$log_console, # flag "verbose" => \$verbose_log, # flag "cumulative" => \$cumulative, # flag "reqId=i" => \@REQUEST_IDS, # int list (optional) ); # warn $duration; warn $cumulative; exit; use strict; use warnings; my $JUST_TESTING = $testing || 0; use lib '/home/raj/perl5/lib/perl5'; use FindBin qw($Bin); # warn $Bin; use Data::Printer; use Log::Any qw($log); use Log::Any::Adapter; use SQL::Abstract::More; use List::Util qw(maxstr); use Spreadsheet::WriteExcel::Simple; use lib $Bin . '/../../../lib'; use LIMS::Local::ScriptHelpers; use LIMS::Local::NGIS; use lib '/home/raj/perl-lib'; use Local::WriteExcel; use Local::DB; my $tools = LIMS::Local::ScriptHelpers->new(); $tools->test_only($JUST_TESTING); my $config = $tools->config(); my $today = $tools->time_now; my $dbix = Local::DB->dbix({ dbname => 'hilis4' }); $dbix->lc_columns = 0; # preserve mixed case on col names (now standardised) # get NGIS object: my $ngis = do { my %args = ( dbix => $dbix, query_output => $query_output, test_mode => $JUST_TESTING ); LIMS::Local::NGIS->new(%args); }; # p $ngis; $Local::DBIx::Simple::Result::NO_AUTO_DATE_INFLATION = 1; # or will defeat DATE(col) #------------------------------------------------------------------------------- my $ref_date = $today->clone->subtract( months => $duration ); # warn $ref_date; my $filename = sprintf 'GeNEQ_HMDS_data_%s.xls', $ref_date->strftime('%b_%Y'); # warn $filename; my $subject = sprintf 'GeNEQ HMDS data %s', $ref_date->strftime('%b %Y'); my $logfile = sprintf '%s/geneq_data_extract_%s.log', $Bin . '/../../../logs/geneq', lc $ref_date->strftime('%b_%Y'); # warn $logfile; # for verbose logging to console (can be set or reset later): Log::Any::Adapter->set('Stdout') if $log_console; # for verbose logging to logfile (can be set or reset later): Log::Any::Adapter->set(File => $logfile) if $verbose_log; $ngis->set_t0(); # start timer for data processing my $query = do { my ($sql, @bind) = _get_main_query(); # p $sql; p @bind; $dbix->query( $sql, @bind ); }; { # retrieve data from query and populate $ngis->data: my $data = $query->hashes; $ngis->runtimer('query runtime'); # p $data; exit; # pre-process data to reconstitute (some) molecular panels and split (some) # FISH panels, stores data inside $ngis object: $ngis->preprocess_data($data); } my $xl = Spreadsheet::WriteExcel::Simple->new; my @headers = grep $_ !~ /^_/, $query->columns; # p @headers; $xl->write_bold_row(\@headers); # store in ngis object for data generation: $ngis->col_headers(\@headers); $ngis->null_fields(\@null_fields); # retrieve preprocessed data from $ngis: my @data = @{$ngis->data}; # p @data; # AoH for my $ref (@data) { # p $ref; my $row = process_data($ref); # returns arrayref $xl->write_row($row); } $ngis->runtimer('completed data processing'); if ($JUST_TESTING) { $xl->save(join '/', $Bin, $filename); } else { my %mail = ( attachment => $xl->data, filename => $filename, subject => $subject, config => $config, ); $tools->send_mail(\%mail, \@data_recipients); } # log all misfits: _do_log(); #=============================================================================== sub process_data { my $ref = shift; # p $ref; # href # determine date lab-test requested (replaced by datetime_order_received): # $ref->{datetime_test_requested} = get_test_request_date($ref); # will need to be revived # set datetime_testing_complete to test completion datetime: $ref->{datetime_testing_complete} = $ref->{_datetime_test_completed}; { # set datetime_reporting_complete to later of testing_complete or authorisation: if ( $ref->{_datetime_authorised} ) { my @dates = @{$ref}{ qw/_datetime_authorised _datetime_test_completed/ }; # dates in yyyy-mm-dd format can be handled as strings for determination # of chronology using List::Util::maxstr or could do $dateA gt $dateB: $ref->{datetime_reporting_complete} = maxstr(@dates); # List::Util } else { # not authorised, set to _datetime_test_completed: $ref->{datetime_reporting_complete} = $ref->{_datetime_test_completed} } } =begin # datetime_processing_complete now set to D/RNA extraction date # set datetime_processing_complete to later of authorisation date or test # completion date; value will be null if request not yet authorised: # p $ref->{sample_processed_date}; p $ref->{_datetime_authorised}; if ( $ref->{_datetime_authorised} ) { my @dates = @{$ref}{ qw/_datetime_authorised datetime_testing_complete/ }; # dates in yyyy-mm-dd format can be handled as strings for determination # of chronology using List::Util::maxstr or could do $dateA gt $dateB: $ref->{datetime_processing_complete} = maxstr(@dates); # List::Util } # else sample_completed_date remains null =cut my $data = $ngis->finalise_data($ref); # adds some new fields, sets others undef return $data; } sub _main_query_cols { my $local_lab_name = $ngis->local_lab_name; # p $local_lab_name; my @cols = ( 'CONCAT_WS("/", r.request_number, r.year - 2000)|local_sample_id', # 'NULL|local_sample_id_sub', # 'NULL|local_dna_number', # 'NULL|local_dna_number_sub', 'p.id|internal_patient_id', 'r.id|internal_request_id', qq!"$local_lab_name"|booking_laboratory!, qq!"$local_lab_name"|processing_laboratory!, qq!"$local_lab_name"|testing_laboratory!, qq!"$local_lab_name"|reporting_laboratory!, # 'NULL|ped_number', # 'p.first_name|patient_first_name', # 'p.last_name|patient_last_name', # 'p.dob|patient_dob', 'p.gender|patient_sex', # 'p.nhs_number', # 'pd.address|patient_address', # 'NULL|patient_district_of_residence', 'NULL|patient_city', # 'pd.post_code|patient_postcode', # 'NULL|patient_country', 'ref.national_code|referrer_ods_code', # 'NULL|referrer_email', # 'NULL|referrer_telephone', # 'NULL|referrer_title', # 'NULL|referrer_first_name', # 'ref.name|referrer_last_name', 'hd.display_name|referrer_specialty', # required spelling !! # 'rs.display_name|referring_facility_name', # 'NULL|referrer_address', # 'NULL|referrer_city', # 'NULL|referrer_district', # 'NULL|referrer_postcode', # 'NULL|referrer_county', 'rs.organisation_code|referring_facility_ods_code', # 'NULLIF( pc.unit_number, "UNKNOWN")|unit_number', # NULL if a = b # 'NULL|other_unit_number', # 'IFNULL(rrv.clinical_details, "DiagnosticTest")|referral_reason', # a or b q!"Routine"|test_priority!, 'NULL|test_id', # NGIS test id (eg M80.6) 'lt.field_label|test_name', # 'ls.section_name|lab_section', 'NULL|ngis_id', # NGIS order number ? # 'NULL|datetime_test_requested', # replaced by datetime_order_received # 'lt.test_type|request_comments', # panel or test q!GROUP_CONCAT(DISTINCT s.description SEPARATOR '; ') AS sample_type_received!, 'rsd.specimen_date|datetime_sample_collected', # datetime 'r.created_at|datetime_sample_received', # datetime 'r.created_at|datetime_order_received', # datetime 'NULL|datetime_processing_complete', # set later 'NULL|datetime_testing_complete', # set later 'NULL|datetime_reporting_complete', # set later 'rsd.specimen_quality|sample_condition', # q!MAX( CASE WHEN ao.option_name = 'doi' # THEN 'yes' ELSE 'no' END ) AS infection_risk!, # 'NULL|sample_type_dispatched', # 'IF( rst.volume, "ul", NULL )|sample_units', # 'rst.volume|sample_amount', # q!IF( rst.concentration < 9999, rst.concentration, NULL ) # AS dna_concentration!, # 'NULL|sample_date_sent', # 'rst.vialId|sample_tube_id', # 'sr.plateId|sample_rack_id', # 'NULL|sample_courier', # 'NULL|sample_courier_ref', # 'NULL|sample_processed_date', # renamed datetime_processing_complete # 'NULL|sample_completed_date', # renamed datetime_testing_complete # fields used for data processing only (underscored for omission in @headers): 'lt.id|_lab_test_id', 'lt.test_type|_test_type', 'lt.test_name|_test_name', 'ls.section_name|_section_name', 'd.name|_diagnosis', 'nci.code|_clinical_indication_code', 's2.description|_presentation', 'rrv.diagnosis_id|_diagnosis_id', # q!MAX( CASE WHEN rsv.action = 'screened' THEN rsv.time END ) # AS _datetime_screened!, # not used # q!MAX( CASE WHEN rsv.action = 'reported' THEN rsv.time END ) # AS _datetime_reported!, # not used q!MAX( CASE WHEN rsv.action = 'authorised' THEN rsv.time END ) AS _datetime_authorised!, 'ts.time|_datetime_test_completed', # date lab-test set to complete ); return wantarray ? @cols : \@cols; } sub _get_main_query { my $is_private = q!MAX( CASE WHEN ao.option_name = 'private' THEN 1 END )!; my $begin_date = $ref_date->clone->set_day(1); # first day of $ref_date month my %date_restriction = $cumulative # if --cumulative Getopt: ? ( '>=', $begin_date->ymd ) # all since ref_date : ( -between => [ # first & last day of ref_date month: $begin_date->ymd, # 1st day $begin_date->add(months => 1)->subtract(days => 1)->ymd, # last day ] ); my $excluded_lab_test_ids = $ngis->excluded_lab_test_ids; my $excluded_parent_codes = $ngis->excluded_parent_codes; my $excluded_test_name_re = $ngis->excluded_test_names; my $lab_sections = $ngis->included_lab_sections; my $cols = _main_query_cols(); # these keep changing my @rels = ( 'requests|r' , q{r.patient_case_id=pc.id} => 'patient_case|pc' , q{pc.patient_id=p.id} => 'patients|p' , q{pc.referral_source_id=rs.id} => 'referral_sources|rs' , q{rs.parent_organisation_id=po.id} => 'parent_organisations|po' , q{r.referrer_department_id=rd.id} => 'referrer_department|rd' , q{rd.hospital_department_code=hd.id} => 'hospital_departments|hd' , q{rd.referrer_id=ref.id} => 'referrers|ref' , q{rsd.request_id=r.id} => 'request_specimen_detail|rsd' , q{rsp.request_id=r.id} => 'request_specimen|rsp' , q{rsp.specimen_id=s.id} => 'specimens|s' , q{ts.request_id=r.id} => 'request_lab_test_status|ts' , q{ts.status_option_id=so.id} => 'lab_test_status_options|so' , q{ts.lab_test_id=lt.id} => 'lab_tests|lt' , q{lt.lab_section_id=ls.id} => 'lab_sections|ls' , q{rsv.request_id=r.id} => 'request_status_view|rsv' , q{ris.request_id=r.id} => 'request_initial_screen|ris' , q{ris.screen_id=s2.id} => 'screens|s2' , # left joins: q{=>rrv.request_id=r.id} => 'request_report_view|rrv' , q{=>rrv.diagnosis_id=d.id} => 'diagnoses|d' , q{=>nd.diagnosis_id=d.id} => 'ngis_diagnosis|nd' , q{=>nd.ngis_indication_id=nci.id} => 'ngis_clinical_indications|nci', q{=>ro.request_id=r.id} => 'request_option|ro' , q{=>ro.option_id=ao.id} => 'additional_options|ao' , q{=>pd.patient_id=p.id} => 'patient_demographics|pd' , q{=>rst.request_id=r.id} => 'request_storage|rst' , q{=>rst.rack_id=sr.id} => 'storage_racks|sr' , q{=>rt.request_id=r.id} => 'request_trial|rt' , ); my @where = ( # contains repeated elements (eg parent_code), so requires array 'rt.request_id' => undef, # not in clinical trial 'so.description' => 'complete', # lab-test status 'ls.section_name' => { -in => $lab_sections }, 'rs.organisation_code' => { -not_rlike => '^NT' }, # private hospital 'po.parent_code' => { -not_rlike => '^(S|Z)' }, # scotland/ireland 'po.parent_code' => { -not_in => $excluded_parent_codes }, 'lt.test_name' => { -not_rlike => $excluded_test_name_re }, 'lt.id' => { -not_in => $excluded_lab_test_ids }, ); # restrict on request id's if supplied, or date: if (@REQUEST_IDS) { push @where, ( 'r.id' => { -in => \@REQUEST_IDS } ); } else { push @where, ( 'DATE(ts.time)' => \%date_restriction ); } # p @where; my @args = ( -columns => $cols, -from => [ -join => @rels ], # contains repeated elements (eg parent_code), so requires "-and => aref" -where => { -and => \@where }, -group_by => [ 'lt.id', 'r.id' ], -having => { $is_private => undef }, # not private patient -order_by => 'r.id', #-limit => 100, #-offset => 100, ); # p @args; # need to supply our own injection_guard to override built-in sql injection # attack prevention which detects ';' in GROUP_CONCAT, but we don't need one # here as we have no user input: my $sqla = SQL::Abstract::More->new( injection_guard => qr/^$/ ); my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind; $dbix->dump_query($sql, @bind) if $query_output; # exit; return ($sql, @bind); } sub _do_log { my $stash = $ngis->log_stash; # p $stash; my $no_ngis_code = $stash->{no_ngis_code}; my $unreported = $stash->{unreported}; my $no_ngis_code_requests = $stash->{no_ngis_code_requests}; my $no_clinical_indication = $stash->{no_clinical_indication}; my $no_available_ngis_code = $stash->{no_available_ngis_code}; my $filename = sprintf '%s/geneq_data_extract_log_%s.xlsx', $Bin . '/../../../logs/geneq', lc $ref_date->strftime('%b_%Y'); my $xl = Local::WriteExcel->new( filename => $filename ); # p $xl; if ($no_clinical_indication) { $xl->worksheet_name('no clinical indication'); my @cols = qw( lab_num test_name presentation diagnosis ); $xl->write_bold_row(\@cols); $xl->write_row($_) for @$no_clinical_indication; } if ($no_available_ngis_code) { $xl->worksheet_name('no available ngis test-code'); my @cols = qw( lab_num test_name ngis_codes diagnosis clinical_indication presentation ); $xl->write_bold_row(\@cols); $xl->write_row($_) for @$no_available_ngis_code; } if ($no_ngis_code_requests) { # p $no_ngis_code_requests; $xl->worksheet_name('no ngis test-code'); $xl->write_bold_row([ qw/lab_num test_name presentation diagnosis/ ] ); $xl->write_row($_) for @$no_ngis_code_requests; } if ($no_ngis_code) { # p $no_ngis_code; $xl->worksheet_name('no ngis test-code summary'); $xl->write_bold_row([ qw/test_name test_type lab_section count/ ] ); $xl->write_row([ (split ' :: ', $_ ), $no_ngis_code->{$_} ]) for sort keys %{ $no_ngis_code }; } if ($unreported) { $xl->worksheet_name('unreported'); $xl->write_bold_row([ qw/lab_num test_name presentation/] ); $xl->write_row($_) for @$unreported; } if ($JUST_TESTING) { $xl->save(); } else { my $subject = 'GeNEQ data extraction log ' . $ref_date->strftime('%b %Y'); # p $subject; # capture filename from its full path: $filename =~ /(geneq_data_extract_log_.*)/; # p $1; my %mail = ( attachment => $xl->data, filename => $1, # captured from above subject => $subject, config => $config, ); $tools->send_mail(\%mail, \@log_recipients); } { # text file: # log summary data to $logfile, override any previous adapter settings Log::Any::Adapter->set(File => $logfile); if ($no_clinical_indication) { $log->info('DIAGNOSIS & PRESENTATION WITH NO CLINICAL INDICATION'); $log->info(join ' :: ', @$_) for sort test_name_order @$no_clinical_indication; } if ($no_available_ngis_code) { # p $no_available_ngis_code; $log->info('DIAGNOSIS WITH NO AVAILABLE NGIS CODE & PRESENTATION ' . 'WITH NO CLINICAL INDICATION'); $log->info(join ' :: ', @$_) for sort test_name_order @$no_available_ngis_code; } if ($unreported) { $log->info('UNREPORTED REQUESTS'); $log->info(join ' :: ', @$_) for @$unreported; } if ($no_ngis_code) { $log->info('LAB TESTS WITH NO NGIS ID'); $log->info(join ':: ', $_, $no_ngis_code->{$_}) for sort keys %{ $no_ngis_code }; } } } sub test_name_order { # p $a; p $b; my ($req_num_a, $yr_a) = split '/', $a->[0]; my ($req_num_b, $yr_b) = split '/', $b->[0]; my $test_name_a = $a->[1]; my $test_name_b = $b->[1]; return $test_name_a cmp $test_name_b || $yr_a <=> $yr_b || $req_num_a <=> $req_num_b; } sub _test_name_order { # p $a; p $b; my ($req_num_a, $yr_a, $test_name_a) = $a =~ m!^(\d+)/(\d{2})\: (.*) \[!; # p $test_name_a; my ($req_num_b, $yr_b, $test_name_b) = $b =~ m!^(\d+)/(\d{2})\: (.*) \[!; # p $test_name_b; return $test_name_a cmp $test_name_b || $yr_a <=> $yr_b || $req_num_a <=> $req_num_b; }