RSS Git Download  Clone
Raw Blame History
#!/usr/bin/env perl

=begin -------------------------------------------------------------------------
NGIS data extraction; require ngis-diagnosis.pl & ngis-test-directory.pl running 1st

maps hilis4 lab_test_id to NGIS test id using hilis4.ngis_lab_test table:
if lab_test_id entry in ngis_lab_test table:
* if single row then use ngis_test_code (diagnosis doesn't matter so request
    doesn't need to be reported)
* if lab_test has multiple ngis_test_codes, diagnosis.ngis_indiation_id is used
    to determine which (if any) ngis_test_code to use, where the diagnosis
    clinical indication must match one of the ngis test id's (eg HTS myeloid
    panel has matching codes for MDS, MPN, etc but not for CML)
complications (see geneq_data_extract.log):
* many hilis4 lab-tests do not yet have ngis test id's
* many lab tests do not have a clinical indication mapped to the diagnosis
* many diagnostic terms do not have any clinical indications (eg 'see comments',
    'lab tests only', etc)
* some requests not reported when data collected
eg: multiple NGIS ids for 'HTS myeloid' and 'See comments' has no clinical
    indication id
=cut
#-------------------------------------------------------------------------------

use Getopt::Std;
getopts('m:tq'); # months, testing, query output
our($opt_m,$opt_t,$opt_q); # warn $opt_t; exit;

use strict;
use warnings;
use feature 'say';

my $JUST_TESTING = $opt_t || 0; # dumps xls file in /tmp only

#==============================================================================
my $duration = $opt_m || 1; # months;
#==============================================================================

use lib '/home/raj/perl5/lib/perl5';
use IO::All;
use Data::Printer;
use SQL::Abstract::More;
use List::Util qw(maxstr);
use Spreadsheet::WriteExcel::Simple;
use Time::HiRes qw(gettimeofday tv_interval);

use FindBin qw($Bin); # warn $Bin;
use lib $Bin . '/../../lib';
use LIMS::Local::ScriptHelpers;

use lib '/home/raj/perl-lib';
use Local::DB;

my $tools = LIMS::Local::ScriptHelpers->new();
$tools->test_only($JUST_TESTING);

my $today = $tools->time_now;
my $dbix  = Local::DB->dbix({ dbname => 'hilis4' });
$dbix->lc_columns = 0; # preserve mixed case on col names (now standardised)

$Local::DBIx::Simple::Result::NO_AUTO_DATE_INFLATION = 1; # or will defeat DATE(col)

my $log = $Bin . '/geneq_data_extract.log';
io($log)->print; # reset

my %STASH; # for logging/debugging

#-------------------------------------------------------------------------------
my $filename = $Bin.'/geneq_data.xls';
my $ref_date = $today->clone->subtract( months => 1 ); # warn $ref_date;

# check cytogenetics
my @lab_sections = (
    'FISH', 'Molecular', 'Micro-array', 'High-throughput sequencing',
    'Multiplex Ligation-dependent Probe Amplification'
);
my @excluded_lab_tests = qw(
    cell_selection_quality
    sanger_sequencing
    refer_material
    h_and_e_fish
    fish_h_and_e
    dna_control
    abl_control
);
my @excluded_parent_codes = (
	'RWM', 	  # Cardiff
	'RVA', 	  # Carmarthen
	'RT8', 	  # Conwy
	'RVF', 	  # Gwent
	'RT9', 	  # NE Wales
	'RT7', 	  # NW Wales
	'RVC', 	  # Swansea
	'RQF', 	  # Velidre, Wales
	'8EQ15',  # Drs Lab
	'MSDMDH', # Mater Dei
	'X99999', 'V81999' # unknowns
);
my $registration_lab_tests = get_registration_lab_test_requests(); # p $registration_lab_tests;

my $ngis_lab_tests = get_ngis_lab_tests();  # p $ngis_lab_tests;
#-------------------------------------------------------------------------------

my ($sql, @bind) = _get_query(); # p $sql; p @bind;
my $t0 = [gettimeofday];

my $query = $dbix->query( $sql, @bind ); runtimer('query runtime');

# get cols from query, except 'private' ones used for evaluations (eg _date_screened):
my @headers = grep $_ !~ /^_/, $query->columns; # p @headers;

my $xl = Spreadsheet::WriteExcel::Simple->new;
$xl->write_bold_row(\@headers);

runtimer('commencing data processing');
while ( my $ref = $query->hash ) {
    my $data = process_data($ref); # returns arrayref
	$xl->write_row($data);
}
runtimer('completed data processing');

$xl->save($filename);

io($log)->appendln('=' x 40 . 'LAB TESTS WITH NO NGIS ID' . '=' x 40);
io($log)->appendln($_) for sort keys %{ $STASH{no_ngis_id} };
io($log)->appendln('=' x 40 . 'DIAGNOSES WITH NO CLINICAL INDICATION' . '=' x 40);
io($log)->appendln($_) for sort keys %{ $STASH{no_clinical_indication} };
#===============================================================================
sub runtimer { say sprintf "$_[0]: %.2f sec", tv_interval $t0, [gettimeofday] }

sub process_data {
    my $ref = shift; # href

    # NGIS test code:
    $ref->{test_id} = get_ngis_test_code($ref);

    # determine date lab-test requested:
    $ref->{date_requested} = get_test_request_date($ref);

    # set sample_processed_date to date of test completion:
    $ref->{sample_processed_date} = $ref->{_date_test_completed};

    # set sample_completed_date to later of authorisation date or test
    # completion date; value will be null if request not yet authorised:
        # p $ref->{sample_processed_date}; p $ref->{_date_authorised};
    if ( $ref->{_date_authorised} ) {
        my @dates = @{$ref}{ qw/_date_authorised sample_processed_date/ };
        # dates in yyyy-mm-dd format can be handled as strings for determination
        # of chronology using List::Util::maxstr or could do $dateA gt $dateB:
        $ref->{sample_completed_date} = maxstr(@dates); # List::Util
    } # else sample_completed_date remains null

    # new template wants some cols temporarily NULL:
    my @null_fields = qw/
        testing_laboratory  patient_first_name  patient_last_name  patient_dob
        nhs_number  patient_address  patient_postcode  referrer_last_name
        referring_facility_name  unit_number  referral_reason
        time_sample_collected  infection_risk  sample_units  sample_amount
        dna_concentration  sample_tube_id  sample_rack_id
    /;
    $ref->{$_} = undef for @null_fields;

    return [ @{$ref}{@headers} ];
}

sub _get_query {
    my @cols = (
        'CONCAT_WS("/",  r.request_number, r.year - 2000)|local_sample_id',
        'NULL|local_sample_id_sub',
        'NULL|local_dna_number',
        'NULL|local_dna_number_sub',
        'p.id|internal_patient_id',
        'r.id|internal_request_id',
        q!"GeNEQ Leeds"|processing_laboratory!,
        q!"GeNEQ Leeds"|testing_laboratory!,
        'NULL|ped_number',
        'p.first_name|patient_first_name',
        'p.last_name|patient_last_name',
        'p.dob|patient_dob',
        'p.gender|patient_sex',
        'p.nhs_number',
        'pd.address|patient_address',
        'NULL|patient_district_of_residence',
        'NULL|patient_city',
        'pd.post_code|patient_postcode',
        'NULL|patient_country',
        'ref.national_code|referrer_ods_code',
        'NULL|referrer_email',
        'NULL|referrer_telephone',
        'NULL|referrer_title',
        'NULL|referrer_first_name',
        'ref.name|referrer_last_name',
        'hd.display_name|referrer_specialty',
        'rs.display_name|referring_facility_name',
        'NULL|referrer_address',
        'NULL|referrer_city',
        'NULL|referrer_district',
        'NULL|referrer_postcode',
        'NULL|referrer_county',
        'rs.organisation_code|referring_facility_ods_code',
        'NULLIF( pc.unit_number, "UNKNOWN")|unit_number', # NULL if a = b
        'NULL|other_unit_number',
        'IFNULL(rrv.clinical_details, "DiagnosticTest")|referral_reason', # a or b
        q!"Routine"|test_priority!,
        'NULL|test_id', # NGIS test id (eg M80.6)
        'lt.field_label|test_name',
        'NULL|ngis_id', # NGIS order number ?
        'NULL|date_requested', # date lab-test requested - calculated later
        'lt.test_type|request_comments', # panel or test
        q!GROUP_CONCAT(DISTINCT s.description SEPARATOR '; ')
            AS sample_type_received!,
        'rsd.specimen_date|datetime_sample_collected',  # datetime
        'r.created_at|datetime_sample_received',        # datetime
        'rsd.specimen_quality|sample_condition',
        q!MAX( CASE WHEN ao.option_name = 'doi'
            THEN 'yes' ELSE 'no' END ) AS infection_risk!,
        'NULL|sample_type_dispatched',
        'IF( rst.volume, "ul", NULL )|sample_units',
        'rst.volume|sample_amount',
        q!IF( rst.concentration < 9999, rst.concentration, NULL )
            AS dna_concentration!,
        'NULL|sample_date_sent',
        'rst.vialId|sample_tube_id',
        'sr.plateId|sample_rack_id',
        'NULL|sample_courier',
        'NULL|sample_courier_ref',
        'NULL|sample_processed_date',  # set later
        'NULL|sample_completed_date',  # set later
    # fields used for data munging only (underscored for omission in @headers):
        'lt.id|_lab_test_id',
        'd.name|_diagnosis',
        'nci.code|_clinical_indication_code',
        'rrv.diagnosis_id|_diagnosis_id',
		q!MAX( CASE WHEN rsv.action = 'screened' THEN DATE(rsv.time) END )
            AS _date_screened!,
		q!MAX( CASE WHEN rsv.action = 'reported' THEN DATE(rsv.time) END )
            AS _date_reported!,
		q!MAX( CASE WHEN rsv.action = 'authorised' THEN DATE(rsv.time) END )
            AS _date_authorised!,
        'DATE(ts.time)|_date_test_completed', # date lab-test set to complete
    );
    my @rels = (                                 'requests|r'                  ,
        q{r.patient_case_id=pc.id}           => 'patient_case|pc'              ,
        q{pc.patient_id=p.id}                => 'patients|p'                   ,
        q{pc.referral_source_id=rs.id}       => 'referral_sources|rs'          ,
        q{rs.parent_organisation_id=po.id}   => 'parent_organisations|po'      ,
        q{r.referrer_department_id=rd.id}    => 'referrer_department|rd'       ,
        q{rd.hospital_department_code=hd.id} => 'hospital_departments|hd'      ,
        q{rd.referrer_id=ref.id}		     => 'referrers|ref'                ,
        q{rsd.request_id=r.id}               => 'request_specimen_detail|rsd'  ,
        q{rsp.request_id=r.id}               => 'request_specimen|rsp'         ,
        q{rsp.specimen_id=s.id}              => 'specimens|s'                  ,
        q{ts.request_id=r.id}                => 'request_lab_test_status|ts'   ,
        q{ts.status_option_id=so.id}         => 'lab_test_status_options|so'   ,
        q{ts.lab_test_id=lt.id}              => 'lab_tests|lt'                 ,
        q{lt.lab_section_id=ls.id}           => 'lab_sections|ls'              ,
        q{rsv.request_id=r.id}               => 'request_status_view|rsv'      ,
      # left joins:
        q{=>rrv.request_id=r.id}             => 'request_report_view|rrv'      ,
        q{=>rrv.diagnosis_id=d.id}           => 'diagnoses|d'                  ,
        q{=>d.ngis_indication_id=nci.id}     => 'ngis_clinical_indications|nci',
        q{=>ro.request_id=r.id}              => 'request_option|ro'            ,
        q{=>ro.option_id=ao.id}              => 'additional_options|ao'        ,
        q{=>pd.patient_id=p.id}              => 'patient_demographics|pd'      ,
        q{=>rst.request_id=r.id}             => 'request_storage|rst'          ,
        q{=>rst.rack_id=sr.id}               => 'storage_racks|sr'             ,
        q{=>rt.request_id=r.id}              => 'request_trial|rt'             ,
    );
    my %where = (
        -and => [ # repeated elements (test_name, parent_code), requires arayref
            'rt.request_id'        => undef,      # not in clinical trial
            'so.description'       => 'complete', # lab-test status
            'MONTH(ts.time)'       => $ref_date->month,  # when test status = complete
            'YEAR(ts.time)'        => $ref_date->year,   # when test status = complete
            'ls.section_name'      => { -in => \@lab_sections },
            'rs.organisation_code' => { -not_rlike => '^NT' },    # private hospital
            'po.parent_code'       => { -not_rlike => '^(S|Z)' }, # scotland/ireland
            'po.parent_code'       => { -not_in => \@excluded_parent_codes },
            'lt.test_name'         => { -not_in => \@excluded_lab_tests },
            'lt.test_name'   => { -not_rlike => 'quantification|store_[dr]na' },
        ],
    );
    my $is_private = q!MAX( CASE WHEN ao.option_name = 'private' THEN 1 END )!;

    my @args = (
		-columns  => \@cols,
		-from     => [ -join => @rels ],
		-where    => \%where,
        -group_by => [ 'lt.id', 'r.id' ],
        -having   => { $is_private => undef }, # not private patient
        -order_by => 'r.id',
    ); # p @args;
    # need to supply own injection_guard to override built-in sql injection attack
    # prevention - detects ';' in group_concat; modified from SQL::Abstract new()
    my @args_to_new = ( injection_guard => qr/^ \s* go \s/mi );
    my ($sql, @bind) = SQL::Abstract::More->new(@args_to_new)->select(@args); # p $sql; p \@bind;
         $dbix->dump_query($sql, @bind) if $opt_q; # exit;
    return ($sql, @bind);
}

sub get_test_request_date {
    my $ref = shift; # p $ref;

    my $request_id = $ref->{internal_request_id};
    my $registered = $ref->{date_sample_received};
    my $test_name  = $ref->{test_name};
    my $screened   = $ref->{_date_screened};

    # use manual request date if exists, or auto-request date, or date of screening:
    my $manual_request_date = get_manual_test_request($request_id, $test_name);
    # auto-requested by sample-type, therefore date = registration:
    my $auto_request_date = $registration_lab_tests->{$test_name}
        ? $registered : 0; # zero OK as will be tested for truth below
        # p [$manual_request_date, $auto_request_date, $screened];

    return $manual_request_date || $auto_request_date || $screened;
}

sub get_registration_lab_test_requests {
    my @cols = ( 'DISTINCT t2.field_label', 1 );
    my @rels = ('specimen_lab_test|t1', 't1.lab_test_id=t2.id', 'lab_tests|t2');
    my @args = (
		-columns  => \@cols,
		-from     => [ -join => @rels ],
    ); # p @args;
    my ($sql, @bind) = SQL::Abstract::More->new->select(@args); # p $sql; p \@bind;
         $dbix->dump_query($sql, @bind) if $opt_q; # exit;
    return $dbix->query($sql, @bind)->map;
}

sub get_manual_test_request {
    my ($request_id, $test_name) = @_;

    my @args = (
		-columns  => [ 'DATE(time)' ],
		-from     => [ 'request_lab_test_history' ],
        -where    => {
            request_id => $request_id,
            action => {
                -rlike => '^(auto-)?requested( linked test)? ' . $test_name
            },
        },
        -order_by => [ '-time' ], # ORDER BY time DESC
        -limit    => 1,           # returns most recent only if >1
    ); # p @args;

    my ($sql, @bind) = SQL::Abstract::More->new->select(@args); # p $sql; p \@bind;
        # $dbix->dump_query($sql, @bind) if $opt_q; # exit;
    return $dbix->query($sql, @bind)->value;
}

sub get_ngis_test_code {
    my $ref = shift;

    my $lab_test_id  = $ref->{_lab_test_id};
    my $diagnosis_id = $ref->{_diagnosis_id};
    my $test_name    = $ref->{test_name};
    my $diagnosis    = $ref->{_diagnosis} || '';
    my $lab_num      = $ref->{local_sample_id};

  # get NGIS id(s) for this lab-test:
    my $ngis_test_code = $ngis_lab_tests->{$lab_test_id}; # p $ngis_ids; # aref
    if (not $ngis_test_code) {
        $STASH{no_ngis_code}{$test_name}++;
        # say "$lab_num: no NGIS id for $test_name";
        return undef;
    }
  # return NGIS test code if it's the only one:
    return $ngis_test_code->[0] if scalar @$ngis_test_code == 1;

  # now have multiple NGIS id's for lab-test
    # common message for log:
    my $msg = qq!$lab_num: multiple NGIS codes for '$test_name'!;

  # now need to examine diagnosis:
    if (not $diagnosis_id) {
        io($log)->appendln("$msg and request is not reported");
        return undef;
    }
  # get clinical indication for diagnosis (eg AML NOS = M80)
    my $indication = $ref->{_clinical_indication_code}; # p $indication;
    if (not $indication) {
        io($log)->appendln("$msg and '$diagnosis' has no clinical indication id");
        $STASH{no_clinical_indication}{$diagnosis}++;
        return undef;
    } # p $indication; p $ngis_ids;

  # return NGIS code from list if has a matching clinical indication:
    for my $ngis_code (@$ngis_test_code) {
        # say "$lab_num: $indication -> $ngis_code" if $ngis_code =~ /^$indication/;
        return $ngis_code if $ngis_code =~ /^$indication/;
    }
  # can't find an NGIS id for lab-test:
    io($log)->appendln("$msg and diagnosis indication $indication [$diagnosis] "
        . "did not match any available NGIS test codes");
    # say "no NGIS id available for $test_name";
    return undef;
}

sub get_ngis_lab_tests {
    my $q = $dbix->select('ngis_lab_test', [qw/lab_test_id ngis_test_code/]);
    my %h;
    while ( my $ref = $q->array ) { # p $ref;
        my $lab_test_id = $ref->[0];
        push @{ $h{$lab_test_id} }, $ref->[1];
    }
    return \%h;
}