RSS Git Download  Clone
Raw Blame History
#!/usr/bin/env perl

=begin -------------------------------------------------------------------------
NGIS data extraction to csv - shares code with geneq_data_extraction.pl
usage: perl $0 [-tq]
* --query|q = output query to stdout
* --test|t = saves file locally
* --reqId 10 --reqId 100 --reqId 10000, etc

query assumes WGS where sample extracted in Genetics lab & processed at Illumina
    will need adapting if/when arrangement extended to non-WGS samples
TODO: test name = WGS Germline and Tumour, test_id depends on phenotype (AML/ALL)
TODO: what triggers extract, how often, what type of request
TODO: germline sample - either use a new sample code or have sample_type_dispatched
    entries which are not shared with diagnostic sample (skin, etc)
TODO: datetime_processing_complete is doing db query for dna extraction date in
    NGIS::get_processing_date - what result does csv file expect ?
=cut
#-------------------------------------------------------------------------------

my $query_output = 0;  # --query|q - output sql queries to console
my $testing      = 0;  # --testing|t - saves file locally, doesn't email
my @REQUEST_IDS  = (); # --request_id 1 --request_id 2, etc

my @TEST_NAMES   = qw( wgs_referral ); # tests to trigger data transfer
my @null_fields  = qw(); # force fields to null if required (temporary measure)

use Getopt::Long;
GetOptions (
    "testing|t"  => \$testing,        # flag
    "query|q"    => \$query_output,   # flag
    "reqId=i"    => \@REQUEST_IDS,    # int list (optional)
);

use strict;
use warnings;

my $JUST_TESTING = $testing || 0;

use lib '/home/raj/perl5/lib/perl5';
use FindBin qw($Bin); # warn $Bin;

use Text::CSV;
use Data::Printer;
use SQL::Abstract::More;

use lib $Bin . '/../../../lib';
use LIMS::Local::ScriptHelpers;
use LIMS::Local::NGIS;

use lib '/home/raj/perl-lib';
use Local::DB;

my $tools = LIMS::Local::ScriptHelpers->new();
$tools->test_only($JUST_TESTING);

my $config = $tools->config();
my $today  = $tools->time_now;
my $dbix   = Local::DB->dbix({ dbname => 'hilis4' }); # p $dbix;
$dbix->lc_columns = 0; # preserve mixed case on col names (now standardised)

# get NGIS object:
my $ngis = do {
    my %args = (
        dbix => $dbix, query_output => $query_output, test_mode => $JUST_TESTING
    );
    LIMS::Local::NGIS->new(%args);
}; # p $ngis;
$Local::DBIx::Simple::Result::NO_AUTO_DATE_INFLATION = 0; # force iso8601 datetime

#-------------------------------------------------------------------------------
my $filename = sprintf 'YNEGLH_HMDS_%s_dispatch.csv',
    $today->strftime('%Y%m%d_%H%M%S'); # warn $filename;
# $filename = 'temp.csv'; # for testing

$ngis->set_t0(); # start timer for data processing

# need to supply our own injection_guard to override built-in sql injection
# attack prevention which detects ';' in GROUP_CONCAT, but we don't need one
# here as we have no user input:
my $sqla = SQL::Abstract::More->new( injection_guard => qr/^$/ );

my $query = do { # _get_main_query returns false if no request_id's to process
    my ($sql, @bind) = _get_main_query(\@REQUEST_IDS); # p $sql; p @bind;
    exit unless $sql;
    $dbix->query( $sql, @bind );
};
# get cols from query, except 'private' ones used for evaluations (eg _datetime_screened):
my @headers = grep $_ !~ /^_/, $query->columns; # p @headers;

# retrieve data from query:
my $data = $query->hashes; $ngis->runtimer('query runtime'); # p $data;

# pre-process data to reconstitute (some) molecular panels and split (some)
# FISH panels, stores data inside $ngis object:
$ngis->preprocess_data($data);
# store col_headers & null_fields in ngis object for use in process_data():
$ngis->col_headers(\@headers);
$ngis->null_fields(\@null_fields);

open my $fh, ">:encoding(utf8)", "$Bin/$filename" or die $!;
my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1, eol => $/ });
$csv->print($fh, \@headers);

# retrieve pre-processed data from $ngis:
my @ngis_data = @{$ngis->data}; # p @ngis_data; # AoH
for my $ref (@ngis_data) { # p $ref;
    my $row = process_data($ref); # p $row; # arrayref
    $csv->print($fh, $row);
}
close $fh or die $!;
$ngis->runtimer('completed data processing');

if ($JUST_TESTING) {

}
else { # transfer file using unknown mechanism

}

#===============================================================================
sub process_data {
    my $ref = shift; # p $ref; # href

    # DoB datetime to date:
    if ( my $dob = $ref->{patient_dob} ) {
        $ref->{patient_dob} = $dob->ymd;
    }

    $ref->{expected_sample_number} = 2; # always tumour + germline samples
    $ref->{order_submitted} = 0; # will be submitted elsewhere
    # primary sample; 0 = germline sample, 1 = diagnostic sample:
    # TODO: or could use sample_type_dispatched if opts not shared with diagnostic sample
    $ref->{primary_sample} = $ref->{_specimen_type} =~ /germline/ ? 0 : 1;
    # assume received & dispatched are the same, avoid 2 identical query lookups:
    $ref->{sample_type_received} = $ref->{sample_type_dispatched};
    # NGIS code for WGS (based on flow phenotype - AML/ALL):
    $ref->{test_id} = 'M80.1'; # TODO: AML code but needs to be dynamic

    $ref->{sample_date_sent} = $today->ymd; # assumes daily procedure

    # determine date lab-test requested (replaced by datetime_order_received):
    # $ref->{datetime_test_requested} = get_test_request_date($ref); # will need to be revived

    my $data = $ngis->finalise_data($ref); # p $data; # adds some new fields, sets others undef
    return $data;
}

sub _get_main_query { # p $_[0]; # optional aref of request_id's
    # get request_id's for main query or return undef, triggers script exit:
    my @request_ids = @{ $_[0] } || do {
        my ($sql, @bind) = _get_requests(); # p $sql; p @bind;
        $dbix->query( $sql, @bind )->array;
    } or return undef; # p @request_ids;

    my $cols = _main_query_cols(); # these keep changing
    my @rels = (                                'requests|r'                   ,
        q{r.patient_case_id=pc.id}           => 'patient_case|pc'              ,
        q{pc.patient_id=p.id}                => 'patients|p'                   ,
        q{pc.referral_source_id=rs.id}       => 'referral_sources|rs'          ,
        q{rs.parent_organisation_id=po.id}   => 'parent_organisations|po'      ,
        q{r.referrer_department_id=rd.id}    => 'referrer_department|rd'       ,
        q{rd.hospital_department_code=hd.id} => 'hospital_departments|hd'      ,
        q{rd.referrer_id=ref.id}		     => 'referrers|ref'                ,
        q{ts.request_id=r.id}                => 'request_lab_test_status|ts'   ,
        q{ts.status_option_id=so.id}         => 'lab_test_status_options|so'   ,
        q{ts.lab_test_id=lt.id}              => 'lab_tests|lt'                 ,
        q{lt.lab_section_id=ls.id}           => 'lab_sections|ls'              ,
        q{tr.request_id=r.id}                => 'request_lab_test_results|tr'  ,
        q{tr.lab_test_id=lt2.id}             => 'lab_tests|lt2'                ,
        q{rsp.request_id=r.id}               => 'request_specimen|rsp'         ,
        q{rsp.specimen_id=s.id}              => 'specimens|s'                  ,
      # left joins:
        q{=>pd.patient_id=p.id}              => 'patient_demographics|pd'      ,
        q{=>rrv.request_id=r.id}             => 'request_report_view|rrv'      ,
        q{=>rst.request_id=r.id}             => 'request_storage|rst'          ,
        q{=>rst.rack_id=sr.id}               => 'storage_racks|sr'             ,
        q{=>rsd.request_id=r.id}             => 'request_specimen_detail|rsd'  ,
        q{=>rer.request_id=r.id}             => 'request_external_ref|rer'     ,
        q{=>ro.request_id=r.id}              => 'request_option|ro'            ,
        q{=>ro.option_id=ao.id}              => 'additional_options|ao'        ,
    );

    my %where = (
        'lt.test_name' => { -in => \@TEST_NAMES },
        'r.id'         => { -in => \@request_ids },
    );
    my @args = (
		-columns  => $cols,
		-from     => [ -join => @rels ],
		-where    => \%where,
        -group_by => 'r.id', # for group_concat on specimen
        -order_by => 'r.id',
    #    -limit => 5,
    #    -offset => 350,
    ); # p @args;
    my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind;
        $dbix->dump_query($sql, @bind) if $query_output; # exit;
    return ($sql, @bind);
}

=begin # get data as single query - very slow
sub _get_main_query {
    my $is_private = q!MAX( CASE WHEN ao.option_name = 'private' THEN 1 END )!;
    my $begin_date = $ref_date->clone->set_day(1); # first day of $ref_date month

    my %date_restriction = (
        -between => [ # first & last day of ref_date month:
            $begin_date->ymd, # 1st day
            $begin_date->add(months => 1)->subtract(days => 1)->ymd, # last day
        ]
    );
    my $excluded_parents = $ngis->excluded_parent_codes;
    my $lab_sections     = $ngis->included_lab_sections;
    # my $excluded_lab_test_ids = $ngis->excluded_lab_test_ids;
    # my $excluded_test_name_re = $ngis->excluded_test_names;

    my $cols = _main_query_cols(); # these keep changing
    my @rels = (                                'requests|r'                   ,
        q{r.patient_case_id=pc.id}           => 'patient_case|pc'              ,
        q{pc.patient_id=p.id}                => 'patients|p'                   ,
        q{pc.referral_source_id=rs.id}       => 'referral_sources|rs'          ,
        q{rs.parent_organisation_id=po.id}   => 'parent_organisations|po'      ,
        q{r.referrer_department_id=rd.id}    => 'referrer_department|rd'       ,
        q{rd.hospital_department_code=hd.id} => 'hospital_departments|hd'      ,
        q{rd.referrer_id=ref.id}		     => 'referrers|ref'                ,
        q{ts.request_id=r.id}                => 'request_lab_test_status|ts'   ,
        q{ts.status_option_id=so.id}         => 'lab_test_status_options|so'   ,
        q{ts.lab_test_id=lt.id}              => 'lab_tests|lt'                 ,
        q{lt.lab_section_id=ls.id}           => 'lab_sections|ls'              ,
        q{rsv.request_id=r.id}               => 'request_status_view|rsv'      ,
        q{ris.request_id=r.id}               => 'request_initial_screen|ris'   ,
        q{ris.screen_id=s2.id}               => 'screens|s2'                   ,
        q{tr.request_id=r.id}                => 'request_lab_test_results|tr'  ,
        q{tr.lab_test_id=lt2.id}             => 'lab_tests|lt2'                ,
        q{rsp.request_id=r.id}               => 'request_specimen|rsp'         ,
        q{rsp.specimen_id=s.id}              => 'specimens|s'                  ,
      # left joins:
        q{=>ro.request_id=r.id}              => 'request_option|ro'            ,
        q{=>ro.option_id=ao.id}              => 'additional_options|ao'        ,
        q{=>pd.patient_id=p.id}              => 'patient_demographics|pd'      ,
        q{=>rt.request_id=r.id}              => 'request_trial|rt'             ,
        q{=>rrv.request_id=r.id}             => 'request_report_view|rrv'      ,
        q{=>rst.request_id=r.id}             => 'request_storage|rst'          ,
        q{=>rst.rack_id=sr.id}               => 'storage_racks|sr'             ,
        q{=>rsd.request_id=r.id}             => 'request_specimen_detail|rsd'  ,
        q{=>rer.request_id=r.id}             => 'request_external_ref|rer'     ,
    );
    # probably not required
       # q{=>nd.ngis_indication_id=nci.id}    => 'ngis_clinical_indications|nci',
       # q{=>rrv.diagnosis_id=d.id}           => 'diagnoses|d'                  ,
       # q{=>nd.diagnosis_id=d.id}            => 'ngis_diagnosis|nd'            ,
       # q{=>ro.request_id=r.id}              => 'request_option|ro'            ,
       # q{=>ro.option_id=ao.id}              => 'additional_options|ao'        ,
       # q{=>pd.patient_id=p.id}              => 'patient_demographics|pd'      ,
       # q{=>rt.request_id=r.id}              => 'request_trial|rt'             ,

    my @where = ( # contains repeated elements (eg parent_code), so requires array
        'rt.request_id'        => undef,      # not in clinical trial
        'ls.section_name'      => { -in => $lab_sections }, # deadly slow query without this
        'lt.test_name'         => { -in => \@TEST_NAMES },
        'so.description'       => { '!=' => 'complete' },     # lab-test status
        'rs.organisation_code' => { -not_rlike => '^NT' },    # private hospital
        'po.parent_code'       => { -not_rlike => '^(S|Z)' }, # scotland/ireland
        'po.parent_code'       => { -not_in => $excluded_parents },
        # not required here ?
        # 'lt.id'              => { -not_in    => $excluded_lab_test_ids },
        # 'lt.test_name'       => { -not_rlike => $excluded_test_name_re },
    );
    # restrict on request id's if supplied, or date:
    if (@REQUEST_IDS) {
        push @where, ( 'r.id' => { -in => \@REQUEST_IDS } );
    }
    else {
        push @where, ( 'DATE(ts.time)' => \%date_restriction );
    } # p @where;

    my @args = (
		-columns  => $cols,
		-from     => [ -join => @rels ],
        # contains repeated elements (eg parent_code), so requires "-and => aref"
		-where    => { -and => \@where },
        -group_by => [ 'lt.id', 'r.id' ],
        -having   => { $is_private => undef }, # not private patient
        -order_by => 'r.id',
    #    -limit => 5,
    #    -offset => 350,
    ); # p @args;
    # need to supply our own injection_guard to override built-in sql injection
    # attack prevention which detects ';' in GROUP_CONCAT, but we don't need one
    # here as we have no user input:
    my $sqla = SQL::Abstract::More->new( injection_guard => qr/^$/ );
    my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind;
         $dbix->dump_query($sql, @bind) if $query_output; # exit;
    return ($sql, @bind);
}
=cut

sub _get_requests {
    my $is_private = q!MAX( CASE WHEN ao.option_name = 'private' THEN 1 END )!;

    my $excluded_parents = $ngis->excluded_parent_codes;

    my $cols = 'r.id';
    my @rels = (                                'requests|r'                   ,
        q{r.patient_case_id=pc.id}           => 'patient_case|pc'              ,
        q{pc.referral_source_id=rs.id}       => 'referral_sources|rs'          ,
        q{rs.parent_organisation_id=po.id}   => 'parent_organisations|po'      ,
        q{ts.request_id=r.id}                => 'request_lab_test_status|ts'   ,
        q{ts.status_option_id=so.id}         => 'lab_test_status_options|so'   ,
        q{ts.lab_test_id=lt.id}              => 'lab_tests|lt'                 ,
      # left joins:
        q{=>ro.request_id=r.id}              => 'request_option|ro'            ,
        q{=>ro.option_id=ao.id}              => 'additional_options|ao'        ,
        q{=>rt.request_id=r.id}              => 'request_trial|rt'             ,
     );

    my @where = ( # contains repeated elements (eg parent_code), so requires array
        'rt.request_id'        => undef,      # not in clinical trial
        'lt.test_name'         => { -in => \@TEST_NAMES },
        'so.description'       => { '!=' => 'complete' },     # lab-test status
        'rs.organisation_code' => { -not_rlike => '^NT' },    # private hospital
        'po.parent_code'       => { -not_rlike => '^(S|Z)' }, # scotland/ireland
        'po.parent_code'       => { -not_in => $excluded_parents },
    );
    my @args = (
		-columns  => $cols,
		-from     => [ -join => @rels ],
        # contains repeated elements (eg parent_code), so requires "-and => aref"
		-where    => { -and => \@where },
        -group_by => 'r.id', # required for 'having'
        -having   => { $is_private => undef }, # not private patient
    #    -limit => 5,
    #    -offset => 350,
    ); # p @args;
    my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind;
        $dbix->dump_query($sql, @bind) if $query_output; # exit;
    return ($sql, @bind);
}

sub _main_query_cols {
    my $local_lab_name = $ngis->local_lab_name; # p $local_lab_name;

    my @cols = (
        'CONCAT_WS("/",  r.request_number, r.year - 2000)|local_sample_id',
        'NULL|local_sample_id_sub',
        'NULL|local_dna_number',
        'NULL|local_dna_number_sub',
        'p.id|internal_patient_id',
        'r.id|internal_request_id',
        qq!"$local_lab_name"|booking_laboratory!,
        qq!"YNEGLH Leeds Shire"|processing_laboratory!, # for WGS
        qq!"Illumina"|testing_laboratory!,              # for WGS
        'NULL|ped_number',
        'p.first_name|patient_first_name',
        'p.last_name|patient_last_name',
        'p.dob|patient_dob',
        'p.gender|patient_sex',
        'p.nhs_number',
        'pd.address|patient_address',
        'NULL|patient_district_of_residence',
        'NULL|patient_county',
        'NULL|patient_city',
        'pd.post_code|patient_postcode',
        'NULL|patient_country',
        'ref.national_code|referrer_unique_code',
        'NULL|referrer_email',
        'NULL|referrer_telephone',
        'NULL|referrer_title',
        q!SUBSTRING_INDEX(ref.name, ' ', -1)|referrer_first_name!,
        q!LEFT(ref.name, CHAR_LENGTH(ref.name) - LOCATE(' ', REVERSE(ref.name)))
            AS referrer_last_name!,
        'hd.display_name|referrer_specialty',   # required spelling !!
        'rs.display_name|referring_facility_name',
        'NULL|referrer_address',
        'NULL|referrer_city',
        'NULL|referrer_district',
        'NULL|referrer_county',
        'NULL|referrer_postcode',
        'NULL|referrer_country',
        'rs.organisation_code|referring_facility_ods_code',
        'NULLIF( pc.unit_number, "UNKNOWN")|unit_number', # NULL if a = b
        'NULL|other_unit_number',
        'IFNULL(rrv.clinical_details, "DiagnosticTest")|referral_reason', # a or b
        q!"Routine"|test_priority!,
        'NULL|test_id', # NGIS test id, will depend on diagnosis (AML vs ALL)
        'lt.field_label|test_name', # needs to match test directory
        'NULL|ngis_id', # NGIS order number
        'rer.external_reference|external_sample_id',
        'NULL|external_patient_id',
        'NULL|sample_type_received', # set later, same as sample_type_dispatched
        'rsd.specimen_date|datetime_sample_collected',  # datetime
        'r.created_at|datetime_sample_received',        # datetime
        'r.created_at|datetime_order_received',         # datetime
        'NULL|datetime_processing_complete',            # set later
        'rsd.specimen_quality|sample_condition',        # not available at upload
        q!MAX( CASE WHEN ao.option_name = 'doi'
            THEN 'yes' ELSE 'no' END ) AS infection_risk!,
        'NULL|sample_transfer_request_comments',
        q!MAX( CASE WHEN lt2.test_name = 'sample_type' THEN tr.result END )
            AS sample_type_dispatched!,
        # WGS samples are whole (BM or PB) so will never have DNA vol, conc, etc
        'IF( rst.volume, "ul", NULL )|sample_units',
        'rst.volume|sample_amount',
        q!IF( rst.concentration < 9999, rst.concentration, NULL )
            AS dna_concentration!,
        'NULL|sample_date_sent',
        'rst.vialId|sample_tube_id',
        'sr.plateId|sample_rack_id',
        'NULL|sample_courier',
        'NULL|sample_courier_ref',
        'NULL|expected_sample_number',
        'NULL|order_submitted',
        'NULL|primary_sample',

    # fields used for data processing only (underscored for omission in @headers):
        'lt.test_name|_test_name',
        'ls.section_name|_section_name',
        # to test for germline sample, superfluous if using sample_type_dispatched:
        'GROUP_CONCAT(DISTINCT(s.description))|_specimen_type',
    );
    return wantarray ? @cols : \@cols;
}