#!/usr/bin/env perl
=begin -------------------------------------------------------------------------
NGIS data extraction to csv - shares code with geneq_data_extraction.pl
usage: perl $0 [-tlqr] -d <days> [-p <port>]
* --query|q = output query to stdout
* --test|t = saves file locally
* --months|m = duration (months), defaults to 1
* --reqId 10 --reqId 100 --reqId 10000, etc
TODO: what triggers extract, how often, what type of request
TODO: query assumes WGS where sample extracted in Genetics lab & processed at
Illumina - will need adapting if/when arrangement extended to non-WGS samples
=cut
#-------------------------------------------------------------------------------
my $query_output = 0; # --query|q - output sql queries to console
my $duration = 1; # --month|m - default unless specified in command-line opts
my $testing = 0; # --testing|t - saves file locally, doesn't email
my @REQUEST_IDS = (); # --request_id 1 --request_id 2, etc
my @TEST_NAMES = qw( wgs ); # tests to trigger data transfer
my @null_fields = qw(); # force fields to null if required (temporary measure)
use Getopt::Long;
GetOptions (
"months|m=i" => \$duration, # int
"testing|t" => \$testing, # flag
"query|q" => \$query_output, # flag
"reqId=i" => \@REQUEST_IDS, # int list (optional)
); # warn $duration; warn $cumulative; exit;
use strict;
use warnings;
my $JUST_TESTING = $testing || 0;
use lib '/home/raj/perl5/lib/perl5';
use FindBin qw($Bin); # warn $Bin;
use Text::CSV;
use Data::Printer;
use SQL::Abstract::More;
use lib $Bin . '/../../../lib';
use LIMS::Local::ScriptHelpers;
use LIMS::Local::NGIS;
use lib '/home/raj/perl-lib';
use Local::DB;
my $tools = LIMS::Local::ScriptHelpers->new();
$tools->test_only($JUST_TESTING);
my $config = $tools->config();
my $today = $tools->time_now;
my $dbix = Local::DB->dbix({ dbname => 'hilis4' }); # p $dbix;
$dbix->lc_columns = 0; # preserve mixed case on col names (now standardised)
# get NGIS object:
my $ngis = do {
my %args = (
dbix => $dbix, query_output => $query_output, test_mode => $JUST_TESTING
);
LIMS::Local::NGIS->new(%args);
}; # p $ngis;
$Local::DBIx::Simple::Result::NO_AUTO_DATE_INFLATION = 0; # force iso8601 datetime
#-------------------------------------------------------------------------------
my $ref_date = $today->clone->subtract( months => $duration ); # warn $ref_date;
my $filename = sprintf 'GeNEQ_HMDS_data_%s.csv', $ref_date->strftime('%b_%Y'); # warn $filename;
$ngis->set_t0(); # start timer for data processing
my $query = do {
my ($sql, @bind) = _get_main_query(); # p $sql; p @bind;
$dbix->query( $sql, @bind );
};
{ # retrieve data from query and populate $ngis->data:
my $data = $query->hashes; $ngis->runtimer('query runtime'); # p $data; exit;
# pre-process data to reconstitute (some) molecular panels and split (some)
# FISH panels, stores data inside $ngis object:
$ngis->preprocess_data($data);
}
my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1, eol => $/ });
open my $fh, ">:encoding(utf8)", "$Bin/$filename" or die $!;
# get cols from query, except 'private' ones used for evaluations (eg _datetime_screened):
my @headers = grep $_ !~ /^_/, $query->columns; # p @headers;
$csv->print($fh, \@headers);
# store in ngis object for data generation:
$ngis->col_headers(\@headers);
$ngis->null_fields(\@null_fields);
# retrieve preprocessed data from $ngis:
my @data = @{$ngis->data}; # p @data; # AoH
for my $ref (@data) { # p $ref;
my $row = process_data($ref); # returns arrayref
$csv->print($fh, $row);
}
close $fh or die $!;
$ngis->runtimer('completed data processing');
if ($JUST_TESTING) {
}
else { # transfer file using unknown mechanism
}
#===============================================================================
sub process_data {
my $ref = shift; # p $ref; # href
# DoB datetime to date:
if ( my $dob = $ref->{patient_dob} ) {
$ref->{patient_dob} = $dob->ymd;
}
$ref->{expected_sample_number} = 2; # always tumour + germline samples
$ref->{order_submitted} = 0; # will be submitted elsewhere
# primary sample: 1 = diagnostic sample, 0 = germline sample
$ref->{primary_sample} = $ref->{_specimen_code} eq 'GLS' ? 0 : 1;
# don't need 2 identical query lookups:
$ref->{sample_type_dispatched} = $ref->{sample_type_received};
# NGIS code for WGS:
$ref->{test_id} = 'XXX'; # todo - from ngis_lab_test table
# determine date lab-test requested (replaced by datetime_order_received):
# $ref->{datetime_test_requested} = get_test_request_date($ref); # will need to be revived
my $data = $ngis->finalise_data($ref); # p $data; # adds some new fields, sets others undef
return $data;
}
sub _main_query_cols {
my $local_lab_name = $ngis->local_lab_name; # p $local_lab_name;
my @cols = (
'CONCAT_WS("/", r.request_number, r.year - 2000)|local_sample_id',
'NULL|local_sample_id_sub',
'NULL|local_dna_number',
'NULL|local_dna_number_sub',
'p.id|internal_patient_id',
'r.id|internal_request_id',
qq!"$local_lab_name"|booking_laboratory!,
qq!"YNEGLH Leeds Shire"|processing_laboratory!, # for WGS
qq!"Illumina"|testing_laboratory!, # for WGS
'NULL|ped_number',
'p.first_name|patient_first_name',
'p.last_name|patient_last_name',
'p.dob|patient_dob',
'p.gender|patient_sex',
'p.nhs_number',
'pd.address|patient_address',
'NULL|patient_district_of_residence',
'NULL|patient_county',
'NULL|patient_city',
'pd.post_code|patient_postcode',
'NULL|patient_country',
'ref.national_code|referrer_unique_code',
'NULL|referrer_email',
'NULL|referrer_telephone',
'NULL|referrer_title',
q!SUBSTRING_INDEX(ref.name, ' ', -1)|referrer_first_name!,
q!LEFT(ref.name, CHAR_LENGTH(ref.name) - LOCATE(' ', REVERSE(ref.name)))
AS referrer_last_name!,
'hd.display_name|referrer_specialty', # required spelling !!
'rs.display_name|referring_facility_name',
'NULL|referrer_address',
'NULL|referrer_city',
'NULL|referrer_district',
'NULL|referrer_county',
'NULL|referrer_postcode',
'NULL|referrer_country',
'rs.organisation_code|referring_facility_ods_code',
'NULLIF( pc.unit_number, "UNKNOWN")|unit_number', # NULL if a = b
'NULL|other_unit_number',
'IFNULL(rrv.clinical_details, "DiagnosticTest")|referral_reason', # a or b
q!"Routine"|test_priority!,
'NULL|test_id', # NGIS test id
'lt.field_label|test_name',
'NULL|ngis_id', # NGIS order number
'rer.external_reference|external_sample_id',
'NULL|external_patient_id',
q!MAX( CASE WHEN lt2.test_name = 'sample_type' THEN tr.result END )
AS sample_type_received!,
'rsd.specimen_date|datetime_sample_collected', # datetime
'r.created_at|datetime_sample_received', # datetime
'r.created_at|datetime_order_received', # datetime
'NULL|datetime_processing_complete', # set later
'rsd.specimen_quality|sample_condition', # not available at upload
q!MAX( CASE WHEN ao.option_name = 'doi'
THEN 'yes' ELSE 'no' END ) AS infection_risk!,
'NULL|sample_transfer_request_comments',
'NULL|sample_type_dispatched', # same as sample_type_received
'IF( rst.volume, "ul", NULL )|sample_units',
'rst.volume|sample_amount',
q!IF( rst.concentration < 9999, rst.concentration, NULL )
AS dna_concentration!,
'NULL|sample_date_sent',
'rst.vialId|sample_tube_id',
'sr.plateId|sample_rack_id',
'NULL|sample_courier',
'NULL|sample_courier_ref',
'NULL|expected_sample_number',
'NULL|order_submitted',
'NULL|primary_sample',
# fields used for data processing only (underscored for omission in @headers):
'lt.test_name|_test_name',
'ls.section_name|_section_name',
's.specimen_code|_specimen_code',
# probably not needed ?
# 'lt.id|_lab_test_id',
# 'lt.test_type|_test_type',
# 's2.description|_presentation',
# 'ts.time|_datetime_test_completed', # date lab-test set to complete
# 'd.name|_diagnosis',
# 'nci.code|_clinical_indication_code',
# 'rrv.diagnosis_id|_diagnosis_id',
# q!MAX( CASE WHEN rsv.action = 'authorised' THEN rsv.time END )
# AS _datetime_authorised!,
);
return wantarray ? @cols : \@cols;
}
sub _get_main_query {
my $is_private = q!MAX( CASE WHEN ao.option_name = 'private' THEN 1 END )!;
my $begin_date = $ref_date->clone->set_day(1); # first day of $ref_date month
my %date_restriction = (
-between => [ # first & last day of ref_date month:
$begin_date->ymd, # 1st day
$begin_date->add(months => 1)->subtract(days => 1)->ymd, # last day
]
);
my $excluded_parents = $ngis->excluded_parent_codes;
my $lab_sections = $ngis->included_lab_sections;
# my $excluded_lab_test_ids = $ngis->excluded_lab_test_ids;
# my $excluded_test_name_re = $ngis->excluded_test_names;
my $cols = _main_query_cols(); # these keep changing
my @rels = ( 'requests|r' ,
q{r.patient_case_id=pc.id} => 'patient_case|pc' ,
q{pc.patient_id=p.id} => 'patients|p' ,
q{pc.referral_source_id=rs.id} => 'referral_sources|rs' ,
q{rs.parent_organisation_id=po.id} => 'parent_organisations|po' ,
q{r.referrer_department_id=rd.id} => 'referrer_department|rd' ,
q{rd.hospital_department_code=hd.id} => 'hospital_departments|hd' ,
q{rd.referrer_id=ref.id} => 'referrers|ref' ,
q{ts.request_id=r.id} => 'request_lab_test_status|ts' ,
q{ts.status_option_id=so.id} => 'lab_test_status_options|so' ,
q{ts.lab_test_id=lt.id} => 'lab_tests|lt' ,
q{lt.lab_section_id=ls.id} => 'lab_sections|ls' ,
q{rsv.request_id=r.id} => 'request_status_view|rsv' ,
q{ris.request_id=r.id} => 'request_initial_screen|ris' ,
q{ris.screen_id=s2.id} => 'screens|s2' ,
q{tr.request_id=r.id} => 'request_lab_test_results|tr' ,
q{tr.lab_test_id=lt2.id} => 'lab_tests|lt2' ,
q{rsp.request_id=r.id} => 'request_specimen|rsp' ,
q{rsp.specimen_id=s.id} => 'specimens|s' ,
# left joins:
q{=>ro.request_id=r.id} => 'request_option|ro' ,
q{=>ro.option_id=ao.id} => 'additional_options|ao' ,
q{=>pd.patient_id=p.id} => 'patient_demographics|pd' ,
q{=>rt.request_id=r.id} => 'request_trial|rt' ,
q{=>rrv.request_id=r.id} => 'request_report_view|rrv' ,
q{=>rst.request_id=r.id} => 'request_storage|rst' ,
q{=>rst.rack_id=sr.id} => 'storage_racks|sr' ,
q{=>rsd.request_id=r.id} => 'request_specimen_detail|rsd' ,
q{=>rer.request_id=r.id} => 'request_external_ref|rer' ,
);
=begin # probably not required
q{=>nd.ngis_indication_id=nci.id} => 'ngis_clinical_indications|nci',
q{=>rrv.diagnosis_id=d.id} => 'diagnoses|d' ,
q{=>nd.diagnosis_id=d.id} => 'ngis_diagnosis|nd' ,
q{=>ro.request_id=r.id} => 'request_option|ro' ,
q{=>ro.option_id=ao.id} => 'additional_options|ao' ,
q{=>pd.patient_id=p.id} => 'patient_demographics|pd' ,
q{=>rt.request_id=r.id} => 'request_trial|rt' ,
=cut
my @where = ( # contains repeated elements (eg parent_code), so requires array
'rt.request_id' => undef, # not in clinical trial
'ls.section_name' => { -in => $lab_sections }, # deadly slow query without this
'lt.test_name' => { -in => \@TEST_NAMES },
'so.description' => { '!=' => 'complete' }, # lab-test status
'rs.organisation_code' => { -not_rlike => '^NT' }, # private hospital
'po.parent_code' => { -not_rlike => '^(S|Z)' }, # scotland/ireland
'po.parent_code' => { -not_in => $excluded_parents },
# not required here ?
# 'lt.id' => { -not_in => $excluded_lab_test_ids },
# 'lt.test_name' => { -not_rlike => $excluded_test_name_re },
);
# restrict on request id's if supplied, or date:
if (@REQUEST_IDS) {
push @where, ( 'r.id' => { -in => \@REQUEST_IDS } );
}
else {
push @where, ( 'DATE(ts.time)' => \%date_restriction );
} # p @where;
my @args = (
-columns => $cols,
-from => [ -join => @rels ],
# contains repeated elements (eg parent_code), so requires "-and => aref"
-where => { -and => \@where },
-group_by => [ 'lt.id', 'r.id' ],
-having => { $is_private => undef }, # not private patient
-order_by => 'r.id',
# -limit => 5,
# -offset => 350,
); # p @args;
# need to supply our own injection_guard to override built-in sql injection
# attack prevention which detects ';' in GROUP_CONCAT, but we don't need one
# here as we have no user input:
my $sqla = SQL::Abstract::More->new( injection_guard => qr/^$/ );
my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind;
$dbix->dump_query($sql, @bind) if $query_output; # exit;
return ($sql, @bind);
}