#!/usr/bin/env perl

=begin -------------------------------------------------------------------------
NGIS data extraction; check ngis_* tables up-to-date

runs at 6am on 1st & 3rd Friday of month via cron:
0 6 1-7 * * [ "$(date '+\%a')" = "Fri" ] && $MULTI/$0 ...
0 6 15-21 * * [ "$(date '+\%a')" = "Fri" ] && $MULTI/$0 ...

manual run: $0 options:
    -m = months
    -t = testing: saves files locally, doesn't email
    -q = query output
    --cumulative = all requests since 1st of month of ref date
    --console = verbose log to console
    --verbose = verbose log to file (overrides console)
    --reqId 10 --reqId 100 --reqId 10000, etc

maps hilis4 lab tests to an NGIS test code using hilis4.ngis_lab_test table:
if lab_test_id entry(s) exist in ngis_lab_test table:
* if single row then use ngis_test_code value (diagnosis doesn't matter so request
    doesn't need to be reported)
* if lab_test has multiple ngis_test_codes, the diagnosis clinical indication code
    (ngis_diagnosis.ngis_indication_id) is used to determine which (if any)
    ngis_test_code to use, where the diagnosis clinical indication must match
    one of the available ngis test id's eg HTS myeloid test has matching codes
    for a diagnosis of MDS, MPN, etc but not for CML
* if NGIS test code cannot be found on diagnosis, a loose matching on presentation
    is attempted, including previous screen terms used, eg 'MDS' matches
    'Suspected MDS', 'Follow-up MDS'
* last resort is to force NGIS test code onto some test names (eg HTS myeloid)

* recipient does not want technology (FISH, PCR, HTS, etc) so ETV6/RUNX1 (FISH)
    & ETV6-RUNX1 (PCR) potentially ambiguous

* molecular tests are reassembled into panels (if panel has NGIS code)
* FISH panels without NGIS codes are expanded to individual tests

TODO: for molecular panels when 1 or more tests set complete in different months,
    panel not recognised and separate test codes returned eg 20536/19

complications (see geneq_data_extract.log):
* hilis4 lab-tests/panels without ngis test id's - require default codes
* lab tests/panels without a clinical indication mapped to diagnosis or screen
* diagnostic terms without clinical indication (eg 'see comments', 'lab tests only', etc)
* requests not reported when data collected
=cut
#-------------------------------------------------------------------------------

#===============================================================================
my @data_recipients = qw( paul.mcintosh john.fraser raj );
my @log_recipients  = qw( turner talley raj );
#===============================================================================

# new template wants some cols temporarily NULL (commented out in query now):
my @null_fields = qw//;

my $query_output = 0; # --query|q - output sql queries to console
my $log_console  = 0; # --console - verbose log to console
my $verbose_log  = 0; # --verbose - verbose log to file (overrides --console)
my $cumulative   = 0; # --cumulative - all requests since 1st of month of ref_date
my $duration     = 1; # --month|m - default unless specified in command-line opts
my $testing      = 0; # --testing|t - saves file locally, doesn't email
my @REQUEST_IDS  = (); # --request_id 1 --request_id 2, etc

use Getopt::Long;
GetOptions (
    "months|m=i" => \$duration,       # int
    "testing|t"  => \$testing,        # flag
    "query|q"    => \$query_output,   # flag
    "console"    => \$log_console,    # flag
    "verbose"    => \$verbose_log,    # flag
    "cumulative" => \$cumulative,     # flag
    "reqId=i"    => \@REQUEST_IDS,    # int list (optional)
); # warn $duration; warn $cumulative; exit;

use strict;
use warnings;

my $JUST_TESTING = $testing || 0;

use lib '/home/raj/perl5/lib/perl5';
use FindBin qw($Bin); # warn $Bin;

use Data::Printer;
use Log::Any qw($log);
use Log::Any::Adapter;
use SQL::Abstract::More;
use List::Util qw(maxstr);
use Spreadsheet::WriteExcel::Simple;

use lib $Bin . '/../../../lib';
use LIMS::Local::ScriptHelpers;
use LIMS::Local::NGIS;

use lib '/home/raj/perl-lib';
use Local::WriteExcel;
use Local::DB;

my $tools = LIMS::Local::ScriptHelpers->new();
$tools->test_only($JUST_TESTING);

my $config = $tools->config();
my $today  = $tools->time_now;
my $dbix   = Local::DB->dbix({ dbname => 'hilis4' });
$dbix->lc_columns = 0; # preserve mixed case on col names (now standardised)
# get NGIS object:
my $ngis = do {
    my %args = (
        dbix => $dbix, query_output => $query_output, test_mode => $JUST_TESTING
    );
    LIMS::Local::NGIS->new(%args);
}; # p $ngis;
$Local::DBIx::Simple::Result::NO_AUTO_DATE_INFLATION = 1; # or will defeat DATE(col)

#-------------------------------------------------------------------------------
my $ref_date = $today->clone->subtract( months => $duration ); # warn $ref_date;
my $filename = sprintf 'GeNEQ_HMDS_data_%s.xls', $ref_date->strftime('%b_%Y'); # warn $filename;
my $subject  = sprintf 'GeNEQ HMDS data %s', $ref_date->strftime('%b %Y');

my $logfile  = sprintf '%s/geneq_data_extract_%s.log',
    $Bin . '/../../../logs/geneq', lc $ref_date->strftime('%b_%Y'); # warn $logfile;

# for verbose logging to console (can be set or reset later):
Log::Any::Adapter->set('Stdout') if $log_console;
# for verbose logging to logfile (can be set or reset later):
Log::Any::Adapter->set(File => $logfile) if $verbose_log;

$ngis->set_t0(); # start timer for data processing

my $query = do {
    my ($sql, @bind) = _get_main_query(); # p $sql; p @bind;
    $dbix->query( $sql, @bind );
};
{ # retrieve data from query and populate $ngis->data:
    my $data = $query->hashes; $ngis->runtimer('query runtime'); # p $data; exit;
    # pre-process data to reconstitute (some) molecular panels and split (some)
    # FISH panels, stores data inside $ngis object:
    $ngis->preprocess_data($data);
}

my $xl = Spreadsheet::WriteExcel::Simple->new;
my @headers = grep $_ !~ /^_/, $query->columns; # p @headers;
$xl->write_bold_row(\@headers);

# store in ngis object for data generation:
$ngis->col_headers(\@headers);
$ngis->null_fields(\@null_fields);

# retrieve preprocessed data from $ngis:
my @data = @{$ngis->data}; # p @data; # AoH
for my $ref (@data) { # p $ref;
    my $row = process_data($ref); # returns arrayref
	$xl->write_row($row);
}
$ngis->runtimer('completed data processing');

if ($JUST_TESTING) {
    $xl->save(join '/', $Bin, $filename);
}
else {
    my %mail = (
        attachment => $xl->data,
        filename   => $filename,
        subject    => $subject,
        config     => $config,
    );
    $tools->send_mail(\%mail, \@data_recipients);
}

# log all misfits:
_do_log();

#===============================================================================

sub process_data {
    my $ref = shift; # p $ref; # href

    # determine date lab-test requested (replaced by datetime_order_received):
    # $ref->{datetime_test_requested} = get_test_request_date($ref); # will need to be revived

    # set datetime_testing_complete to test completion datetime:
    $ref->{datetime_testing_complete} = $ref->{_datetime_test_completed};

    { # set datetime_reporting_complete to later of testing_complete or authorisation:
        if ( $ref->{_datetime_authorised} ) {
            my @dates = @{$ref}{ qw/_datetime_authorised _datetime_test_completed/ };
            # dates in yyyy-mm-dd format can be handled as strings for determination
            # of chronology using List::Util::maxstr or could do $dateA gt $dateB:
            $ref->{datetime_reporting_complete} = maxstr(@dates); # List::Util
        }
        else { # not authorised, set to _datetime_test_completed:
            $ref->{datetime_reporting_complete} = $ref->{_datetime_test_completed}
        }
    }
=begin # datetime_processing_complete now set to D/RNA extraction date
    # set datetime_processing_complete to later of authorisation date or test
    # completion date; value will be null if request not yet authorised:
        # p $ref->{sample_processed_date}; p $ref->{_datetime_authorised};
    if ( $ref->{_datetime_authorised} ) {
        my @dates = @{$ref}{ qw/_datetime_authorised datetime_testing_complete/ };
        # dates in yyyy-mm-dd format can be handled as strings for determination
        # of chronology using List::Util::maxstr or could do $dateA gt $dateB:
        $ref->{datetime_processing_complete} = maxstr(@dates); # List::Util
    } # else sample_completed_date remains null
=cut

    my $data = $ngis->finalise_data($ref); # adds some new fields, sets others undef
    return $data;
}

sub _main_query_cols {
    my $local_lab_name = $ngis->local_lab_name; # p $local_lab_name;

    my @cols = (
        'CONCAT_WS("/",  r.request_number, r.year - 2000)|local_sample_id',
#        'NULL|local_sample_id_sub',
#        'NULL|local_dna_number',
#        'NULL|local_dna_number_sub',
        'p.id|internal_patient_id',
        'r.id|internal_request_id',
        qq!"$local_lab_name"|booking_laboratory!,
        qq!"$local_lab_name"|processing_laboratory!,
        qq!"$local_lab_name"|testing_laboratory!,
        qq!"$local_lab_name"|reporting_laboratory!,
#        'NULL|ped_number',
#        'p.first_name|patient_first_name',
#        'p.last_name|patient_last_name',
#        'p.dob|patient_dob',
        'p.gender|patient_sex',
#        'p.nhs_number',
#        'pd.address|patient_address',
#        'NULL|patient_district_of_residence',
        'NULL|patient_city',
#        'pd.post_code|patient_postcode',
#        'NULL|patient_country',
        'ref.national_code|referrer_ods_code',
#        'NULL|referrer_email',
#        'NULL|referrer_telephone',
#        'NULL|referrer_title',
#        'NULL|referrer_first_name',
#        'ref.name|referrer_last_name',
        'hd.display_name|referrer_specialty',   # required spelling !!
#        'rs.display_name|referring_facility_name',
#        'NULL|referrer_address',
#        'NULL|referrer_city',
#        'NULL|referrer_district',
#        'NULL|referrer_postcode',
#        'NULL|referrer_county',
        'rs.organisation_code|referring_facility_ods_code',
#        'NULLIF( pc.unit_number, "UNKNOWN")|unit_number', # NULL if a = b
#        'NULL|other_unit_number',
#        'IFNULL(rrv.clinical_details, "DiagnosticTest")|referral_reason', # a or b
        q!"Routine"|test_priority!,
        'NULL|test_id', # NGIS test id (eg M80.6)
        'lt.field_label|test_name',
#        'ls.section_name|lab_section',
        'NULL|ngis_id', # NGIS order number ?
#        'NULL|datetime_test_requested', # replaced by datetime_order_received
#        'lt.test_type|request_comments', # panel or test
        q!GROUP_CONCAT(DISTINCT s.description SEPARATOR '; ')
            AS sample_type_received!,
        'rsd.specimen_date|datetime_sample_collected',  # datetime
        'r.created_at|datetime_sample_received',        # datetime
        'r.created_at|datetime_order_received',         # datetime
        'NULL|datetime_processing_complete',            # set later
        'NULL|datetime_testing_complete',               # set later
        'NULL|datetime_reporting_complete',             # set later
        'rsd.specimen_quality|sample_condition',
#        q!MAX( CASE WHEN ao.option_name = 'doi'
#            THEN 'yes' ELSE 'no' END ) AS infection_risk!,
#        'NULL|sample_type_dispatched',
#        'IF( rst.volume, "ul", NULL )|sample_units',
#        'rst.volume|sample_amount',
#        q!IF( rst.concentration < 9999, rst.concentration, NULL )
#            AS dna_concentration!,
#        'NULL|sample_date_sent',
#        'rst.vialId|sample_tube_id',
#        'sr.plateId|sample_rack_id',
#        'NULL|sample_courier',
#        'NULL|sample_courier_ref',

#        'NULL|sample_processed_date',  # renamed datetime_processing_complete
#        'NULL|sample_completed_date',  # renamed datetime_testing_complete

    # fields used for data processing only (underscored for omission in @headers):
        'lt.id|_lab_test_id',
        'lt.test_type|_test_type',
        'lt.test_name|_test_name',
        'ls.section_name|_section_name',
        'd.name|_diagnosis',
        'nci.code|_clinical_indication_code',
        's2.description|_presentation',
        'rrv.diagnosis_id|_diagnosis_id',
		# q!MAX( CASE WHEN rsv.action = 'screened' THEN rsv.time END )
        #    AS _datetime_screened!, # not used
		# q!MAX( CASE WHEN rsv.action = 'reported' THEN rsv.time END )
        #    AS _datetime_reported!, # not used
		q!MAX( CASE WHEN rsv.action = 'authorised' THEN rsv.time END )
            AS _datetime_authorised!,
        'ts.time|_datetime_test_completed', # date lab-test set to complete
    );
    return wantarray ? @cols : \@cols;
}

sub _get_main_query {
    my $is_private = q!MAX( CASE WHEN ao.option_name = 'private' THEN 1 END )!;
    my $begin_date = $ref_date->clone->set_day(1); # first day of $ref_date month

    my %date_restriction = $cumulative # if --cumulative Getopt:
        ? ( '>=', $begin_date->ymd ) # all since ref_date
        : ( -between => [ # first & last day of ref_date month:
                $begin_date->ymd, # 1st day
                $begin_date->add(months => 1)->subtract(days => 1)->ymd, # last day
            ] );
    my $excluded_lab_test_ids = $ngis->excluded_lab_test_ids;
    my $excluded_parent_codes = $ngis->excluded_parent_codes;
    my $excluded_test_name_re = $ngis->excluded_test_names;
    my $lab_sections          = $ngis->included_lab_sections;

    my $cols = _main_query_cols(); # these keep changing
    my @rels = (                                'requests|r'                   ,
        q{r.patient_case_id=pc.id}           => 'patient_case|pc'              ,
        q{pc.patient_id=p.id}                => 'patients|p'                   ,
        q{pc.referral_source_id=rs.id}       => 'referral_sources|rs'          ,
        q{rs.parent_organisation_id=po.id}   => 'parent_organisations|po'      ,
        q{r.referrer_department_id=rd.id}    => 'referrer_department|rd'       ,
        q{rd.hospital_department_code=hd.id} => 'hospital_departments|hd'      ,
        q{rd.referrer_id=ref.id}		     => 'referrers|ref'                ,
        q{rsd.request_id=r.id}               => 'request_specimen_detail|rsd'  ,
        q{rsp.request_id=r.id}               => 'request_specimen|rsp'         ,
        q{rsp.specimen_id=s.id}              => 'specimens|s'                  ,
        q{ts.request_id=r.id}                => 'request_lab_test_status|ts'   ,
        q{ts.status_option_id=so.id}         => 'lab_test_status_options|so'   ,
        q{ts.lab_test_id=lt.id}              => 'lab_tests|lt'                 ,
        q{lt.lab_section_id=ls.id}           => 'lab_sections|ls'              ,
        q{rsv.request_id=r.id}               => 'request_status_view|rsv'      ,
        q{ris.request_id=r.id}               => 'request_initial_screen|ris'   ,
        q{ris.screen_id=s2.id}               => 'screens|s2'                   ,
      # left joins:
        q{=>rrv.request_id=r.id}             => 'request_report_view|rrv'      ,
        q{=>rrv.diagnosis_id=d.id}           => 'diagnoses|d'                  ,
        q{=>nd.diagnosis_id=d.id}            => 'ngis_diagnosis|nd'            ,
        q{=>nd.ngis_indication_id=nci.id}    => 'ngis_clinical_indications|nci',
        q{=>ro.request_id=r.id}              => 'request_option|ro'            ,
        q{=>ro.option_id=ao.id}              => 'additional_options|ao'        ,
        q{=>pd.patient_id=p.id}              => 'patient_demographics|pd'      ,
        q{=>rst.request_id=r.id}             => 'request_storage|rst'          ,
        q{=>rst.rack_id=sr.id}               => 'storage_racks|sr'             ,
        q{=>rt.request_id=r.id}              => 'request_trial|rt'             ,
    );
    my @where = ( # contains repeated elements (eg parent_code), so requires array
        'rt.request_id'        => undef,      # not in clinical trial
        'so.description'       => 'complete', # lab-test status
        'ls.section_name'      => { -in => $lab_sections },
        'rs.organisation_code' => { -not_rlike => '^NT' },    # private hospital
        'po.parent_code'       => { -not_rlike => '^(S|Z)' }, # scotland/ireland
        'po.parent_code'       => { -not_in => $excluded_parent_codes },
        'lt.test_name'         => { -not_rlike => $excluded_test_name_re },
        'lt.id'                => { -not_in    => $excluded_lab_test_ids },
    );
    # restrict on request id's if supplied, or date:
    if (@REQUEST_IDS) {
        push @where, ( 'r.id' => { -in => \@REQUEST_IDS } );
    }
    else {
        push @where, ( 'DATE(ts.time)' => \%date_restriction );
    } # p @where;

    my @args = (
		-columns  => $cols,
		-from     => [ -join => @rels ],
        # contains repeated elements (eg parent_code), so requires "-and => aref"
		-where    => { -and => \@where },
        -group_by => [ 'lt.id', 'r.id' ],
        -having   => { $is_private => undef }, # not private patient
        -order_by => 'r.id',
        #-limit => 100,
        #-offset => 100,
    ); # p @args;
    # need to supply our own injection_guard to override built-in sql injection
    # attack prevention which detects ';' in GROUP_CONCAT, but we don't need one
    # here as we have no user input:
    my $sqla = SQL::Abstract::More->new( injection_guard => qr/^$/ );
    my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind;
         $dbix->dump_query($sql, @bind) if $query_output; # exit;
    return ($sql, @bind);
}

sub _do_log {
    my $stash = $ngis->log_stash; # p $stash;

    my $no_ngis_code = $stash->{no_ngis_code};
    my $unreported   = $stash->{unreported};
    my $no_ngis_code_requests  = $stash->{no_ngis_code_requests};
    my $no_clinical_indication = $stash->{no_clinical_indication};
    my $no_available_ngis_code = $stash->{no_available_ngis_code};

    my $filename = sprintf '%s/geneq_data_extract_log_%s.xlsx',
        $Bin . '/../../../logs/geneq', lc $ref_date->strftime('%b_%Y');
    my $xl = Local::WriteExcel->new( filename => $filename ); # p $xl;

    if ($no_clinical_indication) {
        $xl->worksheet_name('no clinical indication');
        my @cols = qw( lab_num test_name presentation diagnosis );
        $xl->write_bold_row(\@cols);
        $xl->write_row($_) for @$no_clinical_indication;
    }
    if ($no_available_ngis_code) {
        $xl->worksheet_name('no available ngis test-code');
        my @cols = qw( lab_num test_name ngis_codes diagnosis
            clinical_indication presentation );
        $xl->write_bold_row(\@cols);
        $xl->write_row($_) for @$no_available_ngis_code;
    }
    if ($no_ngis_code_requests) { # p $no_ngis_code_requests;
        $xl->worksheet_name('no ngis test-code');
        $xl->write_bold_row([ qw/lab_num test_name presentation diagnosis/ ] );
        $xl->write_row($_) for @$no_ngis_code_requests;
    }
    if ($no_ngis_code) { # p $no_ngis_code;
        $xl->worksheet_name('no ngis test-code summary');
        $xl->write_bold_row([ qw/test_name test_type lab_section count/ ] );
        $xl->write_row([ (split ' :: ', $_ ), $no_ngis_code->{$_} ])
            for sort keys %{ $no_ngis_code };
    }
    if ($unreported) {
        $xl->worksheet_name('unreported');
        $xl->write_bold_row([ qw/lab_num test_name presentation/] );
        $xl->write_row($_) for @$unreported;
    }

    if ($JUST_TESTING) {
        $xl->save();
    }
    else {
        my $subject = 'GeNEQ data extraction log '
            . $ref_date->strftime('%b %Y'); # p $subject;
        # capture filename from its full path:
        $filename =~ /(geneq_data_extract_log_.*)/; # p $1;

        my %mail = (
            attachment => $xl->data,
            filename   => $1, # captured from above
            subject    => $subject,
            config     => $config,
        );
        $tools->send_mail(\%mail, \@log_recipients);
    }
   { # text file:
        # log summary data to $logfile, override any previous adapter settings
        Log::Any::Adapter->set(File => $logfile);

        if ($no_clinical_indication) {
          $log->info('DIAGNOSIS & PRESENTATION WITH NO CLINICAL INDICATION');
          $log->info(join ' :: ', @$_)
            for sort test_name_order @$no_clinical_indication;
        }
        if ($no_available_ngis_code) { # p $no_available_ngis_code;
          $log->info('DIAGNOSIS WITH NO AVAILABLE NGIS CODE & PRESENTATION '
             . 'WITH NO CLINICAL INDICATION');
          $log->info(join ' :: ', @$_)
            for sort test_name_order @$no_available_ngis_code;
        }
        if ($unreported) {
          $log->info('UNREPORTED REQUESTS');
          $log->info(join ' :: ', @$_) for @$unreported;
        }
        if ($no_ngis_code) {
          $log->info('LAB TESTS WITH NO NGIS ID');
          $log->info(join ':: ', $_, $no_ngis_code->{$_})
            for sort keys %{ $no_ngis_code };
        }
    }

}

sub test_name_order { # p $a; p $b;
    my ($req_num_a, $yr_a) = split '/', $a->[0];
    my ($req_num_b, $yr_b) = split '/', $b->[0];

    my $test_name_a = $a->[1];
    my $test_name_b = $b->[1];

    return $test_name_a cmp $test_name_b
        || $yr_a <=> $yr_b || $req_num_a <=> $req_num_b;
}

sub _test_name_order { # p $a; p $b;
    my ($req_num_a, $yr_a, $test_name_a) = $a =~ m!^(\d+)/(\d{2})\: (.*) \[!; # p $test_name_a;
    my ($req_num_b, $yr_b, $test_name_b) = $b =~ m!^(\d+)/(\d{2})\: (.*) \[!; # p $test_name_b;

    return $test_name_a cmp $test_name_b
        || $yr_a <=> $yr_b || $req_num_a <=> $req_num_b;
}
