#!/usr/bin/perl

=begin
--------------------------------------------------------------------------------
generates NCRS XML data file for all authorised requests with diagnosis either
ICDO3 '/3', or 'see comments' with a non-null comment section, where authorisation
date or diagnosis revision date during previous 1 month

SCP file transfer

manual command-line:
    ncrs_cosd.pl [duration in months - optional (defaults to 1)]
    scp /tmp/<filename> hmds@163.160.247.17:<filename>

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!! for new server: need to manually ssh into server once to add key to known_hosts
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

online reference:
ICDO3 topographical codes: http://codes.iarc.fr/topography
ICDO3 morphological codes: http://codes.iarc.fr/codegroup/2

ICD-O-3 to ICD-10 conversion for hemato-oncological diseases:
<http://www.krebsdaten.de/Krebs/DE/Content/Publikationen/Poster/Downloads/2014/
hemato-oncological_diseases_classification_ottawa_engl.pdf?__blob=publicationFile>

All leukemias have a site-specific code of bone marrow (C42.1) except myeloid sarcoma
Multiple myeloma  code site to C42.1
RULE D. Topography codes for lymphomas: If a lymphoma involves multiple lymph node regions,
code to C77.8 (lymph nodes of multiple regions). Code extranodal lymphomas to the site of
origin, which may not be the site of the biopsy. If no site is indicated for a lymphoma,
code to C77.9 (lymph node, NOS). C80.9 if suspected to be extranodal but site is not stated.

--------------------------------------------------------------------------------
=cut

my $JUST_TESTING = 1; # skips file transfer & file archive

use lib '/home/raj/perl5/lib/perl5';
use XML::Simple qw(:strict); # need to specify ForceArray & KeyAttr if using :strict here
use Data::Printer;
use Config::Auto;
use Modern::Perl;
use DateTime; # last_day_of_month()
use IO::All;

use FindBin qw($Bin); # warn $Bin;
use lib $Bin . '/../../../lib';
use LIMS::Local::ScriptHelpers;

#############################################
my $delimiter = q{|};                       #
my $duration  = $ARGV[0] || 1; # months ago #
my $new_line_marker = '[NEW_LINE]';         #
#############################################

my $cfg_file = "$Bin/../lib/settings.cfg";
my $settings = Config::Auto::parse($cfg_file); # p $settings;

# get tools from LIMS::Local::ScriptHelpers:
my $tools = LIMS::Local::ScriptHelpers->new();

my $sql_lib = $tools->sql_lib();
my $config  = $tools->config();
my $dbix    = $tools->dbix();

# dates ========================================================================
# get date last month - handles BST:
my $ref_datetime = $tools->date_subtract(months => $duration);
my $timestamp    = $tools->time_now->datetime;

my $start_date = $ref_datetime->strftime('%Y-%m-01');
my $end_date   = DateTime->last_day_of_month(
    year  => $ref_datetime->year,
    month => $ref_datetime->month,
)->ymd; # p $start_date; p $end_date;
# ==============================================================================

my $data_filename = sprintf 'hmds_%s_%02d.xml',
    $ref_datetime->year, $ref_datetime->month; # p $data_filename;
# $data_filename = 'hmds_test_cosd.xml';

use constant HMDS_ORG_CODE => 'RR813'; # OrgCodePathReport & OrgCodeSubmitter
use constant TMP_DIR => '/tmp'; # where to create temp data file

my @data = ();
{ # go:
    # get requests 1-to-1 data:
    my @requests = do {
        my $query = $sql_lib->retr('ncrs_data'); # p $query;
        my @bind = ($start_date, $end_date, $start_date, $end_date); # p @bind;
        $dbix->query($query, @bind)->hashes;
    }; # p @requests;

    my @request_ids = map $_->{request_id}, @requests; # p @request_ids;
    # get data maps of vertical tables - much faster than tying into main query:
    my $data_map = get_data_map(\@request_ids); # p $data_map;

    for my $r (@requests) { # p $r; next;
        my $request_id = $r->{request_id};
        # merge vertical table data with $r:
        my $vars = combine_data($r, $data_map->{$request_id}); # p $r;
        push @data, $vars;
    } # p @data;
}

{ # create output file & push to remote server:
    my $file_path = join '/', TMP_DIR, $data_filename;

    my $content = to_xml(\@data); # p $content;
    $content > io($file_path);

    # transfer file:
    unless ($JUST_TESTING) {
        my %args = ( local_filename => $file_path );
        transfer_file(\%args);
    }
}

#-------------------------------------------------------------------------------
sub to_xml {
    my $data = shift; # arrayref of request datasets

    my $count = @$data;  p $count;

    # options for XMLout (needs array or hash):
    my %xs_opts = xs_options(); # p %xs_opts;
    my $xs = XML::Simple->new(%xs_opts);

    my $ref = {
        Id => {
            root => '18DA78B8-44A4-4AA7-95B0-87C198E6B0AE', # where does this come from ?
        },
        OrgCodeSubmitter => {
            extension => HMDS_ORG_CODE,
        },
        RecordCount => {
            value => $count,
        },
        ReportingPeriodStartDate => $start_date,
        ReportingPeriodEndDate   => $end_date,
        FileCreationDateTime     => $timestamp,
    };

    push @{ $ref->{COSDRecord} }, format_cosd_unit($_) for @$data; # p $ref;

    # enclose xml in outer <add> block; add ulisa 'version' inline:
    # my $input = { add => $vars, version => 2 };
   	my $xml_out = $xs->XMLout($ref); # p $xml_out;
    return $xml_out;
}

#-------------------------------------------------------------------------------
# formats repeating COSDRecord blocks:
sub format_cosd_unit {
    my $v = shift;

    my %COSDRecord = (
        Id => {
            root => 'B63AFD92-568E-4845-A0CE-2643FE15DE08', # where does this come from ?
        },
    );
    my %HAEM;
    { # COSDRecord/Haematology/HaematologyCore:
        my %h = (
            HaematologyCoreLinkagePatientId => {
                NHSNumber => {
                    extension => $v->{nhs_number},
                },
                NHSNumberStatusIndicator => {
                    code => '02', # Number present but not traced
                },
                LocalPatientId => $v->{unit_number},
                Birthdate => $v->{dob},
                OrgCodeOfProvider => {
                    extension => $v->{organisation_code},
                },
            },
            HaematologyCoreLinkageDiagnosis => {
                InvestigationResultDate => $v->{authorisation_date},
                ServiceReportId => $v->{labno},
            },
            HaematologyCoreDemographics => {
                PersonFamilyName => {
                    family => $v->{last_name},
                },
                PersonGivenName => {
                    given => given_name($v),
                },
                Address => {
                    UnstructuredAddress => {
                        streetAddressLine => $v->{address},
                    },
                },
                Postcode => {
                    postalCode => $v->{post_code},
                },
                Gender => {
                    code => gender_code($v->{gender}),
                },
            },
            HaematologyCorePathology => {
                PathologistConsultantCode => { extension => $v->{reporter_code} },
                PathTestReqCareProfCode   => { extension => $v->{referrer_code} },
                SampleCollectionDate      => { extension => $v->{sample_date}   },
                OrgCodePathReport         => { extension => HMDS_ORG_CODE },
                ServiceReportStatus       => { code => 1 }, # final (complete)

                PathTestReqSiteCode => { extension => $v->{organisation_code} },
                PathologyReportText => free_text_report($v),
                SampleReceiptDate   => $v->{request_date},
                MorphologySNOMED    => icdo3_to_snomed($v->{icdo3}), # ICDO3

                # useless for haematology - refers to solid tumours:
                # 1 = primary, 4 = region LN, 5 = metastatic, 9 = not known
                SpecimenNature => { code => 9 },

            # One of TOPOGRAPHY (SNOMED), TOPOGRAPHY (SNOMED CT) or PRIMARY
            # DIAGNOSIS (ICD PATHOLOGICAL) is required for the schema. It is
            # expected that most submissions will include TOPOGRAPHY (SNOMED).

            # PRIMARY DIAGNOSIS (ICD PATHOLOGICAL) is the PRIMARY DIAGNOSIS
            # based on the evidence from a pathological examination. Format
            # CXX.X or DXX.X
                # PrimaryDiagnosisICDPath => sample_to_snomed($v),

            # topography code (primary site of origin):
                TopographySNOMED => topography_code($v),
            },
        );
        $HAEM{HaematologyCore} = \%h;
    }
    { # COSDRecord/Haematology/HeamatologyContent/LaboratoryResults:
        my %h = (
            HighThroughputSequencing => $v->{high_throughput_sequencing},
            Immunocytochemistry      => $v->{immunocytochemistry},
            FlowCytometry            => $v->{flow_cytometry},
            Cytogenetics             => $v->{cytogenetics},
            MicroArray               => $v->{micro_array},
            Molecular                => $v->{molecular},
            FISH                     => $v->{fish},
        );
        $HAEM{HaematologyContent}{LaboratoryResults} = \%h;
    }
    $COSDRecord{Haematology} = \%HAEM;
    return \%COSDRecord;
}

#-------------------------------------------------------------------------------
# free-text specimen/biopsy_site/morphology/comment, etc:
sub free_text_report {
    my $vars = shift; # p $vars;

    my @rows;
    push @rows, ( 'SPECIMEN(S): ' . $vars->{specimen_decode} );
    if ( my $biopsy_site = $vars->{biopsy_site} ) {
        push @rows, ( 'ANATOMICAL SITE(S): ' . $biopsy_site );
    }
    push @rows, ( 'SPECIMEN QUALITY: ' . $vars->{specimen_quality} );
    if ( my $gross_desc = $vars->{gross_description} ) {
        push @rows, ( 'GROSS DESCRIPTION: ' . $gross_desc );
    }
    if ( my $morphology = $vars->{morphology} ) {
        push @rows, ( 'MORPHOLOGY: ' . $morphology );
    }
    if ( my $comment = $vars->{comment} ) {
        push @rows, ( 'COMMENT: ' . $comment );
    }
    push @rows, ( 'DIAGNOSIS: ' . $vars->{diagnosis} );
    push @rows, ( 'REPORTED BY: ' . $vars->{reporter_name} );
    return join "\n", @rows;
}

#-------------------------------------------------------------------------------
# merge vertical table data with $vars
sub combine_data {
    my ($vars, $supplimentary_data) = @_; # p $supplimentary_data;

    { # add data-map fields:
        # result summaries:
        map $vars->{$_} = $supplimentary_data->{result_summary}->{$_},
            qw( immunocytochemistry flow_cytometry fish cytogenetics molecular
                micro_array high_throughput_sequencing );
        # reporter:
        map $vars->{$_} = $supplimentary_data->{reporter_data}->{$_},
            qw(reporter_name reporter_code);
        # specimen code(s) & decode(s):
        $vars->{specimen_code}
            = $supplimentary_data->{specimen}->{specimen_code};
        $vars->{specimen_decode} # not needed ?
            = $supplimentary_data->{specimen}->{description};
    } # p $vars;

    # tidy up (trim content & substitute new-line markers):
    map { $vars->{$_} = tidy($vars->{$_}) }
        grep $vars->{$_},
            qw(comment immunocytochemistry flow_cytometry fish gross_description
                micro_array high_throughput_sequencing cytogenetics molecular);

    $vars->{gender} ||= 'U'; # p $vars;
    return $vars;
}

#-------------------------------------------------------------------------------
sub topography_code {
    my $vars = shift;

    return 'C80.9' unless $vars->{icdo3};

    my ($icdo3) = $vars->{icdo3} =~ /^(\d{4})/; # capture 1st 4 digits;

    # plasmacytomas? CLL? EMZL? - check the MZL & MYD88-mutated entries!!

    my %topo_code = (
        8720 => 'C44._', # metastatic melanoma
        9689 => 'C42.2', # splenic MZL
        9699 => 'C80.9', # extranodal MZL (Rule D)
        9700 => 'C44._', # MF
        9709 => 'C44._', # cuteneous TCL
        9718 => 'C44._', # primary cuteneous CD30+ LPD
        # 9731 => 'C40._', # plasmacytoma - only of bone
        9732 => 'C42.1', # myeloma
        9733 => 'C42.1', # plasma-cell leukaemia
    );
    return $topo_code{$icdo3} if $topo_code{$icdo3};

    # primary non-haematological tumour/metastatic carcinoma:
    return 'C80.9' if in_num($icdo3, 8000, 8010);
    # rhabdomyosarcoma/pnet/neuroblastoma:
    return 'C80.9' if in_num($icdo3, 8900, 9260, 9500);
    # mastocytosis, histiocytosis, langerhans:
    return 'C80.9' if between($icdo3, 9741 => 9755);
    # lymphomatoid granulomatosis, amyloidosis:
    return 'C80.9' if between($icdo3, 9766 => 9769);
    # Rule E: all leukaemias (9801/3 .. 9989/3):
    return 'C42.1' if between($icdo3, 9801 => 9990);
    # Rule D: default for lymphomas (9591/3 .. 9739/3)
    return 'C77._' if between($icdo3, 9591 => 9739);
    # escaped all above - debug:
    p $vars->{diagnosis};
}

#-------------------------------------------------------------------------------
sub between { # adapated from Acme::Tools
  my ($test, $lower, $upper) = @_;

  return $lower < $upper
    ? $test >= $lower && $test <= $upper
    : $test >= $upper && $test <= $lower;
}

#-------------------------------------------------------------------------------
sub in_num { # lifted from Acme::Tools
  my $val = shift || return 0; # shift 1st val, remaining in @_
  for (@_) { return 1 if $_ == $val }
  return 0;
}

#-------------------------------------------------------------------------------
sub icdo3_to_snomed { # eg 9960/3 -> M99603
    local $_ = shift || return undef; # p $icdo3;
    m!(\d{4})/(\d)!;
    return 'M' . $1 . $2;
}

#-------------------------------------------------------------------------------
sub gender_code {
    my $gender = shift;
    return 1 if $gender eq 'M';
    return 2 if $gender eq 'F';
    return 9; # x = not known, 9 = not specified
}

#-------------------------------------------------------------------------------
# first + optional middle names
sub given_name {
    my $vars = shift;
    return join ' ', grep $_, @{$vars}{ qw/first_name middle_name/ };
}

#-------------------------------------------------------------------------------
sub xs_options {
    my %opts = (
        SuppressEmpty => 0, # ? only works for xml_in
        RootName      => 'COSD',
        XMLDecl       => q!<?xml version="1.0" encoding="UTF-8"?>!,
        KeyAttr       => [],
        NoAttr        => 1, # gives inline (scalar) AND nested (arrayref) attributes
    );
    return %opts;
}

#-------------------------------------------------------------------------------
sub transfer_file { # only called if not $JUST_TESTING
    my $args = shift;

    my $cfg = $settings->{nycris_server};

    my %params = (
        local_filename => $args->{local_filename},
        server_addr    => $cfg->{remote_address},
        username       => $cfg->{username},
        password       => $cfg->{password},
    ); # p %params;

    # scp file (returns str on failure, undef on success):
    my $rtn = $tools->scp_file(\%params); # p $rtn;

    my $script = $tools->script_filename;

    if ($rtn) {
        $tools->mail_admin({ script => $0, msg => $rtn });
        warn "$script - $rtn\n"; # dump to logs but don't die!!
    }
    else {
        archive_and_delete_file($args->{local_filename});

        my $month = $ref_datetime->month_name; # for cron log:
        print "$script reports data for $month succesfully uploaded\n";
    }
}

#-------------------------------------------------------------------------------
# archive data file (only called if not $JUST_TESTING):
sub archive_and_delete_file {
    my $src_file = shift; # p $src_file;

	my $path_to_archive = $settings->{nycris_server}->{path_to_archive};

    # make sure it exists otherwise get error trying to tar non-existant file
    if (-e $src_file ) { # warn 'it exists';
        my $tar_file = sprintf '%s/%s.tar',
            $path_to_archive, $data_filename; # p $tar_file;

		chdir TMP_DIR; # so we can use relative path for source directory

		system( sprintf 'tar -cf %s %s', $tar_file, $data_filename ); # using relative $data_file.* not full path
		system( sprintf "gzip $tar_file" ); # compress tar file
        # delete source file:
        io($src_file)->unlink;
    }
}

#-------------------------------------------------------------------------------
sub get_user_code {
    my $username = shift;

    return $settings->{national_codes}->{$username}
    || 'H9999998'; # OTHER HEALTH CARE PROFESSIONAL
}

#-------------------------------------------------------------------------------
sub get_data_map {
    my $request_ids = shift;

    my $map = {};

    { # request_specimens:
        my $result = do {
            my $sql = $sql_lib->retr('nycris_request_specimens'); # p $sql;
            $dbix->query($sql, @$request_ids);
        };
        while ( my $vars = $result->hash ) { # p $vars;
            my $request_id = $vars->{request_id};

            my %data = (
                specimen_code => $vars->{code},
                description   => $vars->{description},
            ); # p %data;
            $map->{$request_id}->{specimen} = \%data;
        }
    }
    { # reporter data:
        my $result = do {
            my $sql = $sql_lib->retr('nycris_reporters'); # p $sql;
            $dbix->query($sql, @$request_ids);
        };
        while ( my $vars = $result->hash ) { # p $vars;
            my $request_id = $vars->{request_id};

            my $reporter = join ' ',
                ucfirst $vars->{first_name},
                ucfirst $vars->{last_name};

            my $reporter_code = get_user_code($vars->{username});

            my %data = (
                reporter_name => $reporter,
                reporter_code => $reporter_code,
            );
            $map->{$request_id}->{reporter_data} = \%data;
        }
    }
    { # result summaries:
        my $sql = $sql_lib->retr('nycris_result_summaries');
        my $result = $dbix->query($sql, @$request_ids);

        while ( my $vars = $result->hash ) { # p $vars;
            my $request_id  = $vars->{request_id};
            my $section     = $vars->{section_name};
            my $result      = $vars->{results_summary};

            $section =~ s/ /_/g; # make section suitable for use as hash key
            $map->{$request_id}->{result_summary}->{lc $section} = $result;
        } # p $map->{result_summary};
    } # p $map;

    return $map;
}

#-------------------------------------------------------------------------------
sub tidy {
    my $str = shift;

    # trim:
    $str =~ s/\A\s+//; # leading
    $str =~ s/\s+\Z//; # trailing

#    $str =~ s/(\r\n)/$new_line_marker/g; # doesn't capture just \n
    $str =~ s/\r?\n/$new_line_marker/g; # captures both \r\n & \n

    return $str;
}

#-------------------------------------------------------------------------------
=begin # all done in 1 query now
sub get_request_ids {
    my @request_ids = (); # return [ 264710, 204466 ];
    { # new diagnoses:
        my $sql = $sql_lib->retr('ncrs_new_diagnosis_request_ids');
        my $request_ids = $dbix->query($sql, $duration, $duration)->flat;
        push @request_ids, @$request_ids; # p $request_ids;
    }
    { # get revised diagnoses:
        my $sql = $sql_lib->retr('ncrs_revised_diagnosis_request_ids');
        my $request_ids = $dbix->query($sql, $duration, $duration)->flat;
        push @request_ids, @$request_ids; # p $request_ids;
    }
    # combine new & revised diagnosis request ID's; eliminate duplicates:
    my %request_ids = map { $_ => 1 } @request_ids; # p %request_ids;
    # return unique ID's as arrayref:
    return [ keys %request_ids ];
}
=cut

=begin # not mapping samples to snomed
# decode for biopsy_site info:
my $anatomical_site_map =
    $dbix->select('anatomical_sites', [qw(site_name snomed)])->map;
    # p $anatomical_site_map;

sub sample_to_snomed {
    my $vars = shift;

    my @snomed;
    # get info from biopsy_site if available, otherwise specimen code(s):
    if ( my $biopsy_site = $vars->{biopsy_site} ) { # optional
        my @sites = split '; ', $biopsy_site; # maybe multiple
        push @snomed, $anatomical_site_map->{$_} for @sites;
    }
    else {
        my @specimens = split ',', $vars->{specimen_code};
        push @snomed, $specimen_topography{$_} for @specimens;
    } # p @snomed;

    # legacy biopsy site that doesn't match anatomical_sites.site_name:
    return { code => 'C80.9' } unless @snomed; # site unknown

    return scalar @snomed == 1
        ? { code => $snomed[0] }
        : [ map { code => $_ }, @snomed ];
}

# for non-histology specimens (sample_code NOT RLIKE '[DGLRX]([BS]L|F|U)'):
# NB these are ICDO3 topography codes for site of tumour origin
my %specimen_topography = (
    BMA     => 'C42.1',
    BMAT    => 'C42.1',
    CF      => '',
    CHIA    => 'C42.0', # should never issue new diagnosis on CHIA
    CHIB    => 'C42.0', # should never issue new diagnosis on CHIB
    CHIM    => 'C42.0', # should never issue new diagnosis on CHIM
    CMP     => '',      # should never issue new diagnosis on CMP
    EF      => '',
#   HS      => '', # require biopsy_site now
#   LA      => 'C77.9', # require biopsy_site now
    PB      => 'C42.0',
    SE      => '', # never issue new diagnosis on SE
    TBL     => 'C42.1',
    TBP     => 'C42.1',
    TSL     => 'C42.1',
    URI     => '', # never issue new diagnosis on URI
#   XA      => '', # require biopsy_site now
    RXPB    => 'C42.0', # should never issue new diagnosis on RXPB
    TSM     => '', # never issue new diagnosis on TSM
);
=cut
