P3X GitList

#!/usr/bin/perl

=begin
--------------------------------------------------------------------------------
generates NCRS XML data feed for new and revised diagnoses [or all icdo3 '/3' reports
authorised or revised - change nycris to ncrs in libray.sql] during previous 1 month

SCP file transfer

manual command-line:
    ncrs_cosd.pl [duration in months - optional (defaults to 1)]
    scp /tmp/<filename> hmds@163.160.247.17:<filename>

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!! for new server: need to manually ssh into server once to add key to known_hosts
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

online reference:
ICDO3 topographical codes: http://codes.iarc.fr/topography
ICDO3 morphological codes: http://codes.iarc.fr/codegroup/2
--------------------------------------------------------------------------------
=cut

my $JUST_TESTING = 1; # skips file transfer & file archive

use lib '/home/raj/perl5/lib/perl5';
use XML::Simple qw(:strict); # need to specify ForceArray & KeyAttr if using :strict here
use Data::Printer;
use Config::Auto;
use Data::Dumper;
use Modern::Perl;
use DateTime; # for last_day_of_month()
use IO::All;

use FindBin qw($Bin); # warn $Bin;
use lib $Bin . '/../../../lib';
use LIMS::Local::ScriptHelpers;

#############################################
my $delimiter = q{|};                       #
my $duration  = $ARGV[0] || 1; # months ago #
my $new_line_marker = '[NEW_LINE]';         #
#############################################

my $cfg_file = "$Bin/../lib/settings.cfg";
my $settings = Config::Auto::parse($cfg_file); # p $settings;

# get tools from LIMS::Local::ScriptHelpers:
my $tools = LIMS::Local::ScriptHelpers->new();

my $sql_lib = $tools->sql_lib();
my $config  = $tools->config();
my $dbix    = $tools->dbix();

# dates ========================================================================
# get date last month - handles BST:
my $ref_datetime = $tools->date_subtract(months => $duration);
my $timestamp    = $tools->time_now->datetime;

my $start_date = $ref_datetime->strftime('%Y-%m-01');
my $end_date   = DateTime->last_day_of_month(
    year  => $ref_datetime->year,
    month => $ref_datetime->month,
)->ymd; # p $start_date; p $end_date;
# ==============================================================================

my $data_filename = sprintf 'hmds_%s_%02d.txt',
    $ref_datetime->year, $ref_datetime->month; # p $data_filename;
# $data_filename = 'hmds_test_cosd.xml';

use constant HMDS_ORG_CODE => 'RR813'; # OrgCodePathReport & OrgCodeSubmitter
use constant TMP_DIR => '/tmp'; # where to create temp data file

# decode for biopsy_site info:
my $anatomical_site_map =
    $dbix->select('anatomical_sites', [qw(site_name snomed)])->map;
    # p $anatomical_site_map;

# for non-histology specimens (sample_code NOT RLIKE '[DGLRX]([BS]L|F|U)'):
# NB these are ICDO3 topography codes for site of tumour origin
my %specimen_topography = (
    BMA     => 'C42.1',
    BMAT    => 'C42.1',
    CF      => '',
    CHIA    => 'C42.0', # should never issue new diagnosis on CHIA
    CHIB    => 'C42.0', # should never issue new diagnosis on CHIB
    CHIM    => 'C42.0', # should never issue new diagnosis on CHIM
    CMP     => '',      # should never issue new diagnosis on CMP
    EF      => '',
#   HS      => '', # require biopsy_site now
#   LA      => 'C77.9', # require biopsy_site now
    PB      => 'C42.0',
    SE      => '', # never issue new diagnosis on SE
    TBL     => 'C42.1',
    TBP     => 'C42.1',
    TSL     => 'C42.1',
    URI     => '', # never issue new diagnosis on URI
#   XA      => '', # require biopsy_site now
    RXPB    => 'C42.0', # should never issue new diagnosis on RXPB
    TSM     => '', # never issue new diagnosis on TSM
);

my @data = ();
{ # go:
    # get request ID's for horizontal (1-to-1) & vertical (1-to-many) tables:
    my $request_ids = get_request_ids(); # p $request_ids;

    # get request result 1-to-1 data:
    my $query = $sql_lib->retr('ncrs_data'); # p $query;
    my $result = $dbix->query($query, @$request_ids);

    # get data maps of vertical tables - much faster than tying into main query:
    my $data_map = get_data_map($request_ids); # p $data_map;

    while ( my $vars = $result->hash ) { # p $vars; next;
        my $request_id = $vars->{request_id};
        # merge vertical table data with $vars:
        combine_data($vars, $data_map->{$request_id});  p $vars;
        push @data, $vars;
    } # p @data;
}

{ # create output file & push to remote server:
    my $file_path = join '/', TMP_DIR, $data_filename;

    my $content = to_xml(\@data); # p $content;
    $content > io($file_path);

    # transfer file:
    unless ($JUST_TESTING) {
        my %args = ( local_filename => $file_path );
        transfer_file(\%args);
    }
}

#-------------------------------------------------------------------------------
sub to_xml {
    my $data = shift; # arrayref of request datasets

    my $count = @$data;

    # options for XMLout (needs array or hash):
    my %xs_opts = xs_options(); # p %xs_opts;
    my $xs = XML::Simple->new(%xs_opts);

    my $ref = {
        Id => {
            root => '18DA78B8-44A4-4AA7-95B0-87C198E6B0AE', # where does this come from ?
        },
        OrgCodeSubmitter => {
            extension => HMDS_ORG_CODE,
        },
        RecordCount => {
            value => $count,
        },
        ReportingPeriodStartDate => $start_date,
        ReportingPeriodEndDate   => $end_date,
        FileCreationDateTime     => $timestamp,
    };

    push @{ $ref->{COSDRecord} }, format_cosd_unit($_) for @$data; # p $ref;

    # enclose xml in outer <add> block; add ulisa 'version' inline:
    # my $input = { add => $vars, version => 2 };
   	my $xml_out = $xs->XMLout($ref); # p $xml_out;
    return $xml_out;
}

#-------------------------------------------------------------------------------
# formats repeating COSDRecord blocks:
sub format_cosd_unit {
    my $v = shift;

    my %COSDRecord = (
        Id => {
            root => 'B63AFD92-568E-4845-A0CE-2643FE15DE08', # where does this come from ?
        },
    );
    my %HAEM;
    { # COSDRecord/Haematology/HaematologyCore:
        my %h = (
            HaematologyCoreLinkagePatientId => {
                NHSNumber => {
                    extension => $v->{nhs_number},
                },
                NHSNumberStatusIndicator => {
                    code => '02', # Number present but not traced
                },
                LocalPatientId => $v->{unit_number},
                Birthdate => $v->{dob},
                OrgCodeOfProvider => {
                    extension => $v->{organisation_code},
                },
            },
            HaematologyCoreLinkageDiagnosis => {
                InvestigationResultDate => $v->{authorisation_date},
                ServiceReportId => $v->{labno},
            },
            HaematologyCoreDemographics => {
                PersonFamilyName => {
                    family => $v->{last_name},
                },
                PersonGivenName => {
                    given => given_name($v),
                },
                Address => {
                    UnstructuredAddress => {
                        streetAddressLine => $v->{address},
                    },
                },
                Postcode => {
                    postalCode => $v->{post_code},
                },
                Gender => {
                    code => gender_code($v->{gender}),
                },
            },
            HaematologyCorePathology => {
                PathologistConsultantCode => { extension => $v->{reporter_code} },
                PathTestReqCareProfCode   => { extension => $v->{referrer_code} },
                SampleCollectionDate      => { extension => $v->{sample_date}   },
                OrgCodePathReport         => { extension => HMDS_ORG_CODE },
                ServiceReportStatus       => { code => 1 }, # final (complete)

                PathTestReqSiteCode => { extension => $v->{organisation_code} },
                PathologyReportText => free_text_report($v),
                SampleReceiptDate   => $v->{request_date},
                MorphologySNOMED    => icdo3_to_snomed($v->{icdo3}), # ICDO3

                # useless for haematology - refers to solid tumours:
                # 1 = primary, 4 = region LN, 5 = metastatic, 9 = not known
                SpecimenNature => { code => 9 },
                # specimen code (maybe multiple) - not sure yet which to use:
                # TopographySNOMED => sample_to_snomed($v),
                # PrimaryDiagnosisICDPath => sample_to_snomed($v),
            },
        );
        $HAEM{HaematologyCore} = \%h;
    }
    { # COSDRecord/Haematology/HeamatologyContent/LaboratoryResults:
        my %h = (
            HighThroughputSequencing => $v->{high_throughput_sequencing},
            Immunocytochemistry      => $v->{immunocytochemistry},
            FlowCytometry            => $v->{flow_cytometry},
            Cytogenetics             => $v->{cytogenetics},
            MicroArray               => $v->{micro_array},
            Molecular                => $v->{molecular},
            FISH                     => $v->{fish},
        );
        $HAEM{HaematologyContent}{LaboratoryResults} = \%h;
    }
    $COSDRecord{Haematology} = \%HAEM;
    return \%COSDRecord;
}

#-------------------------------------------------------------------------------
# free-text specimen/biopsy_site/morphology/comment, etc:
sub free_text_report {
    my $vars = shift;

    my @rows;
    push @rows, ( 'SPECIMEN(S): ' . $vars->{specimen_decode} );
    if ( my $biopsy_site = $vars->{biopsy_site} ) {
        push @rows, ( 'ANATOMICAL SITE(S): ' . $biopsy_site );
    }
    push @rows, ( 'SPECIMEN QUALITY: ' . $vars->{specimen_quality} );
    if ( my $gross_desc = $vars->{gross_description} ) {
        push @rows, ( 'GROSS DESCRIPTION: ' . $gross_desc );
    }
    if ( my $morphology = $vars->{morphology} ) {
        push @rows, ( 'MORPHOLOGY: ' . $morphology );
    }
    if ( my $comment = $vars->{comment} ) {
        push @rows, ( 'COMMENT: ' . $comment );
    }
    push @rows, ( 'DIAGNOSIS: ' . $vars->{diagnosis} );

    return join "\n", @rows;
}

#-------------------------------------------------------------------------------
# merge vertical table data with $vars - no return needed (working on hashref)
sub combine_data {
    my ($vars, $supplimentary_data) = @_; # p $supplimentary_data;

    { # add data-map fields:
        # result summaries:
        map $vars->{$_} = $supplimentary_data->{result_summary}->{$_},
            qw( immunocytochemistry flow_cytometry fish cytogenetics molecular
                micro_array high_throughput_sequencing );
        # reporter:
        map $vars->{$_} = $supplimentary_data->{reporter_data}->{$_},
            qw(reporter_name reporter_code);
        # specimen code(s) & decode(s):
        $vars->{specimen_code}
            = $supplimentary_data->{specimen}->{specimen_code};
        $vars->{specimen_decode} # not needed ?
            = $supplimentary_data->{specimen}->{description};
    } # p $vars;

    # tidy up (trim content & substitute new-line markers):
    map { $vars->{$_} = tidy($vars->{$_}) }
        grep $vars->{$_},
            qw(comment immunocytochemistry flow_cytometry fish gross_description
                micro_array high_throughput_sequencing cytogenetics molecular);

    $vars->{gender} ||= 'U'; # p $vars;
    return 0; # updating $vars hashref, return not required
}

#-------------------------------------------------------------------------------
sub sample_to_snomed {
    my $vars = shift;

    my @snomed;
    # get info from biopsy_site if available, otherwise specimen code(s):
    if ( my $biopsy_site = $vars->{biopsy_site} ) { # optional
        my @sites = split '; ', $biopsy_site; # maybe multiple
        push @snomed, $anatomical_site_map->{$_} for @sites;
    }
    else {
        my @specimens = split ',', $vars->{specimen_code};
        push @snomed, $specimen_topography{$_} for @specimens;
    } # p @snomed;

    # legacy biopsy site that doesn't match anatomical_sites.site_name:
    return { code => 'C80.9' } unless @snomed; # site unknown

    return scalar @snomed == 1
        ? { code => $snomed[0] }
        : [ map { code => $_ }, @snomed ];
}

#-------------------------------------------------------------------------------
sub icdo3_to_snomed { # eg 9960/3 -> M99603
    local $_ = shift || return undef; # p $icdo3;
    m!(\d{4})/(\d)!;
    return 'M' . $1 . $2;
}

#-------------------------------------------------------------------------------
sub gender_code {
    my $gender = shift;
    return 1 if $gender eq 'M';
    return 2 if $gender eq 'F';
    return 9; # x = not known, 9 = not specified
}

#-------------------------------------------------------------------------------
# first + optional middle names
sub given_name {
    my $vars = shift;
    return join ' ', grep $_, @{$vars}{ qw/first_name middle_name/ };
}

#-------------------------------------------------------------------------------
sub xs_options {
    my %opts = (
        SuppressEmpty => 0, # ? only works for xml_in
        RootName      => 'COSD',
        XMLDecl       => q!<?xml version="1.0" encoding="UTF-8"?>!,
        KeyAttr       => [],
        NoAttr        => 1, # gives inline (scalar) AND nested (arrayref) attributes
    );
    return %opts;
}

#-------------------------------------------------------------------------------
sub get_request_ids {
    my @request_ids = (); # return [ 264710, 204466 ];
    { # new diagnoses:
        my $sql = $sql_lib->retr('ncrs_new_diagnosis_request_ids');
        my $request_ids = $dbix->query($sql, $duration, $duration)->flat;
        push @request_ids, @$request_ids; # p $request_ids;
    }
    { # get revised diagnoses:
        my $sql = $sql_lib->retr('ncrs_revised_diagnosis_request_ids');
        my $request_ids = $dbix->query($sql, $duration, $duration)->flat;
        push @request_ids, @$request_ids; # p $request_ids;
    }
    # combine new & revised diagnosis request ID's; eliminate duplicates:
    my %request_ids = map { $_ => 1 } @request_ids; # p %request_ids;
    # return unique ID's as arrayref:
    return [ keys %request_ids ];
}

#-------------------------------------------------------------------------------
sub transfer_file { # only called if not $JUST_TESTING
    my $args = shift;

    my $cfg = $settings->{nycris_server};

    my %params = (
        local_filename => $args->{local_filename},
        server_addr    => $cfg->{remote_address},
        username       => $cfg->{username},
        password       => $cfg->{password},
    ); # p %params;

    # scp file (returns str on failure, undef on success):
    my $rtn = $tools->scp_file(\%params); # p $rtn;

    my $script = $tools->script_filename;

    if ($rtn) {
        $tools->mail_admin({ script => $0, msg => $rtn });
        warn "$script - $rtn\n"; # dump to logs but don't die!!
    }
    else {
        archive_and_delete_file($args->{local_filename});

        my $month = $ref_datetime->month_name; # for cron log:
        print "$script reports data for $month succesfully uploaded\n";
    }
}

#-------------------------------------------------------------------------------
# archive data file (only called if not $JUST_TESTING):
sub archive_and_delete_file {
    my $src_file = shift; # p $src_file;

	my $path_to_archive = $settings->{nycris_server}->{path_to_archive};

    # make sure it exists otherwise get error trying to tar non-existant file
    if (-e $src_file ) { # warn 'it exists';
        my $tar_file = sprintf '%s/%s.tar',
            $path_to_archive, $data_filename; # p $tar_file;

		chdir TMP_DIR; # so we can use relative path for source directory

		system( sprintf 'tar -cf %s %s', $tar_file, $data_filename ); # using relative $data_file.* not full path
		system( sprintf "gzip $tar_file" ); # compress tar file
        # delete source file:
        io($src_file)->unlink;
    }
}

#-------------------------------------------------------------------------------
sub get_user_code {
    my $username = shift;

    return $settings->{national_codes}->{$username}
    || 'H9999998'; # OTHER HEALTH CARE PROFESSIONAL
}

#-------------------------------------------------------------------------------
sub get_data_map {
    my $request_ids = shift;

    my $map = {};

    { # request_specimens:
        my $result = do {
            my $sql = $sql_lib->retr('nycris_request_specimens'); # p $sql;
            $dbix->query($sql, @$request_ids);
        };
        while ( my $vars = $result->hash ) { # p $vars;
            my $request_id = $vars->{request_id};

            my %data = (
                specimen_code => $vars->{code},
                description   => $vars->{description},
            ); # p %data;
            $map->{$request_id}->{specimen} = \%data;
        }
    }
    { # reporter data:
        my $result = do {
            my $sql = $sql_lib->retr('nycris_reporters'); # p $sql;
            $dbix->query($sql, @$request_ids);
        };
        while ( my $vars = $result->hash ) { # p $vars;
            my $request_id = $vars->{request_id};

            my $reporter = join ' ',
                ucfirst $vars->{first_name},
                ucfirst $vars->{last_name};

            my $reporter_code = get_user_code($vars->{username});

            my %data = (
                reporter_name => $reporter,
                reporter_code => $reporter_code,
            );
            $map->{$request_id}->{reporter_data} = \%data;
        } # p $map->{reporter_data};
    }
    { # result summaries:
        my $sql = $sql_lib->retr('nycris_result_summaries');
        my $result = $dbix->query($sql, @$request_ids);

        while ( my $vars = $result->hash ) { # p $vars;
            my $request_id  = $vars->{request_id};
            my $section     = $vars->{section_name};
            my $result      = $vars->{results_summary};

            $section =~ s/ /_/g; # make section suitable for use as hash key
            $map->{$request_id}->{result_summary}->{lc $section} = $result;
        } # p $map->{result_summary};
    }

    return $map;
}

#-------------------------------------------------------------------------------
sub tidy {
    my $str = shift;

    # trim:
    $str =~ s/\A\s+//; # leading
    $str =~ s/\s+\Z//; # trailing

#    $str =~ s/(\r\n)/$new_line_marker/g; # doesn't capture just \n
    $str =~ s/\r?\n/$new_line_marker/g; # captures both \r\n & \n

    return $str;
}