P3X GitList

#!/usr/bin/env perl

=begin
--------------------------------------------------------------------------------
generates NCRS XML data file for all authorised requests with diagnosis either
ICDO3 '/3', or 'see comments' with a non-null comment section, where authorisation
date or diagnosis revision date during previous 1 month

test mode skips file transfer & archive, dumps data to file (/tmp dir)

* results summaries combined with PathologyReportText until schema evolves to
  accomodate them, then will need to revert as_cdata($result) & to_camel_case()

usage:
    CENTRE=<centre> $0 [-m <months> - optional (defaults to 1)][-t - testing]
    # scp /tmp/<filename> hmds@163.160.247.17:<filename> # broken

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!! for new server: need to manually ssh into server once to add key to known_hosts
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

online reference:
Using ICD-O-3 online: http://codes.iarc.fr/usingicdo.php
ICDO3 topographical codes: http://codes.iarc.fr/topography
ICDO3 morphological codes: http://codes.iarc.fr/codegroup/2

ICD-O-3 to ICD-10 conversion for hemato-oncological diseases:
<http://www.krebsdaten.de/Krebs/DE/Content/Publikationen/Poster/Downloads/2014/
hemato-oncological_diseases_classification_ottawa_engl.pdf?__blob=publicationFile>

All leukemias have a site-specific code of bone marrow (C42.1) except myeloid sarcoma
Multiple myeloma � code site to C42.1
RULE D. Topography codes for lymphomas: If a lymphoma involves multiple lymph node regions,
code to C77.8 (lymph nodes of multiple regions). Code extranodal lymphomas to the site of
origin, which may not be the site of the biopsy. If no site is indicated for a lymphoma,
code to C77.9 (lymph node, NOS). C80.9 if suspected to be extranodal but site is not stated.
--------------------------------------------------------------------------------
=cut

use Getopt::Std;
getopts('m:tq'); # months, testing, sql-trace
our($opt_m,$opt_t,$opt_q); # warn $opt_m; exit;

use charnames ':full'; # for perl < 5.16 - to use unicode names (eg LINE FEED)

use lib '/home/raj/perl5/lib/perl5';

use Encoding::FixLatin qw/fix_latin/;
use Data::GUID qw(guid_string);
use XML::Simple qw(:strict); # need to specify ForceArray & KeyAttr if using :strict here
use XML::SAX::ParserFactory;
use XML::Validator::Schema;
use XML::Compile::Schema;
use SQL::Abstract::More;
use Data::Printer;
use Config::Auto;
use Modern::Perl;
use XML::LibXML;
use Try::Tiny;
use DateTime; # last_day_of_month()
use IO::All;

use FindBin qw($Bin); # warn $Bin;
use lib $Bin . '/../../../lib';
use LIMS::Local::ScriptHelpers;

use lib '/home/raj/perl-lib';
use Local::DB; # switch off date auto-inflation:
$Local::DBIx::Simple::Result::NO_AUTO_DATE_INFLATION = 1;

my $centre = $ENV{CENTRE} || die 'no centre param supplied';

#===============================================================================
my $JUST_TESTING    = $opt_t || 0; # skips file transfer & file archive
my $duration        = $opt_m || 1; # months ago
my $new_line_marker = '[~~]'; # using CDATA now
#===============================================================================

$ENV{SQL_TRACE} = $opt_q;

# supply 'SuppressEmpty' arg (1 or undef) to XMLOut: ---------------------------
=begin
    # SuppressEmpty => 1:
    <Address>
        <UnstructuredAddress />
    </Address>
    # SuppressEmpty => undef:
    <Address>
        <UnstructuredAddress>
            <streetAddressLine></streetAddressLine>
        </UnstructuredAddress>
    </Address>
=cut
my $suppress_empty = 1; # options 1 or undef to suppress warning, 0 gives warning
#-------------------------------------------------------------------------------

my $cfg_file = "$Bin/../lib/settings.cfg";
my $settings = Config::Auto::parse($cfg_file); # p $settings;

my $ncrs_cfg = $settings->{ncrs_data}->{$centre}
    || die "can't find settings for $centre in $cfg_file"; # p $ncrs_cfg;
# organisation code where service (HMDS, NBT, UCLH, etc) located:
my $service_org_code = $ncrs_cfg->{org_code}
    || die 'service org code not configured';
my $hmds_org_code = 'RR8F4'; # apparently

# get tools from LIMS::Local::ScriptHelpers:
my $tools = LIMS::Local::ScriptHelpers->new();

my $config  = $tools->config();
my $dbname  = $centre eq 'leeds' ? 'hilis4' : $centre;
my $dbix    = Local::DB->dbix({ dbname => $dbname }); # warn $dbix->dbh->{Name};

# dates ========================================================================
my $timestamp = $tools->time_now->datetime;
# get date last month - handles BST:
my $ref_date = $tools->date_subtract(months => $duration);

# format for ReportingPeriodStartDate & ReportingPeriodEndDate
my $start_date = $ref_date->strftime('%Y-%m-01');
my $end_date   = DateTime->last_day_of_month(
    year  => $ref_date->year,
    month => $ref_date->month,
)->ymd; # p $start_date; p $end_date;
# ==============================================================================

my @lab_sections = $dbix->select('lab_sections','section_name')->column;
# remove non-word chars to match result_summary data from get_data_map():
$_ = to_camel_case($_) for @lab_sections; # p @lab_sections; exit;

# The submission file must be named using the following convention:
# COSD_<FILE SOURCE>_<Submitting Org>_<Reporting Period Start Date>_
#    <Reporting Period End Date>_<Date of file creation>.xml
my $data_filename = sprintf 'COSD_PATH_%s_%s_%s_%s.xml',
    $service_org_code, $start_date, $end_date, $timestamp; # p $data_filename;
# $data_filename = 'hmds_test_cosd.xml';

my @skip_screens = ( # not very maintainable as new ones added!!
    'Inappropriate/unsuitable sample',
    'Myeloma trial follow-up',
    'Myeloma XI follow-up',
    'Follow-up CML (PB)',
    'Follow-up CML (BM)',
    'CLL trial follow-up',
    'Chimerism (CHIA/CHIB)',
    'Chimerism (CHIM)',
    'ACE-WM follow-up',
    'HIV monitoring',
    'NCG PNH (PB)',
    'NCG PNH (BM)',
    'Outreach',
);

use constant TMP_DIR => '/tmp'; # where to create temp data file
my %all_guids; # GUID counter to ensure uniqueness

my @data = ();
{ # go:
    # get requests 1-to-1 data:
    my @requests = do {
        my ($sql, @bind) = _ncrs_data();
        $dbix->query($sql, @bind)->hashes;
    }; # p @requests; exit;

    my @request_ids = map $_->{request_id}, @requests; # p @request_ids;
    # get data maps of vertical tables - much faster than tying into main query:
    my $data_map = get_data_map(\@request_ids); # p $data_map;

    for my $r (@requests) { # p $r; next;
        my $request_id = $r->{request_id};
        # merge vertical table data with $r:
        my $vars = combine_data($r, $data_map->{$request_id}); # p $r;
        push @data, $vars;
    } # p @data;
}
# enable hash counter in new_guid() for debugging:
say "$_ is not unique" for grep { $all_guids{$_} > 1 } keys %all_guids;

{ # create output file & push to remote server:
    my $file_path = join '/', TMP_DIR, $data_filename;
    # generate xml (validation failure warns error & returns 0):
    if ( my $content = to_xml(\@data) ) { # p $content;
        $content > io($file_path);
        # transfer file:
        unless ($JUST_TESTING) {
            my %args = ( local_filename => $file_path );
            transfer_file(\%args);
        }
    }
}

#-------------------------------------------------------------------------------
sub to_xml {
    my $data = shift; #p $data; # arrayref of request datasets

    my $count = @$data; # p $count;

    my $ref = {
        OrganisationIdentifierCodeOfSubmittingOrganisation => { # always Leeds/HMDS:
            extension => $hmds_org_code,
        },
        Id => { root => new_guid() }, # returns new Data::GUID as_string
        RecordCount => { value => $count },
        ReportingPeriodStartDate => $start_date,
        ReportingPeriodEndDate   => $end_date,
        FileCreationDateTime     => $timestamp,
    }; # p $ref;

=begin # XML::Simple method (doesn't support ordered elements without hack):
    # options for XMLout (needs array or hash):
    my %xs_opts = xs_options(); # p %xs_opts;
    my $xs = XML::Simple->new(%xs_opts);

   	my $xml_out = $xs->XMLout($ref); # p $xml_out;
    # get repeating COSDRecords (horrible hack because XML::Simple doesn't
    # support ordered elements, create 2nd xml document and splice into outer one:
    {
        my %xs_opts = xs_options(); # p %xs_opts;
        $xs_opts{RootName} = undef; # suppress, don't want outer xml element
        delete $xs_opts{XMLDecl};   # delete, don't need for COSDRecord elements

        my $xs = XML::Simple->new(%xs_opts); # p %xs_opts;
        my @COSDRecords = map format_cosd_unit($_), @$data; # p @COSDRecords;
        my $records = $xs->XMLout({ COSDRecord => \@COSDRecords }); # p $records;
        # splice in cosd records:
        $xml_out =~ s{(</COSD:COSD>)}{$records . $1}e;
    }
=cut
#=begin # XML::LibXML method
    push @{ $ref->{COSDRecord} }, format_cosd_unit($_) for @$data; # p $ref;
    # schema created from dummy xml by https://www.freeformatter.com/xsd-generator.html#ad-output
    # had to change COSD:COSD in xml to COSD or outer element skipped:
    my $xsd = '/home/raj/apps/HILIS4/setup/schema/xsd/ncrs.xsd';
    my $schema = XML::Compile::Schema->new($xsd); # p $schema;

    # see what types are defined
      # $schema->printIndex; # exit;
      # p $schema->template('PERL' => 'COSD'); exit;

    # create and use a writer
    my $doc   = XML::LibXML::Document->new('1.0', 'UTF-8');
    my $write = $schema->compile(WRITER => 'COSD');
    my $xml   = $write->($doc, $ref);
    $doc->setDocumentElement($xml);
    # show result
    my $xml_out = $doc->toString(1);

    # change COSD to COSD:COSD (as colon not supported in xsd):
        # "error: field name contains 'COSD:COSD' which is not a valid NCName at COSD:COSD"
    $xml_out =~ s/COSD(?!Record)/COSD:COSD/g; # match <COSD> but not <COSDRecord>
#=cut

    # hack to add namespace data:
    chomp( my $namespace = namespace_data() ); # remove trailing new-line
    $xml_out =~ s/(COSD:COSD)/$1\n$namespace/; # don't use 'g', only want start-tag
    # p $xml_out;
    { # validate xml:
        # switch debug on to show data structure:
        my $validator = XML::Validator::Schema->new(file => $xsd, debug => 0); # p $validator;
        my $parser    = XML::SAX::ParserFactory->parser(Handler => $validator);
        my $err;
        try { $parser->parse_string($xml_out) }
        catch { $err = $_ };
        if ($err) {
            warn $err;
            return 0;
        }
    }
    return $xml_out;
}

#-------------------------------------------------------------------------------
# formats repeating COSDRecord blocks:
sub format_cosd_unit {
    my $v = shift;

    my %COSDRecord = (
        Id => {
            root => new_guid(), # returns new Data::GUID as_string
        },
    );
    my %HAEM;
    { # COSDRecord/Haematology/HaematologyCore:
        my $labno = $ncrs_cfg->{labno_prefix}
            . $v->{request_number} . '/' . ( $v->{year} - 2000 );
        my %h = (
            HaematologicalCoreLinkagePatientId => {
                NHSNumber => {
                    extension => $v->{nhs_number},
                },
                NHSNumberStatusIndicator => {
                    code => '02', # Number present but not traced
                },
                LocalPatientIdExtended => $v->{unit_number},
                Birthdate => $v->{dob},
                OrganisationIdentifierCodeOfProvider => {
                    extension => $v->{organisation_code},
                },
            },
            HaematologicalCoreDemographics => {
                PersonFamilyName => {
                    family => $v->{last_name},
                },
                PersonGivenName => {
                    given => given_name($v),
                },
                Address => {
                    UnstructuredAddress => {
                        streetAddressLine => $v->{address},
                    },
                },
                Postcode => {
                    postalCode => $v->{post_code},
                },
                Gender => {
                    code => gender_code($v->{gender}),
                },
            },
            HaematologicalCorePathology => {
                OrganisationSiteIdentifierOfPathologyTestRequest => {
                    extension => $v->{organisation_code}
                },
                OrganisationIdentifierOfReportingPathologist => {
                    extension => $service_org_code
                },
                PathologistConsultantCode => {
                    extension => $v->{reporter_code}
                },
                PathTestReqCareProfCode => {
                    extension => $v->{referrer_code}
                },
                SampleCollectionDate => {
                    extension => $v->{sample_date}
                },
                MorphologySNOMEDPathology => { # ICDO3
                    code => icdo3_to_snomed($v->{icdo3})
                },
                ServiceReportStatus => {
                    code => report_status($v->{diagnosis})
                },
                InvestigationResultDate => $v->{authorisation_date},
                PathologyReportText     => free_text_report($v),
                SampleReceiptDate       => $v->{request_date},
                ServiceReportId         => { extension => $labno },
                # morphology code if needed is 3 or 1 (from ICDO3 /3 or /1 suffix)

            # useless for haematology - refers to solid tumours:
                # 1 = primary, 4 = region LN, 5 = metastatic, 9 = not known
                # SpecimenNature => { code => 9 },

            # One of TOPOGRAPHY (SNOMED), TOPOGRAPHY (SNOMED CT) or PRIMARY
            # DIAGNOSIS (ICD PATHOLOGICAL) is required for the schema. It is
            # expected that most submissions will include TOPOGRAPHY (SNOMED).

            # PRIMARY DIAGNOSIS (ICD PATHOLOGICAL) is the PRIMARY DIAGNOSIS
            # based on the evidence from a pathological examination. Format
            # CXX.X or DXX.X
                # PrimaryDiagnosisICDPath => sample_to_snomed($v),

            # topography code (primary site of origin):
                TopographySNOMEDPathology => { code => topography_code($v) },
            },
        );
        $HAEM{HaematologicalCore} = \%h;
    }
# lab results not part of v8 schema (temporarily combined with PathologyReportText):
#    { # COSDRecord/Haematology/HaematologyContent/LaboratoryResults:
#        my %h = map +( $_ => $v->{$_} ), @lab_sections;
#        $HAEM{HaematologicalContent}{LaboratoryResults} = \%h;
#    }
    $COSDRecord{Haematological} = \%HAEM;
    return \%COSDRecord;
}

#-------------------------------------------------------------------------------
sub report_status { # 1 = Final (Complete); 2 = Preliminary (Interim)
    my $diagnosis = shift; # p $diagnosis;

    my @interim_terms = (
        'awaiting final (diagnosis|review)',
        'interim result', # NBT
    );
    return ( grep { lc $diagnosis =~ /^$_/ } @interim_terms ) ? 2 : 1;
}

#-------------------------------------------------------------------------------
# free-text specimen/biopsy_site/morphology/comment, etc:
sub free_text_report {
    my $vars = shift; # p $vars;

    my @rows;
    push @rows, ( 'SPECIMEN(S): ' . $vars->{specimen_decode} );
    if ( my $biopsy_site = $vars->{biopsy_site} ) {
        push @rows, ( 'ANATOMICAL SITE(S): ' . $biopsy_site );
    }
    push @rows, ( 'SPECIMEN QUALITY: ' . $vars->{specimen_quality} );
    if ( my $gross_desc = $vars->{gross_description} ) {
        push @rows, ( 'GROSS DESCRIPTION: ' . $gross_desc );
    }
    if ( my $details = $vars->{clinical_details} ) {
        push @rows, ( 'CLINICAL DETAILS: ' . $details );
    }
    for my $section (@lab_sections) { # results summaries:
        my $summary = $vars->{$section} || next;
        push @rows, ( join ': ', uc $section, $vars->{$section} );
    }
    if ( my $morphology = $vars->{morphology} ) {
        push @rows, ( 'MORPHOLOGY: ' . $morphology );
    }
    if ( my $comment = $vars->{comment} ) {
        push @rows, ( 'COMMENT: ' . $comment );
    }
    push @rows, ( 'DIAGNOSIS: ' . $vars->{diagnosis} );
    push @rows, (
        'REPORTED BY: ' . $vars->{reporter_name}
        . ' on ' . $vars->{report_datetime}
    );
    my $str = join "\n", @rows;
    # enclose text in CDATA tags:
    return as_cdata($str);
}

#-------------------------------------------------------------------------------
# merge vertical table data with $vars
sub combine_data {
    my ($vars, $supplimentary_data) = @_; # p $supplimentary_data;

    { # add data-map fields:
        # result summaries (enclose text in CDATA tags):
        for (@lab_sections) {
            my $result = $supplimentary_data->{result_summary}->{$_} || next; # can't be '0'
            # temporarily combined with PathologyReportText so doesn't need to be CDATA:
            $vars->{$_} = $result; # as_cdata($result);
        }
        # reporter:
        $vars->{$_} = $supplimentary_data->{reporter_data}->{$_},
            for qw(reporter_name reporter_code report_datetime);
        # specimen code(s) & decode(s):
        $vars->{specimen_code}
            = $supplimentary_data->{specimen}->{specimen_code};
        $vars->{specimen_decode}
            = $supplimentary_data->{specimen}->{description};
    } # p $vars;

    $vars->{gender} ||= 'U'; # p $vars;
    return $vars;
}

#-------------------------------------------------------------------------------
sub topography_code {
    my $vars = shift;
    # C80.9 / Unknown primary site:
    return 'C80.9' unless $vars->{icdo3};

    my ($icdo3) = $vars->{icdo3} =~ /^(\d{4})/; # capture 1st 4 digits;

    # plasmacytomas? CLL? EMZL? - check the MZL & MYD88-mutated entries!!

    my %topo_code = (
        8720 => 'C44._', # metastatic melanoma
        9689 => 'C42.2', # splenic MZL
        9699 => 'C80.9', # extranodal MZL (Rule D)
        9700 => 'C44._', # MF
        9709 => 'C44._', # cuteneous TCL
        9718 => 'C44._', # primary cuteneous CD30+ LPD
        # 9731 => 'C40._', # plasmacytoma (only of bone; C40 is bones, joints, etc)
        9732 => 'C42.1', # myeloma
        9733 => 'C42.1', # plasma-cell leukaemia
    );
    return $topo_code{$icdo3} if $topo_code{$icdo3};

    # primary non-haematological tumour/metastatic carcinoma:
    return 'C80.9' if in_num($icdo3, 8000, 8010);
    # rhabdomyosarcoma/pnet/neuroblastoma:
    return 'C80.9' if in_num($icdo3, 8900, 9260, 9500);
    # mastocytosis, histiocytosis, langerhans:
    return 'C80.9' if between($icdo3, 9741 => 9755);
    # lymphomatoid granulomatosis, amyloidosis:
    return 'C80.9' if between($icdo3, 9766 => 9769);
    # Rule E: all leukaemias (9801/3 .. 9989/3):
    return 'C42.1' if between($icdo3, 9801 => 9990);
    # Rule D: default for lymphomas (9591/3 .. 9739/3)
    return 'C77._' if between($icdo3, 9591 => 9739);
    # escaped all above returns so output for debugging:
    p $vars->{diagnosis};
}

#-------------------------------------------------------------------------------
sub between { # adapated from Acme::Tools
  my ($test, $lower, $upper) = @_;

  return $lower < $upper
    ? $test >= $lower && $test <= $upper
    : $test >= $upper && $test <= $lower;
}

#-------------------------------------------------------------------------------
sub in_num { # lifted from Acme::Tools
  my $val = shift || return 0; # shift 1st val, remaining in @_
  for (@_) { return 1 if $_ == $val }
  return 0;
}

#-------------------------------------------------------------------------------
sub icdo3_to_snomed { # eg 9960/3 -> M99603
    local $_ = shift || return undef; # p $icdo3;
    s{/}{}; # remove slash
    return 'M' . $_;
}

#-------------------------------------------------------------------------------
sub gender_code {
    my $gender = shift;
    return 1 if lc $gender eq 'm';
    return 2 if lc $gender eq 'f';
    return 9; # x = not known, 9 = not specified
}

#-------------------------------------------------------------------------------
# first + optional middle names
sub given_name {
    my $vars = shift;
    return join ' ', grep $_, @{$vars}{ qw/first_name middle_name/ };
}

#-------------------------------------------------------------------------------
sub xs_options {
    my %opts = (
        SuppressEmpty => $suppress_empty,
        RootName      => 'COSD:COSD',
        NoEscape      => 1, # to preserve <!CDATA[..]]> tags
        XMLDecl       => q!<?xml version="1.0" encoding="UTF-8"?>!,
        KeyAttr       => [],
        NoAttr        => 1, # gives inline (scalar) AND nested (arrayref) attributes
    );
    return %opts;
}

#-------------------------------------------------------------------------------
# scp function broken, replaced by email 07/2016
sub transfer_file { # only called if not $JUST_TESTING
    my $args = shift;

    my $cfg = $settings->{nycris_server};

    my %params = (
        local_filename => $args->{local_filename},
        server_addr    => $cfg->{remote_address},
        username       => $cfg->{username},
        password       => $cfg->{password},
    ); # p %params;

    # scp file (returns str on failure, undef on success):
    # my $rtn = $tools->scp_file(\%params); # broken
    my $rtn = email_file(); # 05/07/2016

    my $script = $tools->script_filename;

    if ($rtn) {
        $tools->mail_admin({ script => $0, msg => $rtn });
        warn "$script - $rtn\n"; # dump to logs but don't die!!
    }
    else {
        archive_and_delete_file($args->{local_filename});

        my $month = $ref_date->month_name; # for cron log:
        print "$script reports data for $month succesfully uploaded\n";
    }
}

#-------------------------------------------------------------------------------
sub email_file { # email a password-protected file:
    ( my $zip_filename = $data_filename ) =~ s/txt$/zip/; # warn $zip_filename;
    ( my $subject      = $data_filename ) =~ s/\.txt$//;  # warn $subject;

    my @cmd = ('/usr/bin/zip');
    push @cmd, '-q'; # quiet
	push @cmd, '-j'; # junk dir name
    push @cmd, '--password';
    push @cmd, ucfirst $subject; # using subject title as passwd
    # --------------------------------------------------------------------------
	push @cmd, '-'; # only needed to allow piping of content if using capture()
    # --------------------------------------------------------------------------
    push @cmd, TMP_DIR . '/' . $zip_filename;
    push @cmd, TMP_DIR . '/' . $data_filename; # p @cmd;

=begin # method to save zip file to disk & then use io() for $content:
	# system(@cmd);
	# my $content = io->file(TMP_DIR . '/' . $zip_filename)->all;
=cut
	# use IPC::System::Simple to cature zip output:
	my $content = capture(@cmd);
    my %mail = (
        config  => $config,
        subject => $subject,
        filename   => $zip_filename,
        attachment => $content,
    ); # p %mail; # p @recipients;
    my $ok = $tools->send_mail(\%mail, [ 'nycris.secure' ]); # returns 0 if msg fails
    # for compatibility with scp_file():
    return 'message(s) failed to send' if not $ok;
	return 0; # don't return true unless msg fails
}

#-------------------------------------------------------------------------------
# archive data file (only called if not $JUST_TESTING):
sub archive_and_delete_file {
    my $src_file = shift; # p $src_file;

	my $path_to_archive = $settings->{nycris_server}->{path_to_archive};

    # make sure it exists otherwise get error trying to tar non-existant file
    if (-e $src_file ) { # warn 'it exists';
        my $tar_file = sprintf '%s/%s.tar',
            $path_to_archive, $data_filename; # p $tar_file;

		chdir TMP_DIR; # so we can use relative path for source directory

		system( sprintf 'tar -cf %s %s', $tar_file, $data_filename ); # using relative $data_file.* not full path
		system( sprintf "gzip $tar_file" ); # compress tar file
        # delete source file:
        io($src_file)->unlink;
    }
}

#-------------------------------------------------------------------------------
sub get_data_map {
    my $request_ids = shift; # p $request_ids;

    my $map = {};

    { # request_specimens:
        my $result = do {
            my ($sql, @bind) = _request_specimens($request_ids);
            $dbix->query($sql, @bind);
        };
        while ( my $vars = $result->hash ) { # p $vars;
            my $request_id = $vars->{request_id};

            my %data = (
                specimen_code => $vars->{code},
                description   => $vars->{description},
            ); # p %data;
            $map->{$request_id}->{specimen} = \%data;
        }
    }
    { # reporter data:
        my $result = do {
            my ($sql, @bind) = _reporters($request_ids);
            $dbix->query($sql, @bind);
        };
        while ( my $vars = $result->hash ) { # p $vars;
            my $request_id = $vars->{request_id};

            my $reporter = join ' ',
                ucfirst $vars->{first_name},
                ucfirst $vars->{last_name};

            my $reporter_code = $vars->{registration_number} || 'H9999998'; # OTHER HEALTH CARE PROFESSIONAL;

            my %data = (
                reporter_name   => $reporter,
                reporter_code   => $reporter_code,
                report_datetime => $vars->{report_datetime},
            );
            $map->{$request_id}->{reporter_data} = \%data;
        }
    }
    { # result summaries:
        my $result = do {
            my ($sql, @bind) = _result_summaries($request_ids);
            $dbix->query($sql, @bind);
        };
        while ( my $vars = $result->hash ) { # p $vars;
            my $request_id  = $vars->{request_id};
            my $section     = to_camel_case($vars->{section_name});
            my $result      = fix_latin($vars->{results_summary});
            $map->{$request_id}->{result_summary}->{$section} = $result;
        } # p $map->{result_summary};
    } # p $map;

    return $map;
}

#-------------------------------------------------------------------------------
sub tidy { # not in use, replaced by as_cdata()
    my $str = shift; # p $str;

    # trim:
    $str =~ s/\A\s+//; # leading
    $str =~ s/\s+\Z//; # trailing
    #$str =~ s/\r?\n/$new_line_marker/g; # captures both \r\n & \n
    $str =~ s/\N{CARRIAGE RETURN}?\N{LINE FEED}/$new_line_marker/g;
}

#-------------------------------------------------------------------------------
# need to use XML::Simple opt NoEscape => 1 to preserve CDATA tags:
sub as_cdata {
    my $str = shift;
    # 1) for XML::Simple
    # return "<![CDATA[\n$_[0]\n]]>";
    # 2) for XML::LibXML
    my $cdata = XML::LibXML::CDATASection->new($str); # p $cdata->textContent;
    return $cdata;
}

#-------------------------------------------------------------------------------
sub to_camel_case {
    my $ref = shift; # p $ref;
    $ref =~ s/_/ /g; # convert underscores to spaces for camelCasing
    # now temporarily combined with PathologyReportText so don't need this yet:
    # $ref = join '', map { ucfirst $_ } split '\W', $ref; # p $ref;
    return $ref;
}

#-------------------------------------------------------------------------------
sub namespace_data { # TODO: confirm uri's
    return <<EOF
    xmlns:COSD="http://www.datadictionary.nhs.uk/messages/COSDPathology-v8-1"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
EOF
#    xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/COSDPathology-v8-1
#    COSDCOSDXMLSchema-v8-1.xsd"
}

#-------------------------------------------------------------------------------
sub new_guid {
    my $guid = guid_string();
#   $all_guids{$guid}++; # for debugging - to ensure uniqueness
    return $guid;
}

# SQLA -------------------------------------------------------------------------
sub _ncrs_data {
    my $report_status = $ncrs_cfg->{report_status};

    my @cols = ( qw/
            DISTINCT(r.id)|request_id
            DATE(r.created_at)|request_date
            r.request_number
            r.year
            p.last_name
            p.first_name
            p.middle_name
            p.nhs_number
            pc.unit_number
            p.dob
            p.gender
            pd.address
            pd.post_code
            ref.national_code|referrer_code
            rs.organisation_code
            rr.biopsy_site
            rr.gross_description
            rr.specimen_quality
            DATE(rr.specimen_date)|sample_date
            rr.clinical_details
            rr.morphology
            rr.comment
            d.name|diagnosis
            d.icdo3
            DATE(rh.time)|authorisation_date
    / );
    my @rels = (
        'requests|r'                    => q{r.patient_case_id=pc.id}          ,
        'patient_case|pc'               => q{pc.patient_id=p.id}               ,
        'patients|p'                    => q{r.referrer_department_id=rd.id}   ,
        'referrer_department|rd'        => q{rd.referrer_id=ref.id}            ,
        'referrers|ref'                 => q{pc.referral_source_id=rs.id}      ,
        'referral_sources|rs'           => q{rh.request_id=r.id}               ,
        'request_history|rh'            => q{rr.request_id=r.id}               ,
        'request_report_view|rr'        => q{rr.diagnosis_id=d.id}             ,
        'diagnoses|d'                   => q{ris.request_id=r.id}              ,
        'request_initial_screen|ris'    => q{ris.screen_id=s.id}               ,
        'screens|s'                     => q{=>rdh.request_id=r.id}            ,
        'request_diagnosis_history|rdh' => q{=>pd.patient_id=p.id}             ,
        'patient_demographics|pd'
    );
    # local_network_locations:
    push @rels, ( 'lnl.parent_id=rs.parent_organisation_id' =>
        'local_network_locations|lnl' ) if $centre eq 'leeds';
    my %where = (
        -and => [
            { # is reported/authorised & not excluded screens:
                'rh.action'      => $report_status,
                's.description'  => { -not_in => \@skip_screens },
                # 'r.id' => { -in => [353199, 353081] }, # force specific requests
            },
            -or => [ # reported/authorised or diagnosis history dates:
                {
                    'MONTH(rh.time)' => $ref_date->month,
                    'YEAR(rh.time)'  => $ref_date->year,
                },
                {
                    'MONTH(rdh.time)' => $ref_date->month,
                    'YEAR(rdh.time)'  => $ref_date->year,
                },
            ],
            -or => [ # diagnosis is not MGUS or is 'see coments'
                'd.icdo3' => { '!=' => '9765/1' },
                { 'd.name' => 'see comments', 'rr.comment' => { '!=' => undef } },
            ],
        ],
    );
    my @args = (
        -columns  => \@cols,
        -from     => [ -join => @rels ],
        -where    => \%where,
        -order_by => 'r.id',
       # -limit    => 1, # for testing
    ); # p @args;
    my ($sql, @bind) = SQL::Abstract::More->new->select(@args); # p $sql; p \@bind;
        # $dbix->dump_query($sql, @bind); exit;
    return ($sql, @bind);
}

=begin # alternative to _ncrs_data(), but can't use for NBT as they don't authorise
sub _query {
    my @dates = ($start_date, $end_date);

    my @cols = qw(
        r.id|request_id
        arv.request_number
        arv.year
        arv.reg_date|request_date
        p.last_name
        p.first_name
        p.middle_name
        p.nhs_number
        pc.unit_number
        p.dob
        p.gender
        arv.organisation_code
        pd.address
        pd.post_code
        ref.national_code|referrer_code
        rr.clinical_details
        rr.biopsy_site
        rr.gross_description
        rr.specimen_quality
        DATE(rr.specimen_date)|sample_date
        rr.morphology
        rr.comment
        arv.diagnosis
        arv.icdo3
        arv.auth_date|authorisation_date
    );
    my @rels = (
        'requests|r'                    => 'r.id=arv.id'                      ,
        'authorised_reports_view|arv'   => 'r.patient_case_id=pc.id'          , # ensures it's authorised
        'patient_case|pc'               => 'pc.patient_id=p.id'               ,
        'patients|p'                    => 'pc.referral_source_id=rs.id'      ,
        'referral_sources|rs'           => 'rs.parent_organisation_id=po.id'  ,
        'parent_organisations|po'       => 'lnl.parent_id=po.id'              ,
        'local_network_locations|lnl'   => 'r.referrer_department_id=rd.id'   ,
        'referrer_department|rd'        => 'rd.referrer_id=ref.id'            ,
        'referrers|ref'                 => 'ris.request_id=r.id'              ,
        'request_initial_screen|ris'    => 'ris.screen_id=s2.id'              ,
        'screens|s2'                    => 'rs2.request_id=r.id'              ,
        'request_specimen|rs2'          => 'rs2.specimen_id=s1.id'            ,
        'specimens|s1'                  => 'rr.request_id=r.id'               ,
        'request_report_view|rr'        => '=>pd.patient_id=pc.patient_id'    ,
        'patient_demographics|pd'       => '=>rdh.request_id=r.id'            ,
        'request_diagnosis_history|rdh'
    );
	my %where = (
        -and => [
            's2.description' => { -not_in => \@skip_screens },
            -or => {
                'DATE(rdh.time)' => { -between => \@dates },
                'arv.auth_date'  => { -between => \@dates },
            },
            -or => [
                'arv.icdo3' => { '!=' => '9765/1' }, # not MGUS
                {
                    'arv.diagnosis' => 'see comments',
                    'rr.comment'    => { -is_not => undef }
                },
            ],
        ],
	);
    my @args = (
		-columns  => \@cols,
		-from     => [ -join => @rels ],
		-where    => \%where,
        -group_by => 'r.id',
        -order_by => 'r.id',
	);
	my ($sql, @bind) = SQL::Abstract::More->new->select(@args); # p $sql; # p \@bind;
	    $dbix->dump_query($sql, @bind); exit;
    return ($sql, @bind);
}
=cut

sub _request_specimens {
    my $request_ids = shift;

    my @cols = (
        'rs.request_id',
        "GROUP_CONCAT(s.description separator '; ')|description",
        'GROUP_CONCAT(s.sample_code)|code',
    );
    my @rels = ( 'request_specimen|rs', 'rs.specimen_id=s.id', 'specimens|s' );
    my %where = ( 'rs.request_id' => { -in => $request_ids } );
    my @args = (
        -columns  => \@cols,
        -from     => [ -join => @rels ],
        -where    => \%where,
        -group_by => 'rs.request_id',
    ); # p @args;
    # need to supply own injection_guard to override build-in sql injection attack
    # prevention - detects ';' in group_concat; modified from SQL::Abstract new()
    my @args_to_new = ( injection_guard => qr/^ \s* go \s/mi );
    my ($sql, @bind) = SQL::Abstract::More->new(@args_to_new)->select(@args); # p $sql; p \@bind;
        # $dbix->dump_query($sql, @bind); exit;
    return ($sql, @bind);
}

sub _reporters {
    my $request_ids = shift;

    my @cols = qw(
        rh.request_id
        rh.time|report_datetime
        u.username
        u.first_name
        u.last_name
        ur.registration_number
    );
    my @rels = (
        'request_history|rh'    => 'rh.user_id=u.id',
        'users|u'               => '=>ur.user_id=u.id',
        'user_registration|ur'
    );
    my %where = (
        'rh.request_id' => { -in => $request_ids },
        'rh.action'     => 'reported',
    );
    my @args = (
        -columns  => \@cols,
        -from     => [ -join => @rels ],
        -where    => \%where,
    ); # p @args;
    my ($sql, @bind) = SQL::Abstract::More->new->select(@args); # p $sql; p \@bind;
        # $dbix->dump_query($sql, @bind); exit;
    return ($sql, @bind);
}

sub _result_summaries {
    my $request_ids = shift;

    my @cols = qw(
        rs.request_id
        rs.results_summary
        ls.section_name
    );
    my @rels = qw(
        request_result_summaries|rs   rs.lab_section_id=ls.id   lab_sections|ls
    );
    my %where = ( 'rs.request_id' => { -in => $request_ids } );
    my @args = (
        -columns  => \@cols,
        -from     => [ -join => @rels ],
        -where    => \%where,
    ); # p @args;
    my ($sql, @bind) = SQL::Abstract::More->new->select(@args); # p $sql; p \@bind;
        # $dbix->dump_query($sql, @bind); exit;
    return ($sql, @bind);
}

#-------------------------------------------------------------------------------
=begin # not mapping samples to snomed
# decode for biopsy_site info:
my $anatomical_site_map =
    $dbix->select('anatomical_sites', [qw(site_name snomed)])->map;
    # p $anatomical_site_map;

sub sample_to_snomed {
    my $vars = shift;

    my @snomed;
    # get info from biopsy_site if available, otherwise specimen code(s):
    if ( my $biopsy_site = $vars->{biopsy_site} ) { # optional
        my @sites = split '; ', $biopsy_site; # maybe multiple
        push @snomed, $anatomical_site_map->{$_} for @sites;
    }
    else {
        my @specimens = split ',', $vars->{specimen_code};
        push @snomed, $specimen_topography{$_} for @specimens;
    } # p @snomed;

    # legacy biopsy site that doesn't match anatomical_sites.site_name:
    return { code => 'C80.9' } unless @snomed; # site unknown

    return scalar @snomed == 1
        ? { code => $snomed[0] }
        : [ map { code => $_ }, @snomed ];
}

# for non-histology specimens (sample_code NOT RLIKE '[DGLRX]([BS]L|F|U)'):
# NB these are ICDO3 topography codes for site of tumour origin
my %specimen_topography = (
    BMA     => 'C42.1',
    BMAT    => 'C42.1',
    CF      => '',
    CHIA    => 'C42.0', # should never issue new diagnosis on CHIA
    CHIB    => 'C42.0', # should never issue new diagnosis on CHIB
    CHIM    => 'C42.0', # should never issue new diagnosis on CHIM
    CMP     => '',      # should never issue new diagnosis on CMP
    EF      => '',
#   HS      => '', # require biopsy_site now
#   LA      => 'C77.9', # require biopsy_site now
    PB      => 'C42.0',
    SE      => '', # never issue new diagnosis on SE
    TBL     => 'C42.1',
    TBP     => 'C42.1',
    TSL     => 'C42.1',
    URI     => '', # never issue new diagnosis on URI
#   XA      => '', # require biopsy_site now
    RXPB    => 'C42.0', # should never issue new diagnosis on RXPB
    TSM     => '', # never issue new diagnosis on TSM
);
=cut