#!/usr/bin/perl
=begin
--------------------------------------------------------------------------------
* generates XML data file for genomics data, ftp transfer, archive as gzip
* also transfers consent forms (or any file in request dir) with ctime within MAX_AGE
* run from cron - set duration in arg eg "genomics_xml.pl -t 300"
* on test server only: run as root/www-data or directory read permissions denied
* set ppm_server -> remote_addr -> localhost in settings.cfg to auto-set $JUST_TESTING
* validated using XML::Validator::Schema
only supports 'unqualified' attributeFormDefault and elementFormDefault
does not recognise xs:long for nhs_number - use xs:positiveInteger instead
* on ftp file transfer failure, xml/pdf files retained in /tmp, also xml files
in /backups/genomics, also can run script manually passing -t seconds from
1st ftp transfer failure
--------------------------------------------------------------------------------
=cut
BEGIN {
use constant MAX_AGE => 3600; # default if no value passed as -t
use Getopt::Std;
getopts('t:d:q'); # time (seconds), database (set by .t), sql trace
our($opt_t,$opt_d,$opt_q);
$ENV{SQL_TRACE} = $opt_q;
} # warn $opt_d; warn $opt_t; exit;
#===============================================================================
my $duration = $opt_t || MAX_AGE; # warn $duration; # seconds
my $database = $opt_d || 'genomics'; # warn $database;
my $JUST_TESTING = 0; # skips file transfer (is auto-set to 1 for dev)
my @recipients = qw(
denise.hancock3@nhs.net
tamar.kammin@sth.nhs.uk
bridgetmorgan@nhs.net
raj
); # christopher.knowles@nhs.net
my $xsd = 'genomics.xsd'; # current xsd - synlinked to most recent file
#===============================================================================
use lib (
'/home/raj/perl5/lib/perl5',
'/home/raj/apps/HILIS4/lib',
'/home/raj/perl-lib', # Local::XMLSimple - patched to escape single-apostrophe
);
use IO::Compress::Gzip qw(gzip $GzipError);
use LIMS::Local::GenomicsValidation;
use LIMS::Local::ScriptHelpers;
use Local::XMLSimple qw(:strict); # need to specify ForceArray & KeyAttr if using :strict here
use Data::Printer alias => 'p', use_prototypes => 0;
use FindBin qw($Bin); # warn $Bin; exit;
use File::Basename;
use Data::Dumper;
use Modern::Perl;
use Local::Utils;
use Path::Tiny;
use Local::DB;
use IO::All;
my $cfg_file = $Bin . '/../lib/settings.cfg'; # p $cfg_file;
my $xsd_src = $Bin . '/../../../setup/schema/xsd/' . $xsd; # p $xsd_src;
my $settings = Config::Auto::parse($cfg_file); # p $settings;
my $ppm_settings = $settings->{ppm_server}; # p $ppm_settings;
my $genomics_cfg = $settings->{genomics}; # p $genomics_cfg;
my $genomics_uat = $genomics_cfg->{uat}; # p $genomics_uat; exit;
#===============================================================================
# override $JUST_TESTING if test script or dev box:
$JUST_TESTING ||= ( $ENV{HARNESS_ACTIVE} ||
$ppm_settings->{remote_address} eq 'localhost' ); # warn $JUST_TESTING;
#===============================================================================
# set some package vars (auto-stringify datetime objects & don't log queries):
$Local::QueryLogger::NO_QUERY_LOGS = 1; # don't need queries in logs dir
$Local::DBIx::Simple::Result::STRINGIFY_DATETIME = 1; # as we hand $data straight to XMLout()
# recent XML::Simple version (2.21) outputs uninitialized warnings on undef fields:
# no warnings "XML::Simple"; # kills hmds_test - need to delete undefs in data blocks
# get tools from LIMS::Local::ScriptHelpers:
my $tools = LIMS::Local::ScriptHelpers->new();
my $dbix = Local::DB->dbix({ dbname => $database });
my $sql_lib = $tools->sql_lib();
my $config = $tools->config(); # for email validation failure
my $validator = LIMS::Local::GenomicsValidation
->new(dbix => $dbix, cfg => $settings, xsd_src => $xsd_src);
my %SQL = (
demographics => $sql_lib->retr('genomics_demographics'),
# unit_number => $sql_lib->retr('genomics_unit_number'), # value loaded in demograhics
request_ids => $sql_lib->retr('genomics_requests'),
storage => $sql_lib->retr('genomics_storage'),
results => $sql_lib->retr('genomics_results'),
consent => $sql_lib->retr('genomics_consent'),
);
# get request ids:
my @request_ids = do {
my ($sql, @bind) = _get_query_for_request_ids(); # p [$sql, @bind];
$dbix->query( $sql, @bind )->column; # new Local::DBIx::Simple::Result method
}; # p \@request_ids; exit;
#===============================================================================
exit unless @request_ids;
#===============================================================================
my $timestamp = LIMS::Local::Utils::time_now();
use constant TMP_DIR => '/tmp'; # where to create temp data file
use constant LOGFILE => '/home/raj/crons/genomics_transfer.log';
my @lab_sections = do {
my @args = ( 'lab_sections', ['section_name'], { is_active => 'yes' } );
$dbix->select(@args)->column; # array
}; # p \@lab_sections;
# only need this if using xml element names for data feed:
# my $test_element_map = get_lab_test_to_xml_element_map(); # p $test_element_map; exit;
{ # go:
# get requests 1-to-1 data:
my @requests = $dbix->query( $SQL{demographics}, @request_ids )->hashes;
# p @requests;
REQUEST:
for my $req (@requests) { # p $req; # get results data:
my $request_id = $req->{request_id};
# allocate GeL/participant id if screened and not already done:
if ( my $arm = $req->{arm} ) { # if screened
my $gel_id = $genomics_uat->{rare_disease} + $request_id; # 122_000_000 + req_id
# add increment if screened as cancer (ie to make 223_000_000 + req_id)
$gel_id += $genomics_uat->{cancer_incr} if $arm =~ /cancer/i; # p $gel_id;
# update unit_number -> participant_id if not already:
unless ( _matches($req->{participant_id}, $gel_id) ) {
$req->{participant_id} = $gel_id;
update_participant_id($req);
}
}
else { # will be invalid xml entry 'UNKNOWN' so delete:
delete $req->{participant_id};
}
my %req_fields = map +($_ => $req->{$_}), # skip unrequired fields:
grep { $_ !~ /\b(year|request_number|patient_case_id)\b/ }
# skip undef vals to avoid uninitialized val warns from XML::Simple
grep defined $req->{$_}, keys %$req; # p \%req_fields;
# remove trailing 'cancer' from disease_type_registration:
$req_fields{disease_type_registration} =~ s/\scancer\Z//i
if $req_fields{disease_type_registration}; # only exists for cancer
# lab number:
$req_fields{lab_number} = join '/',
$req->{request_number}, $req->{year} - 2000; # p \%req_fields;
# build data structure for this request:
my %h = ( Demographics => \%req_fields ); # p \%h;
SECTION:
for my $section_name ( @lab_sections ) { # p $section_name;
# skip unless $section_name in $test_element_map keys:
# my $section_element_map = $test_element_map->{uc($section_name)}
# or next SECTION; # p $section_element_map;
my @data = do {
my @bind = ( $request_id, $section_name );
$dbix->query( $SQL{results}, @bind )->hashes;
}; # p \@data;
# remove any non-word chars from section name -> CamelCase:
my $section_element_name = join '', map ucfirst $_, grep $_ !~ /\d/,
split '\W', $section_name; # p $section_element_name;
RESULT: # each test/result pair for this section (query ensures no undef vals)
for my $ref (@data) { # hashref of keys: test_name & result
my $test_name = $ref->{test_name};
=begin # this is only needed to map test_names to GeL xml element names:
# get XML element name or skip test (not required in XML data):
my $test_element_name = $section_element_map->{$test_name}
or next RESULT; # p $element_name;
# add element name and result to section data:
$h{$section_element_name}{$test_element_name} = $ref->{result};
=cut
# if NOT using GeL xml element names:
$h{$section_element_name}{$test_name} = $ref->{result};
}
}
# fluidx storage (1-to-many with request_id):
if ( my @data = $dbix->query( $SQL{storage}, $request_id )->hashes ) {
# ensure no undef values or XML::Simple v2.21+ issues uninitialized value warnings:
remove_undef_values($_) for @data;
$h{Storage}{Vials}{vial} = \@data; # p \@data;
}
{ # consent taken & withdrawn - from request_lab_test_history file:
my $data = $dbix->query( $SQL{consent}, $request_id )
->map_hashes('action');
if ( my $event = $data->{consent_given} ) {
my @user = @{$event}{ qw/first_name last_name/ };
$h{Consent}{consent_taken_by}
= join ' ', map ucfirst $_, @user;
}
if ( my $event = $data->{consent_withdrawn} ) { # infrequently used:
my @user = @{$event}{ qw/first_name last_name/ };
$h{ConsentWithdrawal}{withdrawal_taken_by}
= join ' ', map ucfirst $_, @user;
}
} # p \%h;
{ # consent form filename (need to allow for >1 file):
my $filepath = get_destination_sub_dir($req); # p $filepath;
if ( -e $filepath ) {
my @contents = io($filepath)->all;
FILE:
for my $f (@contents) { # p $f;
next FILE unless $f->type eq 'file';
my $filename = join '~', $request_id, $f->filename;
# add as array in case >1 file in dir
push @{ $h{Consent}{Files}{file} }, $filename;
my $file_age = ( $timestamp->epoch - $f->ctime ); # p $file_age; # in seconds
next FILE if $file_age > MAX_AGE; # p [ $f->filename, $age ];
{ # file < MAX_AGE so transfer it (returns 0 if JUST_TESTING):
my %args = (
local_filename => $f->name,
remote_filename => $filename,
); # p \%args;
ftp_file(\%args);
}
}
}
} # p \%h;
# validate params for patterns that cannot be done by XML validation libs
# in validate_xml_against_xsd():
if ( my $errs = $validator->validate_params(\%h) ) { # p $errs; # arrayref
unless ($JUST_TESTING) {
notify_admin( { data => \%req_fields, error => $_ } ) for @$errs;
}
else {
say $req_fields{participant_id}, ' ', $_ for @$errs;
}
next REQUEST unless $ENV{HARNESS_ACTIVE}; # need file to be generated
}
{ # create and validate xml file:
my $content = to_xml(\%h); # say $content;
# validate xml, returns XML::SAX::Exception object on failure, undef on success:
if ( my $error = $validator->validate_xml_against_xsd($content) ) { # warn $error;
unless ($JUST_TESTING) {
my %args = ( data => \%req_fields, error => $error );
notify_admin(\%args);
next REQUEST;
}
else { # need the xml file for genomics.t or tests fail
say $req_fields{participant_id}, ' ', $error;
}
}
my $local_name = $ENV{HARNESS_ACTIVE}
? "${database}.xml"
: sprintf '%s_%s.xml',
$request_id, $timestamp->strftime('%Y_%m%d_%H%M%S'); # p $local_name;
my $full_path = join '/', TMP_DIR, $local_name; # p $full_path;
$content > io($full_path);
if ( $ENV{HARNESS_ACTIVE} ) { # dump %h to file and exit loop:
io(join '/', TMP_DIR, "${database}.txt")->print(Dumper \%h);
}
{ # transfer file (returns 0 if JUST_TESTING):
my %args = (
local_filename => $full_path,
remote_filename => $local_name,
ascii_mode => 1, # override default binary mode
);
ftp_file(\%args);
}
#===================================================================
next REQUEST if $JUST_TESTING; # don't archive - .xml retained in /tmp
#===================================================================
{ # archive in genomics_archive dir:
my $archive_dir = $ppm_settings->{genomics_archive}
or die 'no archive dir specified in settings.cfg';
my $archive = join '/', $archive_dir, $local_name;
my $z = new IO::Compress::Gzip($archive . '.gz') or die $!;
$z->write($content);
$z->close();
}
}
}
}
#-------------------------------------------------------------------------------
sub ftp_file {
my $args = shift;
my $remote_filename = $args->{remote_filename};
my $local_filename = $args->{local_filename}; # p $local_filename;
my $ascii_mode = $args->{ascii_mode} || 0; # optional - only for xml files
my %params = (
local_filename => $local_filename,
remote_filename => $remote_filename,
server_addr => $ppm_settings->{remote_address},
username => $ppm_settings->{username},
password => $ppm_settings->{password},
cwd => 'HILIS_GEL', # destination_dir for genomics data
passive_mode => 1,
ascii_mode => $ascii_mode, # override default binary mode in ScriptHelpers::ftp_file()
); # p \%params;
#===============================================================================
return if $JUST_TESTING; # say "about to transfer $remote_filename";
#===============================================================================
{
# ftp file (returns str on failure, undef on success):
my $rtn = $tools->ftp_file(\%params); # p $rtn;
if ($rtn) { # p $rtn;
my $script = File::Basename::basename($0);
my $time = $timestamp->strftime('%Y-%m-%d %T');
my $msg = join ': ', $remote_filename, $rtn;
warn "$script [$time] $msg"; # dump to logs but don't die!! - no smtp on test:
$tools->mail_admin({ script => $script, msg => $msg });
my $logfile_msg = format_msg($rtn);
io(LOGFILE)->append($logfile_msg);
}
else {
my $msg = format_msg($local_filename);
io(LOGFILE)->append($msg);
# only delete xml file, not consent forms!!
my $tmp_dir = TMP_DIR; # p $tmp_dir; # TMP_DIR doesn't work used direct in regex
if ( $local_filename =~ m!^$tmp_dir/! ) { # say "deleting $local_filename";
io($local_filename)->unlink;
}
}
}
}
#-------------------------------------------------------------------------------
sub to_xml {
my $data = shift; # arrayref of request datasets
# remove leading/trailing white-space (eg introduced by pasting from excel):
_trim($data);
# ensure data conforms to XSD:
# _validate_for_xsd($data); # replaced with validate_xml_against_xsd() so vals get fixed
# options for XMLout (needs array or hash):
my %xs_opts = xs_options(); # p %xs_opts;
my $xs = Local::XMLSimple->new(%xs_opts);
my $ref = {
FileCreationDateTime => $timestamp->datetime, # require string
# add direct if NOT using xml element names, otherwise use format_request():
# Record => format_request($data),
Record => $data,
}; # p $ref;
# stringify datetime objects (as XML::Simple can't):
# transform_datetime_to_string($ref); using $Local::DBIx::Simple::Result::STRINGIFY_DATETIME
# enclose xml in outer <add> block; add 'version' inline:
# my $input = { add => $ref, version => 2 };
my $xml_out = $xs->XMLout($ref); # p $xml_out;
return $xml_out;
}
#-------------------------------------------------------------------------------
sub notify_admin {
my $args = shift; # href of data & error
my $error = $args->{error}; # p $error; # stringifies as scalar
my $data = $args->{data}; # p $data; # href
# generate file id, either participant_id (if screened) or lab no:
my $file_id = $data->{participant_id} || $data->{lab_number};
my $message = join ': ', $file_id, $error;
# send email to admin:
my %mail = (
config => $config,
subject => 'Genomics xml validation failure',
message => $message,
); # p \%mail; p \@recipients;
$tools->send_mail(\%mail, \@recipients);
}
#-------------------------------------------------------------------------------
sub xs_options {
my %opts = (
SuppressEmpty => 0, # ? only works for xml_in
NoEscape => 0, # should be default anyway, but doesn't work
RootName => 'HILIS4Genomics',
XMLDecl => q!<?xml version="1.0" encoding="UTF-8"?>!,
KeyAttr => [],
NoAttr => 1, # gives inline (scalar) AND nested (arrayref) attributes
);
return %opts;
}
#-------------------------------------------------------------------------------
sub get_destination_sub_dir {
my $ref = shift; # p $ref;
my ($request_num, $year) = ($ref->{request_number}, $ref->{year}); # p [$year, $request_num];
my $i = int ( $request_num / 100 ); # warn $i; # 1-99 = 0, 100-199 = 1, 200-299 = 2, etc
my $mini_dir = sprintf '%s-%s',
100 * $i || 1, # default to 1 if 0; 1, 100, 200, 300, etc
100 * $i + 99; # 99, 199, 299, etc
my $app_dir = path($tools->path_to_app_root)->realpath; # p $app_dir;
my $rel_path = 'static/files/genomics';
my $destination_dir = join '/', # eg /path/to/uploads/2013
$app_dir,
$rel_path,
$year,
$mini_dir,
$request_num; # p $destination_dir;
return $destination_dir;
}
sub remove_undef_values { # based on kaitlyn's transform_datetime_to_string() below:
my $r = shift; # p $r;
return unless ref $r eq 'HASH';
for my $key( keys %$r ) { # p [$key, $r->{$key}];
if ( ref $r->{$key} eq 'HASH' ) { # recursive call for hashrefs
remove_undef_values( $r->{$key} );
}
defined $r->{$_} || delete $r->{$_} for keys %$r; # p $r;
}
}
sub update_participant_id {
my $req = shift; # p $req;
my $patient_case_id = $req->{patient_case_id}; # p $patient_case_id;
my $participant_id = $req->{participant_id}; # p $participant_id;
my $request_id = $req->{request_id};
my $result = $dbix->update('patient_case',
{ unit_number => $participant_id }, { id => $patient_case_id });
if ($result->rows) { # patient_case updated:
my $user_id =
$dbix->select('users', ['id'], { username => 'crontab' })->value;
my %h = (
request_id => $request_id,
user_id => $user_id,
action => 'auto-set participant ID',
);
$dbix->insert('request_history', \%h);
}
else {
warn sprintf "participant id %s failed to update on request id %s",
$participant_id, $request_id;
}
}
sub format_msg {
my $str = shift;
my $app_ver = get_app_version();
my $msg = sprintf "%s [%s]: %s\n",
$timestamp->strftime('%Y-%m-%d %T'), $app_ver, $str;
return $msg;
}
sub get_app_version {
chdir '/home/raj/apps/HILIS4';
chomp( my $raw_time = `/usr/bin/git log -1 --format=%cd --date=raw` ); # warn $raw_time; # cmd appends new-line
# extract epoch seconds eg 1455628027 +0000 [GMT], 1460022637 +0100 [BST], etc:
my ($epoch) = $raw_time =~ /^(\d+)\s\+0[01]00/ or # don't use '||' here - gets truth
die "could'nt extract epoch time from `git log` output"; # warn $epoch;
my $vnumber = 999 + `/usr/bin/git rev-list HEAD --count`; # as 1st in git repo = svn #1000
return sprintf '%.4f', 4 + ( $vnumber / 10000 ); # ensure 4 digits
}
sub _get_query_for_request_ids {
my $sql = $SQL{request_ids};
# count number of placeholders in $sql:
my $n = () = $sql =~ /\?/g; # p $n;
# bind one $duration per placeholder:
my @bind = map $duration, (1..$n); # p \@bind;
return ($sql, @bind);
}
sub _matches { Local::Utils::matches(@_) }
sub _trim {
my $data = shift; # p $data;
KEY:
for my $key ( keys %$data ) { # say "$key: $data->{$key}"; # p $data->{$key};
if ( ref($data->{$key}) eq 'HASH' ) { # recursive call for hashrefs
# warn $data->{$key};
_trim( $data->{$key} ); # p $r->{$key};
next KEY; # not sure why this is necessary ??
} # say "$key: $data->{$key}";
# remove leading/trailing space chars (incl. unicode 160) from string:
$data->{$key} =~ s/^\p{Space}//g;
$data->{$key} =~ s/\p{Space}$//g; # p $data->{$key};
} # p $data;
return 0; # no need - modifying hashref in situ
}
=begin # unused methods:
# map of lab-section => test name => XML element name
sub get_lab_test_to_xml_element_map { # only needed if using xml element names
my @data = <DATA>; # p @data;
my %map;
for (@data) {
chomp; # say $_;
my ($section, $test_name, $element_name) = split ':', $_;
# p [$section, $test_name, $element_name];
next unless $section;
$map{$section}{$test_name} = $element_name;
} # p \%map;
return \%map;
}
# formats repeating request blocks (only needed if using xml element names):
sub format_request {
my $r = shift; # p $r;
my %data;
{ # registration/demographics:
my $ref = $r->{Demographics};
my %h = (
nhs_number => $ref->{nhs_number},
date_of_birth => $ref->{dob},
last_name => $ref->{last_name},
first_name => $ref->{first_name},
gender => $ref->{gender},
);
$data{Registration} = \%h;
} # p \%data;
return \%data;
}
# for use if date(time) value is an object:
sub transform_datetime_to_string {
my $r = shift;
return unless ref $r eq 'HASH';
for my $key ( keys %$r ) {
if ( ref $r->{$key} eq 'HASH' ) { # recursive call for hashrefs
transform_datetime_to_string( $r->{$key} );
}
if ( ref $r->{$key} =~ /^DateTime/ ) {
$r->{$key} = sprintf "%s", $r->{$key}; # quote to force to string
}
}
}
=cut
__DATA__
CONSENT:consent_date:date-of-consent
CONSENT:consent_form_version:name-and-version-of-consent-form
CONSENT:consent_taken:consent-given
CONSENT:info_sheet_version:name-and-version-of-participant-information-sheet
CONSENT:consent_q1:consent-question-1
CONSENT:consent_q2:consent-question-2
CONSENT:consent_q3:consent-question-3
CONSENT:consent_q4:consent-question-4
CONSENT:consent_q5:consent-question-5
CONSENT:consent_q6:consent-question-6
CONSENT:consent_q7:consent-question-7
CONSENT:consent_q8:consent-question-8
CONSENT:consent_q9:consent-question-9
CONSENT:consent_q10:consent-question-10
CONSENT:consent_q11:consent-question-11
CONSENT:consent_q12:consent-question-12
CONSENT:consent_q13:consent-question-13
CONSENT:consent_q14:consent-question-14
SPECIMENS:edta1:DNA Blood Germline
SPECIMENS:pst:LiHep Plasma
SPECIMENS:paxgene_rna:RNA Blood
SPECIMENS:sst:Serum
SPECIMENS:handling_protocol:Laboratory Method
STORAGE:vial_id:Laboratory Sample ID
STORAGE:rack_id:GMC Rack ID
STORAGE:rack_location:GMC Rack Well
DNA:edta1_qc_date:Test Result DateTime
DNA:edta1_qc_type:Test Result Type
DNA:edta1_qc_result:Test Result Value
DNA:edta1_fluidx:Laboratory Sample ID
DNA:edta1_fluidx_vol:Laboratory Sample Volume
DNA:edta1_fluidx_rack_id:GMC Rack ID
DNA:edta1_fluidx_rack_well:GMC Rack Well
DNA:edta1_fluidx_vol:Laboratory Remaining Volume Banked
DISPATCH:consignment_number:GMC Sample Consignment Number
DISPATCH:dna_dispatched:GMC Sample Dispatch Date
DISPATCH:omics_dispatched:GMC Sample Dispatch Date