#!/usr/bin/perl
=begin
validates xml from genomics requests; uses same validation function as genomics_xml.pl
doesn't save xml file, or update any database table, or push to any remote location
can also be run from cron (requires symlink) to email results - pass -m flag as arg
don't run at midnight - takes several minutes to complete
pass -t flag with -m to dump results to file and skip email
pass -q flag to skip 'validated OK' output
=cut
use lib (
'/home/raj/perl5/lib/perl5',
'/home/raj/apps/HILIS4/lib',
'/home/raj/perl-lib', # Local::XMLSimple - patched to escape single-apostrophe
);
use Getopt::Std;
getopts('mtqd:'); # (m)essage - cron request; (d)atabase; (t)esting; (q)uiet (skip OK's)
our($opt_d, $opt_m, $opt_t, $opt_q); # warn $opt_d; exit;
#===============================================================================
my @recipients = ('raj'); # only used if -m flag passed from cron
my $JUST_TESTING = $opt_t || 0; # can't pass undef to $tools->test_only
my $cron_request = $opt_m;
my $skip_valid = $opt_q; # don't print "validated OK"
my $database = $opt_d || 'genomics'; # say $database; exit;
#===============================================================================
use Spreadsheet::WriteExcel::Simple;
use LIMS::Local::GenomicsValidation;
use LIMS::Local::ScriptHelpers;
use Local::XMLSimple qw(:strict); # need to specify ForceArray & KeyAttr if using :strict here
use Data::Printer alias => 'p', use_prototypes => 0;
use FindBin qw($Bin); # warn $Bin; exit;
use Data::Dumper;
use Modern::Perl;
use Path::Tiny;
use Local::DB;
use IO::All;
my $app_dir = $cron_request # symlink from crons/weekly dir
? path($Bin . '/../../..')->realpath
: path($Bin . '/..')->realpath; # warn $app_dir;
my $cfg_file = $app_dir . '/script/crons/lib/settings.cfg'; # p $cfg_file;
my $xsd_src = $app_dir . '/setup/schema/xsd/genomics_v3.2.xsd'; # p $xsd_src;
my $timestamp = LIMS::Local::Utils::time_now();
my $settings = Config::Auto::parse($cfg_file); # p $settings; exit;
my $tools = LIMS::Local::ScriptHelpers->new();
my $dbix = Local::DB->dbix({ dbname => $database });
my $genomics_cfg = $settings->{genomics}; # p $genomics_cfg;
my $genomics_uat = $genomics_cfg->{uat}; # p $genomics_uat; exit;
my $sql_lib = $tools->sql_lib();
my $config = $tools->config();
$tools->test_only($JUST_TESTING); # for email via cron use only
my $validator = LIMS::Local::GenomicsValidation
->new(dbix => $dbix, cfg => $settings, xsd_src => $xsd_src);
# set some package vars (auto-stringify datetime objects & don't log queries):
$Local::QueryLogger::NO_QUERY_LOGS = 1; # don't need queries in logs dir
$Local::DBIx::Simple::Result::STRINGIFY_DATETIME = 1; # as we hand $data straight to XMLout()
my %SQL = (
demographics => $sql_lib->retr('genomics_demographics'),
storage => $sql_lib->retr('genomics_storage'),
results => $sql_lib->retr('genomics_results'),
consent => $sql_lib->retr('genomics_consent'),
);
my @lab_sections = do {
my @args = ( 'lab_sections', ['section_name'], { is_active => 'yes' } );
$dbix->select(@args)->column; # array
}; # p \@lab_sections;
# get request ids:
my @request_ids = $dbix->select('requests', 'id')->column; # p \@request_ids; exit;
my @validation_errs;
{ # go:
# get requests 1-to-1 data:
my @requests = $dbix->query( $SQL{demographics}, @request_ids )->hashes;
# p \@requests; exit;
REQUEST:
for my $req (@requests) { # p $req; # get results data:
my $request_id = $req->{request_id};
# next unless grep $req->{request_number} == $_, (38, 40);
# allocate GeL/participant id if screened and not already done:
if ( $req->{participant_id} eq 'UNKNOWN' ) { # unit number still at default
if ( my $arm = $req->{arm} ) { # if screened
my $base_id = $genomics_uat->{rare_disease}; # 122_000_000
# add cancer increment if screened as cancer (ie to make 223_000_000)
$base_id += $genomics_uat->{cancer_incr} if lc $arm eq 'cancer';
$req->{participant_id} = $base_id + $request_id;
}
else {
delete $req->{participant_id}; # will be invalid xml entry 'UNKNOWN'
}
}
my %req_fields = map +($_ => $req->{$_}), # skip unrequired fields:
grep { $_ !~ /\b(year|request_number|patient_case_id)\b/ }
# skip undef vals to avoid uninitialized val warns from XML::Simple
grep defined $req->{$_}, keys %$req; # p \%req_fields;
# remove trailing 'cancer' from disease_type_registration:
$req_fields{disease_type_registration} =~ s/\scancer\Z//i
if $req_fields{disease_type_registration}; # only exists for cancer
# lab number:
$req_fields{lab_number} = join '/',
$req->{request_number}, $req->{year} - 2000; # p \%req_fields;
# build data structure for this request:
my %h = ( Demographics => \%req_fields ); # p \%h;
SECTION:
for my $section_name ( @lab_sections ) { # p $section_name;
my @data = do {
my @bind = ( $request_id, $section_name );
$dbix->query( $SQL{results}, @bind )->hashes;
}; # p \@data;
# remove any non-word chars from section name -> CamelCase:
my $section_element_name = join '', map ucfirst $_, grep $_ !~ /\d/,
split '\W', $section_name; # p $section_element_name;
RESULT: # each test/result pair for this section (query ensures no undef vals)
for my $ref (@data) { # hashref of keys: test_name & result
my $test_name = $ref->{test_name};
$h{$section_element_name}{$test_name} = $ref->{result};
}
}
# fluidx storage (1-to-many with request_id):
if ( my @data = $dbix->query( $SQL{storage}, $request_id )->hashes ) {
# ensure no undef values or XML::Simple v2.21+ issues uninitialized value warnings:
remove_undef_values($_) for @data;
$h{Storage}{Vials}{vial} = \@data; # p \@data;
}
{ # consent taken & withdrawn - from request_lab_test_history file:
my $data = $dbix->query( $SQL{consent}, $request_id )
->map_hashes('action');
if ( my $event = $data->{consent_given} ) {
my @user = @{$event}{ qw/first_name last_name/ };
$h{Consent}{consent_taken_by}
= join ' ', map ucfirst $_, @user;
}
if ( my $event = $data->{consent_withdrawn} ) { # infrequently used:
my @user = @{$event}{ qw/first_name last_name/ };
$h{ConsentWithdrawal}{withdrawal_taken_by}
= join ' ', map ucfirst $_, @user;
}
} # p \%h;
{ # consent form filename (need to allow for >1 file):
my $filepath = get_destination_sub_dir($req); # p $filepath;
if ( -e $filepath ) {
my @contents = io($filepath)->all;
FILE:
for my $f (@contents) { # p $f;
next FILE unless $f->type eq 'file';
my $filename = join '~', $request_id, $f->filename;
# add as array in case >1 file in dir
push @{ $h{Consent}{Files}{file} }, $filename;
}
}
} # p \%h;
my $lab_no = $req_fields{lab_number};
# validate params before creating xml:
if ( my $errs = $validator->validate_params(\%h) ) { # p $errs; # arrayref
for my $err(@$errs) {
my $str = $lab_no . ' ' . $err;
$cron_request
? push @validation_errs, $str
: say $str;
}
next REQUEST;
} # p \%h;
# create xml:
my $xml = to_xml(\%h); # say $xml;
# validate xml, returns XML::SAX::Exception object on failure, undef on success:
if ( my $error = $validator->validate_xml_against_xsd($xml) ) {
if ( $error =~ m!element <.*>! ) { # replace <foo> with label & section
$error = $validator->reformat_error($error);
}
my $str = join ' ', $lab_no, $error;
$cron_request
? push @validation_errs, $str
: say $str;
}
else {
next REQUEST if $cron_request || $skip_valid; # not interested in valid records
my $str = $lab_no . ' validated OK';
say $str;
}
}
}
# if cron request to email errors ($cron_request is 'true'):
if (@validation_errs) {
# my $message = join "\n", @validation_errs; p $message;
my $filename = 'genomics_validation_failures.xls';
my $subject = 'Genomics requests validation failures';
my @headers = qw(labno section field result failed reason);
my $xl = Spreadsheet::WriteExcel::Simple->new;
$xl->write_bold_row(\@headers);
for (@validation_errs) {
my ($labno, $result, $field, $section, $error) =
/^(\d+\/\d+).*\'(.*)\' for \'(.*)\' in \'(.*)\' section, (.*)/;
# which GenomicsValidation method failed it -
# validate_xml_against_xsd() XML::SAX parser or regexes in validate_params():
my ($failed) = $error =~ /required pattern/ ? 'xml' : 'data'; # vs "required definition"
my @data = ( $labno, $section, $field, $result, $failed, $error ); # p \@data;
$xl->write_row(\@data);
}
my %mail = (
config => $config,
subject => $subject,
attachment => $xl->data,
filename => $filename,
);
$JUST_TESTING
? $xl->save($Bin . '/genomics_validation.xls')
: $tools->send_mail(\%mail, \@recipients);
}
#-------------------------------------------------------------------------------
sub to_xml {
my $data = shift; # arrayref of request datasets
# remove leading/trailing white-space (eg introduced by pasting from excel):
_trim($data);
# options for XMLout (needs array or hash):
my %xs_opts = xs_options(); # p %xs_opts;
my $xs = Local::XMLSimple->new(%xs_opts);
my $ref = {
FileCreationDateTime => $timestamp->datetime, # require string
# add direct if NOT using xml element names, otherwise use format_request():
# Record => format_request($data),
Record => $data,
}; # p $ref;
# enclose xml in outer <add> block; add 'version' inline:
# my $input = { add => $ref, version => 2 };
my $xml_out = $xs->XMLout($ref); # p $xml_out;
return $xml_out;
}
#-------------------------------------------------------------------------------
sub xs_options {
my %opts = (
SuppressEmpty => 0, # ? only works for xml_in
NoEscape => 0, # should be default anyway, but doesn't work
RootName => 'HILIS4Genomics',
XMLDecl => q!<?xml version="1.0" encoding="UTF-8"?>!,
KeyAttr => [],
NoAttr => 1, # gives inline (scalar) AND nested (arrayref) attributes
);
return %opts;
}
#-------------------------------------------------------------------------------
sub get_destination_sub_dir {
my $ref = shift; # p $ref;
my ($request_num, $year) = ($ref->{request_number}, $ref->{year}); # p [$year, $request_num];
my $i = int ( $request_num / 100 ); # warn $i; # 1-99 = 0, 100-199 = 1, 200-299 = 2, etc
my $mini_dir = sprintf '%s-%s',
100 * $i || 1, # default to 1 if 0; 1, 100, 200, 300, etc
100 * $i + 99; # 99, 199, 299, etc
my $app_dir = path($tools->path_to_app_root)->realpath; # p $app_dir;
my $rel_path = 'static/files/genomics';
my $destination_dir = join '/', # eg /path/to/uploads/2013
$app_dir,
$rel_path,
$year,
$mini_dir,
$request_num; # p $destination_dir;
return $destination_dir;
}
sub remove_undef_values { # based on kaitlyn's transform_datetime_to_string() below:
my $r = shift; # p $r;
return unless ref $r eq 'HASH';
for my $key( keys %$r ) { # p [$key, $r->{$key}];
if ( ref $r->{$key} eq 'HASH' ) { # recursive call for hashrefs
remove_undef_values( $r->{$key} );
}
defined $r->{$_} || delete $r->{$_} for keys %$r; # p $r;
}
}
sub _trim {
my $data = shift; # p $data;
KEY:
for my $key ( keys %$data ) { # say "$key: $data->{$key}"; # p $data->{$key};
if ( ref($data->{$key}) eq 'HASH' ) { # recursive call for hashrefs
# warn $data->{$key};
_trim( $data->{$key} ); # p $r->{$key};
next KEY; # not sure why this is necessary ??
} # say "$key: $data->{$key}";
# remove leading/trailing space chars (incl. unicode 160) from string:
$data->{$key} =~ s/^\p{Space}//g;
$data->{$key} =~ s/\p{Space}$//g; # p $data->{$key};
} # p $data;
return 0; # no need - modifying hashref in situ
}