#!/usr/bin/perl # NB: this file is symlinked from crons/weekly/genomics_validation.pl =begin validates xml from genomics requests; uses same validation function as genomics_xml.pl doesn't save xml file, or update any database table, or push to any remote location can also be run from cron (requires symlink) to email results - pass -m flag as arg don't run at midnight - takes several minutes to complete pass -t flag with -m to dump results to file and skip email pass -q flag to skip 'validated OK' output =cut use lib ( '/home/raj/perl5/lib/perl5', '/home/raj/apps/HILIS4/lib', '/home/raj/perl-lib', # Local::XMLSimple - patched to escape single-apostrophe ); use Getopt::Std; getopts('mtqd:n:r:x:'); our( $opt_d, # (d)atabase $opt_m, # (m)essage - cron request $opt_t, # (t)esting $opt_q, # (q)uiet (skip OK's) $opt_r, # (r)equest_id $opt_n, # mi(n) request_id $opt_x, # ma(x) request_id ); # warn $opt_d; exit; #=============================================================================== my @recipients = ('raj','sarah.fitzgerald'); # only used if -m flag passed from cron my $JUST_TESTING = $opt_t || 0; # can't pass undef to $tools->test_only my $cron_request = $opt_m; my $skip_valid = $opt_q; # don't print "validated OK" my $database = $opt_d || 'genomics'; # say $database; exit; #=============================================================================== use Data::Printer alias => 'p', use_prototypes => 0; use Spreadsheet::WriteExcel::Simple; use LIMS::Local::GenomicsValidation; use LIMS::Local::ScriptHelpers; use Local::XMLSimple qw(:strict); # need to specify ForceArray & KeyAttr if using :strict here use Encoding::FixLatin qw/fix_latin/; use FindBin qw($Bin); # warn $Bin; exit; use Data::Dumper; use Modern::Perl; use Path::Tiny; use Local::DB; use IO::All; use charnames ':full'; # for perl < 5.16 - to use unicode names (eg APOSTROPHE) my $app_dir = $cron_request # symlink from crons/weekly dir ? path($Bin . '/../../..')->realpath : path($Bin . '/..')->realpath; # warn $app_dir; my $cfg_file = $app_dir . '/script/crons/lib/settings.cfg'; # p $cfg_file; my $xsd_src = $app_dir . '/setup/schema/xsd/genomics_v3.2.xsd'; # p $xsd_src; my $timestamp = LIMS::Local::Utils::time_now(); my $settings = Config::Auto::parse($cfg_file); # p $settings; exit; my $tools = LIMS::Local::ScriptHelpers->new(); my $dbix = Local::DB->dbix({ dbname => $database }); my $genomics_cfg = $settings->{genomics}; # p $genomics_cfg; my $genomics_uat = $genomics_cfg->{uat}; # p $genomics_uat; exit; my $sql_lib = $tools->sql_lib(); my $config = $tools->config(); $tools->test_only($JUST_TESTING); # for email via cron use only my $validator = LIMS::Local::GenomicsValidation ->new(dbix => $dbix, cfg => $settings, xsd_src => $xsd_src); # set some package vars (auto-stringify datetime objects & don't log queries): $Local::DBIx::Simple::Result::STRINGIFY_DATETIME = 1; # as we hand $data straight to XMLout() my %SQL = ( demographics => $sql_lib->retr('genomics_demographics'), storage => $sql_lib->retr('genomics_storage'), results => $sql_lib->retr('genomics_results'), consent => $sql_lib->retr('genomics_consent'), ); my @lab_sections = do { my @args = ( 'lab_sections', ['section_name'], { is_active => 'yes' } ); $dbix->select(@args)->column; # array }; # p \@lab_sections; # get request ids: my @request_ids = $dbix->select('requests', 'id')->column; # p \@request_ids; exit; my @validation_errs; my %referral_locations; # for xl file output { # go: # get requests 1-to-1 data: my @requests = $dbix->query( $SQL{demographics}, @request_ids )->hashes; # p \@requests; exit; REQUEST: for my $req (@requests) { # p $req; # get results data: my $request_id = $req->{request_id}; if ($opt_r) { next unless $request_id == $opt_r } if ($opt_n) { next unless $request_id >= $opt_n } if ($opt_x) { last if $request_id > $opt_x } # allocate GeL/participant id if screened and not already done: if ( $req->{participant_id} eq 'UNKNOWN' ) { # unit number still at default if ( my $arm = $req->{arm} ) { # if screened my $base_id = $genomics_uat->{rare_disease}; # 122_000_000 # add cancer increment if screened as cancer (ie to make 223_000_000) $base_id += $genomics_uat->{cancer_incr} if lc $arm eq 'cancer'; $req->{participant_id} = $base_id + $request_id; } else { delete $req->{participant_id}; # will be invalid xml entry 'UNKNOWN' } } my %req_fields = map +($_ => $req->{$_}), # skip unrequired fields: grep { $_ !~ /\b(year|request_number|patient_case_id)\b/ } # skip undef vals to avoid uninitialized val warns from XML::Simple grep defined $req->{$_}, keys %$req; # p \%req_fields; # remove trailing 'cancer' from disease_type_registration: $req_fields{disease_type_registration} =~ s/\scancer\Z//i if $req_fields{disease_type_registration}; # only exists for cancer # lab number: $req_fields{lab_number} = join '/', $req->{request_number}, $req->{year} - 2000; # p \%req_fields; # build data structure for this request: my %h = ( Demographics => \%req_fields ); # p \%h; SECTION: for my $section_name ( @lab_sections ) { # p $section_name; my @data = do { my @bind = ( $request_id, $section_name ); $dbix->query( $SQL{results}, @bind )->hashes; }; # p \@data; # remove any non-word chars from section name -> CamelCase: my $section_element_name = join '', map ucfirst $_, grep $_ !~ /\d/, split '\W', $section_name; # p $section_element_name; RESULT: # each test/result pair for this section (query ensures no undef vals) for my $ref (@data) { # hashref of keys: test_name & result my $test_name = $ref->{test_name}; $h{$section_element_name}{$test_name} = _escape($ref->{result}); } } # fluidx storage (1-to-many with request_id): if ( my @data = $dbix->query( $SQL{storage}, $request_id )->hashes ) { # ensure no undef values or XML::Simple v2.21+ issues uninitialized value warnings: remove_undef_values($_) for @data; $h{Storage}{Vials}{vial} = \@data; # p \@data; } { # consent taken & withdrawn - from request_lab_test_history file: my $data = $dbix->query( $SQL{consent}, $request_id ) ->map_hashes('action'); if ( my $event = $data->{consent_given} ) { my @user = @{$event}{ qw/first_name last_name/ }; $h{Consent}{consent_taken_by} = join ' ', map ucfirst $_, @user; } if ( my $event = $data->{consent_withdrawn} ) { # infrequently used: my @user = @{$event}{ qw/first_name last_name/ }; $h{ConsentWithdrawal}{withdrawal_taken_by} = join ' ', map ucfirst $_, @user; } } # p \%h; { # consent form filename (need to allow for >1 file): my $filepath = get_destination_sub_dir($req); # p $filepath; if ( -e $filepath ) { my @contents = io($filepath)->all; FILE: for my $f (@contents) { # p $f; next FILE unless $f->type eq 'file'; # remove non-latin chars from filename: my $filename = join '~', $request_id, _escape($f->filename); # add as array in case >1 file in dir push @{ $h{Consent}{Files}{file} }, $filename; } } } # p \%h; my $lab_no = $req_fields{lab_number}; $referral_locations{$lab_no} = $req->{referral_source}; # validate params before creating xml: if ( my $errs = $validator->validate_params(\%h) ) { # p $errs; # arrayref for my $err(@$errs) { my $str = $lab_no . ' ' . $err; $cron_request ? push @validation_errs, $str : say $str; } next REQUEST; } # p \%h; # create xml: my $xml = to_xml(\%h); # say $xml; # validate xml, returns XML::SAX::Exception object on failure, undef on success: if ( my $error = $validator->validate_xml_against_xsd($xml) ) { if ( $error =~ m!element <.*>! ) { # replace with label & section $error = $validator->reformat_error($error); } my $str = join ' ', $lab_no, $error; $cron_request ? push @validation_errs, $str : say $str; } else { next REQUEST if $cron_request || $skip_valid; # not interested in valid records my $str = $lab_no . ' validated OK'; say $str; } } } # if cron request to email errors ($cron_request is 'true'): if (@validation_errs) { # my $message = join "\n", @validation_errs; p $message; my $filename = 'genomics_validation_failures.xls'; my $subject = 'Genomics requests validation failures'; my @headers = qw(labno source section field result failed reason); my $xl = Spreadsheet::WriteExcel::Simple->new; $xl->write_bold_row(\@headers); for (@validation_errs) { s/'//g; # strip quote marks to simplify regex # p $_; next; my ($labno, $result, $field, $section, $error) = # don't capture preceding "value" eg: # Illegal value 'WF2342' for 'Unit number'; does capture "NULL value" m!^(\d+/\d+) illegal(?: value)? (.*) for (.*) in (.*) section, (.*)!i; # p [$labno, $result, $field, $section, $error]; # which GenomicsValidation method failed it - # validate_xml_against_xsd() XML::SAX parser or regexes in validate_params(): my ($failed) = $error =~ /required pattern/ ? 'xml' : 'data'; # vs "required definition" my $location = $referral_locations{$labno}; my @data = ( $labno, $location, $section, $field, $result, $failed, $error ); # p \@data; $xl->write_row(\@data); } my %mail = ( config => $config, subject => $subject, attachment => $xl->data, filename => $filename, ); $JUST_TESTING ? $xl->save($Bin . '/genomics_validation.xls') : $tools->send_mail(\%mail, \@recipients); } #------------------------------------------------------------------------------- sub to_xml { my $data = shift; # arrayref of request datasets # remove leading/trailing white-space (eg introduced by pasting from excel): _trim($data); # options for XMLout (needs array or hash): my %xs_opts = xs_options(); # p %xs_opts; my $xs = Local::XMLSimple->new(%xs_opts); my $ref = { FileCreationDateTime => $timestamp->datetime, # require string # add direct if NOT using xml element names, otherwise use format_request(): # Record => format_request($data), Record => $data, }; # p $ref; # enclose xml in outer block; add 'version' inline: # my $input = { add => $ref, version => 2 }; my $xml_out = $xs->XMLout($ref); # p $xml_out; return $xml_out; } #------------------------------------------------------------------------------- sub xs_options { my %opts = ( SuppressEmpty => 0, # ? only works for xml_in NoEscape => 0, # should be default anyway, but doesn't work RootName => 'HILIS4Genomics', XMLDecl => q!!, KeyAttr => [], NoAttr => 1, # gives inline (scalar) AND nested (arrayref) attributes ); return %opts; } #------------------------------------------------------------------------------- sub get_destination_sub_dir { my $ref = shift; # p $ref; my ($request_num, $year) = ($ref->{request_number}, $ref->{year}); # p [$year, $request_num]; my $i = int ( $request_num / 100 ); # warn $i; # 1-99 = 0, 100-199 = 1, 200-299 = 2, etc my $mini_dir = sprintf '%s-%s', 100 * $i || 1, # default to 1 if 0; 1, 100, 200, 300, etc 100 * $i + 99; # 99, 199, 299, etc my $app_dir = path($tools->path_to_app_root)->realpath; # p $app_dir; my $rel_path = 'static/files/genomics'; my $destination_dir = join '/', # eg /path/to/uploads/2013 $app_dir, $rel_path, $year, $mini_dir, $request_num; # p $destination_dir; return $destination_dir; } #------------------------------------------------------------------------------- sub remove_undef_values { # based on kaitlyn's transform_datetime_to_string() below: my $r = shift; # p $r; return unless ref $r eq 'HASH'; for my $key( keys %$r ) { # p [$key, $r->{$key}]; if ( ref $r->{$key} eq 'HASH' ) { # recursive call for hashrefs remove_undef_values( $r->{$key} ); } defined $r->{$_} || delete $r->{$_} for keys %$r; # p $r; } } sub _escape { # substitute incompatible chars: my $str = shift; # p fix_latin($str); # takes mixed encoding input and produces UTF-8 output (fixes \x{92} apostrophe): return fix_latin($str); } sub _trim { my $data = shift; # p $data; KEY: for my $key ( keys %$data ) { # say "$key: $data->{$key}"; # p $data->{$key}; if ( ref($data->{$key}) eq 'HASH' ) { # recursive call for hashrefs # warn $data->{$key}; _trim( $data->{$key} ); # p $r->{$key}; next KEY; # not sure why this is necessary ?? } # say "$key: $data->{$key}"; # remove leading/trailing space chars (incl. unicode 160) from string: $data->{$key} =~ s/^\p{Space}//g; $data->{$key} =~ s/\p{Space}$//g; # p $data->{$key}; } # p $data; return 0; # no need - modifying hashref in situ }