#!/usr/bin/perl =begin -------------------------------------------------------------------------------- * generates XML data file for genomics data, ftp transfer, archive as gzip * also transfers consent forms (or any file in request dir) with ctime within MAX_AGE * run from cron - set duration in arg eg "genomics_xml.pl -t 300" * on test server only: run as root/www-data or directory read permissions denied * set ppm_server -> remote_addr -> localhost in settings.cfg to auto-set $JUST_TESTING * validated using XML::Validator::Schema only supports 'unqualified' attributeFormDefault and elementFormDefault does not recognise xs:long for nhs_number - use xs:positiveInteger instead * on ftp file transfer failure, xml/pdf files retained in /tmp, also xml files in /backups/genomics, also can run script manually passing -t seconds from 1st ftp transfer failure -------------------------------------------------------------------------------- =cut BEGIN { use constant MAX_AGE => 3600; # default if no value passed as -t use Getopt::Std; getopts('t:d:q'); # time (seconds), database (set by .t), sql trace our($opt_t,$opt_d,$opt_q); $ENV{SQL_TRACE} = $opt_q; } # warn $opt_d; warn $opt_t; exit; #=============================================================================== my $duration = $opt_t || MAX_AGE; # warn $duration; # seconds my $database = $opt_d || 'genomics'; # warn $database; my $JUST_TESTING = 0; # skips file transfer (is auto-set to 1 for dev) my %contacts = ( sheffield => [ 'freyja.docherty', 'tamar.kammin', ], common => [ 'denise.hancock', 'bridget.morgan', 'raj', ], leeds => [], # just use 'common' contacts ); my $xsd = 'genomics.xsd'; # current xsd - synlinked to most recent file #=============================================================================== use lib ( '/home/raj/perl5/lib/perl5', '/home/raj/apps/HILIS4/lib', '/home/raj/perl-lib', # Local::XMLSimple - patched to escape single-apostrophe ); use IO::Compress::Gzip qw(gzip $GzipError); use LIMS::Local::GenomicsValidation; use LIMS::Local::ScriptHelpers; use Local::XMLSimple qw(:strict); # need to specify ForceArray & KeyAttr if using :strict here use Data::Printer alias => 'p', use_prototypes => 0; use FindBin qw($Bin); # warn $Bin; exit; use File::Basename; use Data::Dumper; use Modern::Perl; use Local::Utils; use Path::Tiny; use Local::DB; use IO::All; my $cfg_file = $Bin . '/../lib/settings.cfg'; # p $cfg_file; my $settings = Config::Auto::parse($cfg_file); # p $settings; my $time_now = Local::Utils::time_now; my $xsd_src = $Bin . '/../../../setup/schema/xsd/' . $xsd; # p $xsd_src; my $ppm_settings = $settings->{ppm_server}; # p $ppm_settings; my $genomics_cfg = $settings->{genomics}; # p $genomics_cfg; my $genomics_uat = $genomics_cfg->{uat}; # p $genomics_uat; exit; #=============================================================================== # override $JUST_TESTING if test script or dev box: $JUST_TESTING ||= ( $ENV{HARNESS_ACTIVE} || $ppm_settings->{remote_address} eq 'localhost' ); # warn $JUST_TESTING; #=============================================================================== # set some package vars (auto-stringify datetime objects & don't log queries): $Local::QueryLogger::NO_QUERY_LOGS = 1; # don't need queries in logs dir $Local::DBIx::Simple::Result::STRINGIFY_DATETIME = 1; # as we hand $data straight to XMLout() # recent XML::Simple version (2.21) outputs uninitialized warnings on undef fields: # no warnings "XML::Simple"; # kills hmds_test - need to delete undefs in data blocks # get tools from LIMS::Local::ScriptHelpers: my $tools = LIMS::Local::ScriptHelpers->new(); my $dbix = Local::DB->dbix({ dbname => $database }); my $sql_lib = $tools->sql_lib(); my $config = $tools->config(); # for email validation failure my $validator = LIMS::Local::GenomicsValidation ->new(dbix => $dbix, cfg => $settings, xsd_src => $xsd_src); my %SQL = ( demographics => $sql_lib->retr('genomics_demographics'), # unit_number => $sql_lib->retr('genomics_unit_number'), # value loaded in demograhics request_ids => $sql_lib->retr('genomics_requests'), storage => $sql_lib->retr('genomics_storage'), results => $sql_lib->retr('genomics_results'), consent => $sql_lib->retr('genomics_consent'), ); # get request ids: my @request_ids = do { my ($sql, @bind) = _get_query_for_request_ids(); # p [$sql, @bind]; $dbix->query( $sql, @bind )->column; # new Local::DBIx::Simple::Result method }; # p \@request_ids; exit; #=============================================================================== exit unless @request_ids; #=============================================================================== my $timestamp = LIMS::Local::Utils::time_now(); use constant TMP_DIR => '/tmp'; # where to create temp data file use constant LOGFILE => '/home/raj/crons/genomics_transfer.log'; my @lab_sections = do { my @args = ( 'lab_sections', ['section_name'], { is_active => 'yes' } ); $dbix->select(@args)->column; # array }; # p \@lab_sections; # only need this if using xml element names for data feed: # my $test_element_map = get_lab_test_to_xml_element_map(); # p $test_element_map; exit; { # go: # get requests 1-to-1 data: my @requests = $dbix->query( $SQL{demographics}, @request_ids )->hashes; # p @requests; REQUEST: for my $req (@requests) { # p $req; # get results data: my $request_id = $req->{request_id}; # allocate GeL/participant id if screened and not already done: if ( my $arm = $req->{arm} ) { # if screened my $gel_id = $genomics_uat->{rare_disease} + $request_id; # 122_000_000 + req_id # add increment if screened as cancer (ie to make 223_000_000 + req_id) $gel_id += $genomics_uat->{cancer_incr} if $arm =~ /cancer/i; # p $gel_id; # update unit_number -> participant_id if not already: unless ( _matches($req->{participant_id}, $gel_id) ) { $req->{participant_id} = $gel_id; update_participant_id($req); } } else { # will be invalid xml entry 'UNKNOWN' so delete: delete $req->{participant_id}; } my %req_fields = map +($_ => $req->{$_}), # skip unrequired fields: grep { $_ !~ /\b(year|request_number|patient_case_id)\b/ } # skip undef vals to avoid uninitialized val warns from XML::Simple grep defined $req->{$_}, keys %$req; # p \%req_fields; # remove trailing 'cancer' from disease_type_registration: $req_fields{disease_type_registration} =~ s/\scancer\Z//i if $req_fields{disease_type_registration}; # only exists for cancer # lab number: $req_fields{lab_number} = join '/', $req->{request_number}, $req->{year} - 2000; # p \%req_fields; # build data structure for this request: my %h = ( Demographics => \%req_fields ); # p \%h; SECTION: for my $section_name ( @lab_sections ) { # p $section_name; # skip unless $section_name in $test_element_map keys: # my $section_element_map = $test_element_map->{uc($section_name)} # or next SECTION; # p $section_element_map; my @data = do { my @bind = ( $request_id, $section_name ); $dbix->query( $SQL{results}, @bind )->hashes; }; # p \@data; # remove any non-word chars from section name -> CamelCase: my $section_element_name = join '', map ucfirst $_, grep $_ !~ /\d/, split '\W', $section_name; # p $section_element_name; RESULT: # each test/result pair for this section (query ensures no undef vals) for my $ref (@data) { # hashref of keys: test_name & result my $test_name = $ref->{test_name}; =begin # this is only needed to map test_names to GeL xml element names: # get XML element name or skip test (not required in XML data): my $test_element_name = $section_element_map->{$test_name} or next RESULT; # p $element_name; # add element name and result to section data: $h{$section_element_name}{$test_element_name} = $ref->{result}; =cut # if NOT using GeL xml element names: $h{$section_element_name}{$test_name} = $ref->{result}; } } # fluidx storage (1-to-many with request_id): if ( my @data = $dbix->query( $SQL{storage}, $request_id )->hashes ) { # ensure no undef values or XML::Simple v2.21+ issues uninitialized value warnings: remove_undef_values($_) for @data; $h{Storage}{Vials}{vial} = \@data; # p \@data; } { # consent taken & withdrawn - from request_lab_test_history file: my $data = $dbix->query( $SQL{consent}, $request_id ) ->map_hashes('action'); if ( my $event = $data->{consent_given} ) { my @user = @{$event}{ qw/first_name last_name/ }; $h{Consent}{consent_taken_by} = join ' ', map ucfirst $_, @user; } if ( my $event = $data->{consent_withdrawn} ) { # infrequently used: my @user = @{$event}{ qw/first_name last_name/ }; $h{ConsentWithdrawal}{withdrawal_taken_by} = join ' ', map ucfirst $_, @user; } } # p \%h; { # consent form filename (need to allow for >1 file): my $filepath = get_destination_sub_dir($req); # p $filepath; if ( -e $filepath ) { my @contents = io($filepath)->all; FILE: for my $f (@contents) { # p $f; next FILE unless $f->type eq 'file'; my $filename = join '~', $request_id, $f->filename; # add as array in case >1 file in dir push @{ $h{Consent}{Files}{file} }, $filename; my $file_age = ( $timestamp->epoch - $f->ctime ); # p $file_age; # in seconds next FILE if $file_age > MAX_AGE; # p [ $f->filename, $age ]; { # file < MAX_AGE so transfer it (returns 0 if JUST_TESTING): my %args = ( local_filename => $f->name, remote_filename => $filename, ); # p \%args; ftp_file(\%args); } } } } # p \%h; # validate params for patterns that cannot be done by XML validation libs # in validate_xml_against_xsd(): if ( my $errs = $validator->validate_params(\%h) ) { # p $errs; # arrayref unless ($JUST_TESTING) { notify_admin( { data => \%req_fields, error => $_ } ) for @$errs; } else { say $req_fields{participant_id}, ' ', $_ for @$errs; } next REQUEST unless $ENV{HARNESS_ACTIVE}; # need file to be generated } { # create and validate xml file: my $content = to_xml(\%h); # say $content; # validate xml, returns XML::SAX::Exception object on failure, undef on success: if ( my $error = $validator->validate_xml_against_xsd($content) ) { # warn $error; unless ($JUST_TESTING) { my %args = ( data => \%req_fields, error => $error ); notify_admin(\%args); next REQUEST; } else { # need the xml file for genomics.t or tests fail say $req_fields{participant_id}, ' ', $error; } } my $local_name = $ENV{HARNESS_ACTIVE} ? "${database}.xml" : sprintf '%s_%s.xml', $request_id, $timestamp->strftime('%Y_%m%d_%H%M%S'); # p $local_name; my $full_path = join '/', TMP_DIR, $local_name; # p $full_path; $content > io($full_path); if ( $ENV{HARNESS_ACTIVE} ) { # dump %h to file and exit loop: io(join '/', TMP_DIR, "${database}.txt")->print(Dumper \%h); } { # transfer file (returns 0 if JUST_TESTING): my %args = ( local_filename => $full_path, remote_filename => $local_name, ascii_mode => 1, # override default binary mode ); ftp_file(\%args); } #=================================================================== next REQUEST if $JUST_TESTING; # don't archive - .xml retained in /tmp #=================================================================== { # archive in genomics_archive dir: my $archive_dir = $ppm_settings->{genomics_archive} or die 'no archive dir specified in settings.cfg'; my $archive = join '/', $archive_dir, $local_name; my $z = new IO::Compress::Gzip($archive . '.gz') or die $!; $z->write($content); $z->close(); } } } } #------------------------------------------------------------------------------- sub ftp_file { my $args = shift; my $remote_filename = $args->{remote_filename}; my $local_filename = $args->{local_filename}; # p $local_filename; my $ascii_mode = $args->{ascii_mode} || 0; # optional - only for xml files my %params = ( local_filename => $local_filename, remote_filename => $remote_filename, server_addr => $ppm_settings->{remote_address}, username => $ppm_settings->{username}, password => $ppm_settings->{password}, cwd => 'HILIS_GEL', # destination_dir for genomics data passive_mode => 1, ascii_mode => $ascii_mode, # override default binary mode in ScriptHelpers::ftp_file() ); # p \%params; #=============================================================================== return if $JUST_TESTING; # say "about to transfer $remote_filename"; #=============================================================================== { # ftp file (returns str on failure, undef on success): my $rtn = $tools->ftp_file(\%params); # p $rtn; if ($rtn) { # p $rtn; my $script = File::Basename::basename($0); my $time = $timestamp->strftime('%Y-%m-%d %T'); my $msg = join ': ', $remote_filename, $rtn; warn "$script [$time] $msg"; # dump to logs but don't die!! - no smtp on test: $tools->mail_admin({ script => $script, msg => $msg }); my $logfile_msg = format_msg($rtn); io(LOGFILE)->append($logfile_msg); } else { my $msg = format_msg($local_filename); io(LOGFILE)->append($msg); # only delete xml file, not consent forms!! my $tmp_dir = TMP_DIR; # p $tmp_dir; # TMP_DIR doesn't work used direct in regex if ( $local_filename =~ m!^$tmp_dir/! ) { # say "deleting $local_filename"; io($local_filename)->unlink; } } } } #------------------------------------------------------------------------------- sub to_xml { my $data = shift; # arrayref of request datasets # remove leading/trailing white-space (eg introduced by pasting from excel): _trim($data); # ensure data conforms to XSD: # _validate_for_xsd($data); # replaced with validate_xml_against_xsd() so vals get fixed # options for XMLout (needs array or hash): my %xs_opts = xs_options(); # p %xs_opts; my $xs = Local::XMLSimple->new(%xs_opts); my $ref = { FileCreationDateTime => $timestamp->datetime, # require string # add direct if NOT using xml element names, otherwise use format_request(): # Record => format_request($data), Record => $data, }; # p $ref; # stringify datetime objects (as XML::Simple can't): # transform_datetime_to_string($ref); using $Local::DBIx::Simple::Result::STRINGIFY_DATETIME # enclose xml in outer block; add 'version' inline: # my $input = { add => $ref, version => 2 }; my $xml_out = $xs->XMLout($ref); # p $xml_out; return $xml_out; } #------------------------------------------------------------------------------- sub notify_admin { my $args = shift; # href of data & error my $error = $args->{error}; # p $error; # stringifies as scalar my $data = $args->{data}; # p $data; # href # generate file id, either participant_id (if screened) or lab no: my $file_id = $data->{participant_id} || $data->{lab_number}; my $message = join ': ', $file_id, $error; my $centre = ( grep $data->{organisation_code} !~ /^$_/, qw/RHQ RCU/ ) ? 'sheffield' : 'leeds'; # 3 Sheffield, all others Leeds (Harrogate, Bradford, etc) my @recipients = ( @{ $contacts{$centre} }, @{ $contacts{common} } ); my $subject = sprintf 'Genomics [%s] xml validation failure', uc $centre; # send email to admin: my %mail = ( config => $config, subject => $subject, message => $message, ); # p \%mail; p \@recipients; $tools->send_mail(\%mail, \@recipients); } #------------------------------------------------------------------------------- sub xs_options { my %opts = ( SuppressEmpty => 0, # ? only works for xml_in NoEscape => 0, # should be default anyway, but doesn't work RootName => 'HILIS4Genomics', XMLDecl => q!!, KeyAttr => [], NoAttr => 1, # gives inline (scalar) AND nested (arrayref) attributes ); return %opts; } #------------------------------------------------------------------------------- sub get_destination_sub_dir { my $ref = shift; # p $ref; my ($request_num, $year) = ($ref->{request_number}, $ref->{year}); # p [$year, $request_num]; my $i = int ( $request_num / 100 ); # warn $i; # 1-99 = 0, 100-199 = 1, 200-299 = 2, etc my $mini_dir = sprintf '%s-%s', 100 * $i || 1, # default to 1 if 0; 1, 100, 200, 300, etc 100 * $i + 99; # 99, 199, 299, etc my $app_dir = path($tools->path_to_app_root)->realpath; # p $app_dir; my $rel_path = 'static/files/genomics'; my $destination_dir = join '/', # eg /path/to/uploads/2013 $app_dir, $rel_path, $year, $mini_dir, $request_num; # p $destination_dir; return $destination_dir; } sub remove_undef_values { # based on kaitlyn's transform_datetime_to_string() below: my $r = shift; # p $r; return unless ref $r eq 'HASH'; for my $key( keys %$r ) { # p [$key, $r->{$key}]; if ( ref $r->{$key} eq 'HASH' ) { # recursive call for hashrefs remove_undef_values( $r->{$key} ); } defined $r->{$_} || delete $r->{$_} for keys %$r; # p $r; } } sub update_participant_id { my $req = shift; # p $req; my $patient_case_id = $req->{patient_case_id}; # p $patient_case_id; my $participant_id = $req->{participant_id}; # p $participant_id; my $request_id = $req->{request_id}; my $result = $dbix->update('patient_case', { unit_number => $participant_id }, { id => $patient_case_id }); if ($result->rows) { # patient_case updated: my $user_id = $dbix->select('users', ['id'], { username => 'crontab' })->value; my %h = ( request_id => $request_id, user_id => $user_id, action => 'auto-set participant ID', ); $dbix->insert('request_history', \%h); } else { warn sprintf "participant id %s failed to update on request id %s", $participant_id, $request_id; } } sub format_msg { my $str = shift; my $app_ver = get_app_version(); my $msg = sprintf "%s [%s]: %s\n", $timestamp->strftime('%Y-%m-%d %T'), $app_ver, $str; return $msg; } sub get_app_version { chdir '/home/raj/apps/HILIS4'; chomp( my $raw_time = `/usr/bin/git log -1 --format=%cd --date=raw` ); # warn $raw_time; # cmd appends new-line # extract epoch seconds eg 1455628027 +0000 [GMT], 1460022637 +0100 [BST], etc: my ($epoch) = $raw_time =~ /^(\d+)\s\+0[01]00/ or # don't use '||' here - gets truth die "could'nt extract epoch time from `git log` output"; # warn $epoch; my $vnumber = 999 + `/usr/bin/git rev-list HEAD --count`; # as 1st in git repo = svn #1000 return sprintf '%.4f', 4 + ( $vnumber / 10000 ); # ensure 4 digits } sub _get_query_for_request_ids { # TODO: replace this with SQLA::More method my $sql = $SQL{request_ids}; # count number of placeholders in $sql: my $n = () = $sql =~ /\?/g; # p $n; # bind one $duration per placeholder: my @bind = map $duration, (1..$n); # p \@bind; return ($sql, @bind); } =begin # SQLA::More method - works but far too slow (20sec vs 2sec) - due to position of 'th.action' much slower when part of 'where' rather than 'join' sub _get_query_for_request_ids { my $interval = $time_now->subtract( seconds => $duration ); # p $interval; my @cols = ( 'DISTINCT(r.id)' ); my @rels = ( # table|alias # FK-PK relationship 'requests|r' , 'r.patient_case_id = pc.id' , 'patient_case|pc' , 'pc.patient_id = p.id' , 'patients|p' , '=>tr.request_id = r.id' , 'request_lab_test_results|tr' , '=>th.request_id = r.id' , 'request_lab_test_history|th' , '=>rh.request_id = r.id' , 'request_history|rh' , ); my %where = ( -or => [ 'r.created_at' => { '>=' => $interval }, 'r.updated_at' => { '>=' => $interval }, 'p.updated_at' => { '>=' => $interval }, 'tr.time' => { '>=' => $interval }, -and => [ 'th.action' => { like => 'uploaded %' }, # THIS IS VERY SLOW HERE 'th.time' => { '>=' => $interval }, ], -and => [ 'rh.action' => 'screened', 'rh.time' => { '>=' => $interval }, ], ], ); my @args = ( -columns => \@cols, -from => [ -join => @rels ], -where => \%where, ); # p @args; my ($sql, @bind) = SQL::Abstract::More->new->select(@args); $dbix->dump_query($sql, @bind); exit; return ($sql, @bind); } =cut sub _matches { Local::Utils::matches(@_) } sub _trim { my $data = shift; # p $data; KEY: for my $key ( keys %$data ) { # say "$key: $data->{$key}"; # p $data->{$key}; if ( ref($data->{$key}) eq 'HASH' ) { # recursive call for hashrefs # warn $data->{$key}; _trim( $data->{$key} ); # p $r->{$key}; next KEY; # not sure why this is necessary ?? } # say "$key: $data->{$key}"; # remove leading/trailing space chars (incl. unicode 160) from string: $data->{$key} =~ s/^\p{Space}//g; $data->{$key} =~ s/\p{Space}$//g; # p $data->{$key}; } # p $data; return 0; # no need - modifying hashref in situ } =begin # unused methods: # map of lab-section => test name => XML element name sub get_lab_test_to_xml_element_map { # only needed if using xml element names my @data = ; # p @data; my %map; for (@data) { chomp; # say $_; my ($section, $test_name, $element_name) = split ':', $_; # p [$section, $test_name, $element_name]; next unless $section; $map{$section}{$test_name} = $element_name; } # p \%map; return \%map; } # formats repeating request blocks (only needed if using xml element names): sub format_request { my $r = shift; # p $r; my %data; { # registration/demographics: my $ref = $r->{Demographics}; my %h = ( nhs_number => $ref->{nhs_number}, date_of_birth => $ref->{dob}, last_name => $ref->{last_name}, first_name => $ref->{first_name}, gender => $ref->{gender}, ); $data{Registration} = \%h; } # p \%data; return \%data; } # for use if date(time) value is an object: sub transform_datetime_to_string { my $r = shift; return unless ref $r eq 'HASH'; for my $key ( keys %$r ) { if ( ref $r->{$key} eq 'HASH' ) { # recursive call for hashrefs transform_datetime_to_string( $r->{$key} ); } if ( ref $r->{$key} =~ /^DateTime/ ) { $r->{$key} = sprintf "%s", $r->{$key}; # quote to force to string } } } =cut __DATA__ CONSENT:consent_date:date-of-consent CONSENT:consent_form_version:name-and-version-of-consent-form CONSENT:consent_taken:consent-given CONSENT:info_sheet_version:name-and-version-of-participant-information-sheet CONSENT:consent_q1:consent-question-1 CONSENT:consent_q2:consent-question-2 CONSENT:consent_q3:consent-question-3 CONSENT:consent_q4:consent-question-4 CONSENT:consent_q5:consent-question-5 CONSENT:consent_q6:consent-question-6 CONSENT:consent_q7:consent-question-7 CONSENT:consent_q8:consent-question-8 CONSENT:consent_q9:consent-question-9 CONSENT:consent_q10:consent-question-10 CONSENT:consent_q11:consent-question-11 CONSENT:consent_q12:consent-question-12 CONSENT:consent_q13:consent-question-13 CONSENT:consent_q14:consent-question-14 SPECIMENS:edta1:DNA Blood Germline SPECIMENS:pst:LiHep Plasma SPECIMENS:paxgene_rna:RNA Blood SPECIMENS:sst:Serum SPECIMENS:handling_protocol:Laboratory Method STORAGE:vial_id:Laboratory Sample ID STORAGE:rack_id:GMC Rack ID STORAGE:rack_location:GMC Rack Well DNA:edta1_qc_date:Test Result DateTime DNA:edta1_qc_type:Test Result Type DNA:edta1_qc_result:Test Result Value DNA:edta1_fluidx:Laboratory Sample ID DNA:edta1_fluidx_vol:Laboratory Sample Volume DNA:edta1_fluidx_rack_id:GMC Rack ID DNA:edta1_fluidx_rack_well:GMC Rack Well DNA:edta1_fluidx_vol:Laboratory Remaining Volume Banked DISPATCH:consignment_number:GMC Sample Consignment Number DISPATCH:dna_dispatched:GMC Sample Dispatch Date DISPATCH:omics_dispatched:GMC Sample Dispatch Date