#!/usr/bin/env perl =begin ------------------------------------------------------------------------- NGIS data extraction to csv - shares code with geneq_data_extraction.pl usage: perl $0 [-tq] * --query|q = output query to stdout * --test|t = saves file locally * --reqId 10 --reqId 100 --reqId 10000, etc query assumes WGS where sample extracted in Genetics lab & processed at Illumina will need adapting if/when arrangement extended to non-WGS samples TODO: test name = WGS Germline and Tumour, test_id depends on phenotype (AML/ALL) TODO: what triggers extract, how often, what type of request TODO: germline sample - either use a new sample code or have sample_type_dispatched entries which are not shared with diagnostic sample (skin, etc) TODO: datetime_processing_complete is doing db query for dna extraction date in NGIS::get_processing_date - what result does csv file expect ? =cut #------------------------------------------------------------------------------- my $query_output = 0; # --query|q - output sql queries to console my $testing = 0; # --testing|t - saves file locally, doesn't email my @REQUEST_IDS = (); # --request_id 1 --request_id 2, etc my @TEST_NAMES = qw( wgs_referral ); # tests to trigger data transfer my @null_fields = qw(); # force fields to null if required (temporary measure) use Getopt::Long; GetOptions ( "testing|t" => \$testing, # flag "query|q" => \$query_output, # flag "reqId=i" => \@REQUEST_IDS, # int list (optional) ); use strict; use warnings; my $JUST_TESTING = $testing || 0; use lib '/home/raj/perl5/lib/perl5'; use FindBin qw($Bin); # warn $Bin; use Text::CSV; use Data::Printer; use SQL::Abstract::More; use lib $Bin . '/../../../lib'; use LIMS::Local::ScriptHelpers; use LIMS::Local::NGIS; use lib '/home/raj/perl-lib'; use Local::DB; my $tools = LIMS::Local::ScriptHelpers->new(); $tools->test_only($JUST_TESTING); my $config = $tools->config(); my $today = $tools->time_now; my $dbix = Local::DB->dbix({ dbname => 'hilis4' }); # p $dbix; $dbix->lc_columns = 0; # preserve mixed case on col names (now standardised) # get NGIS object: my $ngis = do { my %args = ( dbix => $dbix, query_output => $query_output, test_mode => $JUST_TESTING ); LIMS::Local::NGIS->new(%args); }; # p $ngis; $Local::DBIx::Simple::Result::NO_AUTO_DATE_INFLATION = 0; # force iso8601 datetime #------------------------------------------------------------------------------- my $filename = sprintf 'YNEGLH_HMDS_%s_dispatch.csv', $today->strftime('%Y%m%d_%H%M%S'); # warn $filename; # $filename = 'temp.csv'; # for testing $ngis->set_t0(); # start timer for data processing # need to supply our own injection_guard to override built-in sql injection # attack prevention which detects ';' in GROUP_CONCAT, but we don't need one # here as we have no user input: my $sqla = SQL::Abstract::More->new( injection_guard => qr/^$/ ); my $query = do { # _get_main_query returns false if no request_id's to process my ($sql, @bind) = _get_main_query(\@REQUEST_IDS); # p $sql; p @bind; exit unless $sql; $dbix->query( $sql, @bind ); }; # get cols from query, except 'private' ones used for evaluations (eg _datetime_screened): my @headers = grep $_ !~ /^_/, $query->columns; # p @headers; # retrieve data from query: my $data = $query->hashes; $ngis->runtimer('query runtime'); # p $data; # pre-process data to reconstitute (some) molecular panels and split (some) # FISH panels, stores data inside $ngis object: $ngis->preprocess_data($data); # store col_headers & null_fields in ngis object for use in process_data(): $ngis->col_headers(\@headers); $ngis->null_fields(\@null_fields); open my $fh, ">:encoding(utf8)", "$Bin/$filename" or die $!; my $csv = Text::CSV->new ({ binary => 1, auto_diag => 1, eol => $/ }); $csv->print($fh, \@headers); # retrieve pre-processed data from $ngis: my @ngis_data = @{$ngis->data}; # p @ngis_data; # AoH for my $ref (@ngis_data) { # p $ref; my $row = process_data($ref); # p $row; # arrayref $csv->print($fh, $row); } close $fh or die $!; $ngis->runtimer('completed data processing'); if ($JUST_TESTING) { } else { # transfer file using unknown mechanism } #=============================================================================== sub process_data { my $ref = shift; # p $ref; # href # DoB datetime to date: if ( my $dob = $ref->{patient_dob} ) { $ref->{patient_dob} = $dob->ymd; } $ref->{expected_sample_number} = 2; # always tumour + germline samples $ref->{order_submitted} = 0; # will be submitted elsewhere # primary sample; 0 = germline sample, 1 = diagnostic sample: # TODO: or could use sample_type_dispatched if opts not shared with diagnostic sample $ref->{primary_sample} = $ref->{_specimen_type} =~ /germline/ ? 0 : 1; # assume received & dispatched are the same, avoid 2 identical query lookups: $ref->{sample_type_received} = $ref->{sample_type_dispatched}; # NGIS code for WGS (based on flow phenotype - AML/ALL): $ref->{test_id} = _get_ngis_code($ref); $ref->{sample_date_sent} = $today->ymd; # assumes daily procedure # determine date lab-test requested (replaced by datetime_order_received): # $ref->{datetime_test_requested} = get_test_request_date($ref); # will need to be revived my $data = $ngis->finalise_data($ref); # p $data; # adds some new fields, sets others undef return $data; } #------------------------------------------------------------------------------- sub _get_ngis_code { # regex on results_summary drop-down my $data = shift; my $results_summary = $data->{_results_summary}; if ( $results_summary =~ /AML|acute myeloid/ ) { return 'M80.1'; } elsif ( $results_summary =~ /ALL|acute lymphoblastic/ ) { return 'M91.1'; } else { # TODO: need to improve this - NGIS code is mandatory warn "no NGIS code for $data->{local_sample_id}"; return undef; } } #------------------------------------------------------------------------------- sub _get_main_query { # p $_[0]; # optional aref of request_id's # get request_id's for main query or return undef, triggers script exit: my @request_ids = @{ $_[0] } || do { my ($sql, @bind) = _get_requests(); # p $sql; p @bind; $dbix->query( $sql, @bind )->array; } or return undef; # p @request_ids; my $cols = _main_query_cols(); # these keep changing my @rels = ( 'requests|r' , q{r.patient_case_id=pc.id} => 'patient_case|pc' , q{pc.patient_id=p.id} => 'patients|p' , q{pc.referral_source_id=rs.id} => 'referral_sources|rs' , q{rs.parent_organisation_id=po.id} => 'parent_organisations|po' , q{r.referrer_department_id=rd.id} => 'referrer_department|rd' , q{rd.hospital_department_code=hd.id} => 'hospital_departments|hd' , q{rd.referrer_id=ref.id} => 'referrers|ref' , q{ts.request_id=r.id} => 'request_lab_test_status|ts' , q{ts.status_option_id=so.id} => 'lab_test_status_options|so' , q{ts.lab_test_id=lt.id} => 'lab_tests|lt' , q{lt.lab_section_id=ls.id} => 'lab_sections|ls' , q{tr.request_id=r.id} => 'request_lab_test_results|tr' , q{tr.lab_test_id=lt2.id} => 'lab_tests|lt2' , q{rsp.request_id=r.id} => 'request_specimen|rsp' , q{rsp.specimen_id=s.id} => 'specimens|s' , q{rrs.request_id=r.id, rrs.lab_section_id=ls.id} => 'request_result_summaries|rrs' , # left joins: q{=>pd.patient_id=p.id} => 'patient_demographics|pd' , q{=>rst.request_id=r.id} => 'request_storage|rst' , q{=>rst.rack_id=sr.id} => 'storage_racks|sr' , q{=>rer.request_id=r.id} => 'request_external_ref|rer' , q{=>ro.request_id=r.id} => 'request_option|ro' , q{=>ro.option_id=ao.id} => 'additional_options|ao' , # will not be available if request not reported when file generated: q{=>rrv.request_id=r.id} => 'request_report_view|rrv' , q{=>rsd.request_id=r.id} => 'request_specimen_detail|rsd' , ); my %where = ( 'ls.section_name' => 'Whole genome sequencing', 'lt.test_name' => { -in => \@TEST_NAMES }, 'r.id' => { -in => \@request_ids }, ); my @args = ( -columns => $cols, -from => [ -join => @rels ], -where => \%where, -group_by => 'r.id', # for group_concat on specimen -order_by => 'r.id', # -limit => 5, # -offset => 350, ); # p @args; my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind; $dbix->dump_query($sql, @bind) if $query_output; # exit; return ($sql, @bind); } =begin # get data as single query - very slow sub _get_main_query { my $is_private = q!MAX( CASE WHEN ao.option_name = 'private' THEN 1 END )!; my $begin_date = $ref_date->clone->set_day(1); # first day of $ref_date month my %date_restriction = ( -between => [ # first & last day of ref_date month: $begin_date->ymd, # 1st day $begin_date->add(months => 1)->subtract(days => 1)->ymd, # last day ] ); my $excluded_parents = $ngis->excluded_parent_codes; my $lab_sections = $ngis->included_lab_sections; # my $excluded_lab_test_ids = $ngis->excluded_lab_test_ids; # my $excluded_test_name_re = $ngis->excluded_test_names; my $cols = _main_query_cols(); # these keep changing my @rels = ( 'requests|r' , q{r.patient_case_id=pc.id} => 'patient_case|pc' , q{pc.patient_id=p.id} => 'patients|p' , q{pc.referral_source_id=rs.id} => 'referral_sources|rs' , q{rs.parent_organisation_id=po.id} => 'parent_organisations|po' , q{r.referrer_department_id=rd.id} => 'referrer_department|rd' , q{rd.hospital_department_code=hd.id} => 'hospital_departments|hd' , q{rd.referrer_id=ref.id} => 'referrers|ref' , q{ts.request_id=r.id} => 'request_lab_test_status|ts' , q{ts.status_option_id=so.id} => 'lab_test_status_options|so' , q{ts.lab_test_id=lt.id} => 'lab_tests|lt' , q{lt.lab_section_id=ls.id} => 'lab_sections|ls' , q{rsv.request_id=r.id} => 'request_status_view|rsv' , q{ris.request_id=r.id} => 'request_initial_screen|ris' , q{ris.screen_id=s2.id} => 'screens|s2' , q{tr.request_id=r.id} => 'request_lab_test_results|tr' , q{tr.lab_test_id=lt2.id} => 'lab_tests|lt2' , q{rsp.request_id=r.id} => 'request_specimen|rsp' , q{rsp.specimen_id=s.id} => 'specimens|s' , # left joins: q{=>ro.request_id=r.id} => 'request_option|ro' , q{=>ro.option_id=ao.id} => 'additional_options|ao' , q{=>pd.patient_id=p.id} => 'patient_demographics|pd' , q{=>rt.request_id=r.id} => 'request_trial|rt' , q{=>rrv.request_id=r.id} => 'request_report_view|rrv' , q{=>rst.request_id=r.id} => 'request_storage|rst' , q{=>rst.rack_id=sr.id} => 'storage_racks|sr' , q{=>rsd.request_id=r.id} => 'request_specimen_detail|rsd' , q{=>rer.request_id=r.id} => 'request_external_ref|rer' , ); # probably not required # q{=>nd.ngis_indication_id=nci.id} => 'ngis_clinical_indications|nci', # q{=>rrv.diagnosis_id=d.id} => 'diagnoses|d' , # q{=>nd.diagnosis_id=d.id} => 'ngis_diagnosis|nd' , # q{=>ro.request_id=r.id} => 'request_option|ro' , # q{=>ro.option_id=ao.id} => 'additional_options|ao' , # q{=>pd.patient_id=p.id} => 'patient_demographics|pd' , # q{=>rt.request_id=r.id} => 'request_trial|rt' , my @where = ( # contains repeated elements (eg parent_code), so requires array 'rt.request_id' => undef, # not in clinical trial 'ls.section_name' => { -in => $lab_sections }, # deadly slow query without this 'lt.test_name' => { -in => \@TEST_NAMES }, 'so.description' => { '!=' => 'complete' }, # lab-test status 'rs.organisation_code' => { -not_rlike => '^NT' }, # private hospital 'po.parent_code' => { -not_rlike => '^(S|Z)' }, # scotland/ireland 'po.parent_code' => { -not_in => $excluded_parents }, # not required here ? # 'lt.id' => { -not_in => $excluded_lab_test_ids }, # 'lt.test_name' => { -not_rlike => $excluded_test_name_re }, ); # restrict on request id's if supplied, or date: if (@REQUEST_IDS) { push @where, ( 'r.id' => { -in => \@REQUEST_IDS } ); } else { push @where, ( 'DATE(ts.time)' => \%date_restriction ); } # p @where; my @args = ( -columns => $cols, -from => [ -join => @rels ], # contains repeated elements (eg parent_code), so requires "-and => aref" -where => { -and => \@where }, -group_by => [ 'lt.id', 'r.id' ], -having => { $is_private => undef }, # not private patient -order_by => 'r.id', # -limit => 5, # -offset => 350, ); # p @args; # need to supply our own injection_guard to override built-in sql injection # attack prevention which detects ';' in GROUP_CONCAT, but we don't need one # here as we have no user input: my $sqla = SQL::Abstract::More->new( injection_guard => qr/^$/ ); my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind; $dbix->dump_query($sql, @bind) if $query_output; # exit; return ($sql, @bind); } =cut #------------------------------------------------------------------------------- sub _get_requests { my $is_private = q!MAX( CASE WHEN ao.option_name = 'private' THEN 1 END )!; my $excluded_parents = $ngis->excluded_parent_codes; my $cols = 'r.id'; my @rels = ( 'requests|r' , q{r.patient_case_id=pc.id} => 'patient_case|pc' , q{pc.referral_source_id=rs.id} => 'referral_sources|rs' , q{rs.parent_organisation_id=po.id} => 'parent_organisations|po' , q{ts.request_id=r.id} => 'request_lab_test_status|ts' , q{ts.status_option_id=so.id} => 'lab_test_status_options|so' , q{ts.lab_test_id=lt.id} => 'lab_tests|lt' , q{rrs.request_id=r.id} => 'request_result_summaries|rrs' , q{rrs.lab_section_id=ls.id} => 'lab_sections|ls' , # left joins: q{=>ro.request_id=r.id} => 'request_option|ro' , q{=>ro.option_id=ao.id} => 'additional_options|ao' , q{=>rt.request_id=r.id} => 'request_trial|rt' , ); my @where = ( # contains repeated elements (eg parent_code), so requires array 'rt.request_id' => undef, # not in clinical trial 'ls.section_name' => 'Whole genome sequencing', 'lt.test_name' => { -in => \@TEST_NAMES }, 'so.description' => { '!=' => 'complete' }, # lab-test status 'rs.organisation_code' => { -not_rlike => '^NT' }, # private hospital 'po.parent_code' => { -not_rlike => '^(S|Z)' }, # scotland/ireland 'po.parent_code' => { -not_in => $excluded_parents }, ); my @args = ( -columns => $cols, -from => [ -join => @rels ], # contains repeated elements (eg parent_code), so requires "-and => aref" -where => { -and => \@where }, -group_by => 'r.id', # required for 'having' -having => { $is_private => undef }, # not private patient # -limit => 5, # -offset => 350, ); # p @args; my ($sql, @bind) = $sqla->select(@args); # p $sql; p \@bind; $dbix->dump_query($sql, @bind) if $query_output; # exit; return ($sql, @bind); } #------------------------------------------------------------------------------- sub _main_query_cols { my $local_lab_name = $ngis->local_lab_name; # p $local_lab_name; my @cols = ( 'CONCAT_WS("/", r.request_number, r.year - 2000)|local_sample_id', 'NULL|local_sample_id_sub', 'NULL|local_dna_number', 'NULL|local_dna_number_sub', 'p.id|internal_patient_id', 'r.id|internal_request_id', qq!"$local_lab_name"|booking_laboratory!, qq!"YNEGLH Leeds Shire"|processing_laboratory!, # for WGS qq!"Illumina"|testing_laboratory!, # for WGS 'NULL|ped_number', 'p.first_name|patient_first_name', 'p.last_name|patient_last_name', 'p.dob|patient_dob', 'p.gender|patient_sex', 'p.nhs_number', 'pd.address|patient_address', 'NULL|patient_district_of_residence', 'NULL|patient_county', 'NULL|patient_city', 'pd.post_code|patient_postcode', 'NULL|patient_country', 'ref.national_code|referrer_unique_code', 'NULL|referrer_email', 'NULL|referrer_telephone', 'NULL|referrer_title', q!SUBSTRING_INDEX(ref.name, ' ', -1)|referrer_first_name!, q!LEFT(ref.name, CHAR_LENGTH(ref.name) - LOCATE(' ', REVERSE(ref.name))) AS referrer_last_name!, 'hd.display_name|referrer_specialty', # required spelling !! 'rs.display_name|referring_facility_name', 'NULL|referrer_address', 'NULL|referrer_city', 'NULL|referrer_district', 'NULL|referrer_county', 'NULL|referrer_postcode', 'NULL|referrer_country', 'rs.organisation_code|referring_facility_ods_code', 'NULLIF( pc.unit_number, "UNKNOWN")|unit_number', # NULL if a = b 'NULL|other_unit_number', 'IFNULL(rrv.clinical_details, "DiagnosticTest")|referral_reason', # a or b q!"Routine"|test_priority!, 'NULL|test_id', # NGIS test id, will depend on diagnosis (AML vs ALL) 'lt.field_label|test_name', # needs to match test directory 'NULL|ngis_id', # NGIS order number 'rer.external_reference|external_sample_id', 'NULL|external_patient_id', 'NULL|sample_type_received', # set later, same as sample_type_dispatched 'rsd.specimen_date|datetime_sample_collected', # datetime 'r.created_at|datetime_sample_received', # datetime 'r.created_at|datetime_order_received', # datetime 'NULL|datetime_processing_complete', # set later 'rsd.specimen_quality|sample_condition', # not available at upload q!MAX( CASE WHEN ao.option_name = 'doi' THEN 'yes' ELSE 'no' END ) AS infection_risk!, 'NULL|sample_transfer_request_comments', q!MAX( CASE WHEN lt2.test_name = 'sample_type' THEN tr.result END ) AS sample_type_dispatched!, # WGS samples are whole (BM or PB) so will never have DNA vol, conc, etc 'IF( rst.volume, "ul", NULL )|sample_units', 'rst.volume|sample_amount', q!IF( rst.concentration < 9999, rst.concentration, NULL ) AS dna_concentration!, 'NULL|sample_date_sent', 'rst.vialId|sample_tube_id', 'sr.plateId|sample_rack_id', 'NULL|sample_courier', 'NULL|sample_courier_ref', 'NULL|expected_sample_number', 'NULL|order_submitted', 'NULL|primary_sample', # fields used for data processing only (underscored for omission in @headers): 'lt.test_name|_test_name', 'ls.section_name|_section_name', 'rrs.results_summary|_results_summary', # to test for germline sample, superfluous if using sample_type_dispatched: 'GROUP_CONCAT(DISTINCT(s.description))|_specimen_type', ); return wantarray ? @cols : \@cols; }