package LIMS::Local::ExcelHandler;
# Parsing notes:
#   Spreadsheet::ParseExcel_XLHTML speeds up Spreadsheet::ParseExcel, but requires the xlhtml package (apt-get install xlhtml)
#   reading the .csv file directly with IO::All is *much* faster than ParseExcel_XLHTML
#   Text::CSV::Simple runs at roughly half the speed of IO::All->getline, but copes properly with quoted/embedded commas in .csv files
use strict;
use warnings;
use DateTime; # used by filedata()
use File::stat;
use Data::Dumper;
use Text::CSV::Simple;
use LIMS::Local::Config;
use Spreadsheet::ParseExcel::Simple;
# add a more explicit warning for Text::CSV::Simple errors:
Text::CSV::Simple->add_trigger(on_failure => sub {
my ($self, $csv) = @_;
if ( my $err = $csv->error_input ) {
warn qq!Text::CSV::Simple parser failed on: "$err"\n!;
# warn Dumper [$csv->error_diag]; # call error_diag() in list context
}
});
# use Time::HiRes qw(gettimeofday tv_interval); my $t0 = [gettimeofday];
# use Spreadsheet::ParseExcel_XLHTML; # use Spreadsheet::ParseExcel::Simple; # too slow
my $cfg = LIMS::Local::Config->instance;
my $path_to_app_root = $cfg->{path_to_app_root}
|| die "Couldn't find path to application directory"; # warn $path_to_app_root;
my ($specialities, $locales); # lookup hashref tables, populated when the location type is 'clinician' (econcur)
#------------------------------------------------------------------------------
sub new {
my $class = shift;
my $self = {};
$self->{WORKBOOK} = undef;
$self->{LOCATION_TYPE} = undef;
bless ($self, $class);
return $self;
}
#------------------------------------------------------------------------------
# accessor for the source file: as a setter, takes a location type (hospital, clinician, etc)
# and resolves the source spreadsheet filename for that type; as a getter, returns that filename
sub source {
my $self = shift;
if (@_) {
$self->{LOCATION_TYPE} = shift;
$self->{WORKBOOK} = $self->_get_file('filename')
|| die "Cannot find target xls file for location type: $self->{LOCATION_TYPE}";
}
else {
return $self->{WORKBOOK};
}
}
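# Illustrative usage (a sketch; $handler and the 'hospital' type are just examples drawn
# from the location types defined in _get_file() below):
#   my $handler = LIMS::Local::ExcelHandler->new;
#   $handler->source('hospital');    # setter: resolves to the 'etrust' source file
#   my $file = $handler->source;     # getter: returns 'etrust'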
#------------------------------------------------------------------------------
# returns the current source filename plus its age in months (from the .csv file's mtime):
sub filedata {
my $self = shift;
my $src_file = $self->source;
my $path_to_file = $path_to_app_root . '/src/data/' . $src_file . '.csv'; # warn $path_to_file;
my $filedate = DateTime->from_epoch( epoch => stat($path_to_file)->mtime );
my $age_in_months = DateTime->now->subtract_datetime($filedate)->delta_months;
return { filename => $src_file, age => $age_in_months };
}
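# Illustrative usage (sketch; assumes source() has been set as above - the 12-month
# threshold is an arbitrary example, not a rule taken from this module):
#   my $info = $handler->filedata;
#   warn "$info->{filename} is $info->{age} months old\n" if $info->{age} > 12;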
#------------------------------------------------------------------------------
# parse the source file for rows matching $term:
sub parse {
my $self = shift;
my $term = shift;
# my $matches = $self->_parse_excel($term); # very slow
# my $matches = $self->_parse_excel_xlhtml($term); # faster than ParseExcel
my $matches = $self->_parse_csv_file($term); # massively faster even than ParseExcel_XLHTML
# return ref to array of hashrefs:
return $matches;
}
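# Illustrative usage (sketch; assumes source() has been set, and 'leeds' is an arbitrary
# search term - matching is case-insensitive across the non-null fields):
#   my $matches = $handler->parse('leeds');
#   # each match is a hashref: { code => <national code>, display => <comma-joined address fields> }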
#------------------------------------------------------------------------------
# read an arbitrary .xls file; returns the first worksheet as an arrayref of row arrayrefs:
sub parse_xl_file {
my ($self, $xl_file) = @_; # warn $xl_file;
# use Spreadsheet::ParseExcel_XLHTML qw/-install/; # speeds up 4x
-e $xl_file or die "file does not exist: $xl_file";
my $xls = Spreadsheet::ParseExcel::Simple->read($xl_file) or die "Cannot read spreadsheet: $xl_file";
my @worksheets = $xls->sheets;
my $worksheet = $worksheets[0];
my @data = (); # create array of row arrayrefs:
while ($worksheet->has_data) {
push @data, [ $worksheet->next_row ]; # next_row() = array mode
}
return \@data;
}
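# Illustrative usage (sketch; the path is hypothetical):
#   my $rows = $handler->parse_xl_file('/tmp/upload.xls');
#   # $rows->[0] is an arrayref of the first row's cell values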
#------------------------------------------------------------------------------
# return every row of the current source file as an arrayref of hashrefs keyed by field name:
sub fetch_all {
my $self = shift;
my $data = $self->_read_file;
return $data;
}
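# Illustrative usage (sketch):
#   my $rows = $handler->fetch_all; # arrayref of hashrefs, keyed by the field names in _get_file()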
#------------------------------------------------------------------------------
sub fetch_cols {
my $self = shift;
my $args = shift;
my $fields = $args->{fields}; # cols to return data from
my $select = $args->{select}; # which field to select on eg 'practice_code'
my $value = $args->{value}; # value of field eg 'B12345'
my $xls_file = $self->source;
my $src_file = $path_to_app_root . '/src/data/' . $xls_file; # warn $src_file;
my $parser = Text::CSV::Simple->new();
# get field definitions arrayref (eg code name null addr1, etc):
my %index = do {
my $fields = $self->_get_file('fields'); # warn Dumper $fields;
map{ $fields->[$_], $_ } (0 .. scalar @$fields - 1); # array to hash
}; # warn Dumper \%index;
# get array positions for data fields:
my @required = @index{@$fields}; # warn Dumper \@required;
$parser->want_fields(@required);
# get array position for select field:
my $select_pos = $index{$select}; # warn Dumper $select_pos;
# die OK here: "Each time we call a trigger we wrap it in an eval block.
# If the eval block catches an error we simply call 'next' on the loop."
$parser->add_trigger( after_parse => sub {
my ($self, $data) = @_; # warn $data->[14];
die unless defined $data->[$select_pos] and $data->[$select_pos] eq $value;
});
my @data = $parser->read_file($src_file . '.csv'); # warn Dumper \@data;
return \@data;
}
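# Illustrative usage (sketch; the field names come from the GP definition in _get_file()
# below, and 'B12345' is the example practice code from the comments above):
#   $handler->source('GP');
#   my $rows = $handler->fetch_cols({
#       fields => [ qw(code name post_code) ],
#       select => 'practice_code',
#       value  => 'B12345',
#   });
#   # $rows is an arrayref of arrayrefs holding just the requested columns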
#------------------------------------------------------------------------------
# uses Text::CSV::Simple - faster than ParseExcel, but slower than a direct read
# (IO::All->getline); a speed boost comes from filtering rows in _read_file()'s add_trigger() callback
sub _parse_csv_file {
my $self = shift;
my $term = shift;
my $src_file = $self->source;
my $data = $self->_read_file($term);
my $fields = $self->_get_file('fields');
if ($src_file eq 'econcur') {
$specialities = $self->_get_specialities;
$locales = $self->_get_locales;
}
my @matches; # for results
ROW: foreach my $row (@$data) { # warn Dumper $row; next;
# next ROW unless grep { $row->{$_} =~ /$term/i } keys %$row; - using add_trigger() now
if ($src_file eq 'econcur') {
$self->_manipulate_clinician_data($row);
}
my $display = join ', ', map $row->{$_},
grep $row->{$_}, # if field contains entry
grep $_ !~ /^(code|null)$/, # skip code field & nulls
@$fields;
push @matches, {
code => $row->{code},
display => $display,
};
} # timings($t0);
return \@matches;
}
#------------------------------------------------------------------------------
=begin comment

# fastest method - reads the csv file directly (needs IO::All's io()):
sub parse_csv_file {
my $self = shift;
my $term = shift;
my $xls_file = $self->source;
my $path_to_file = $path_to_app_root . '/src/data/' . $xls_file;
my $io = io($path_to_file . '.csv');
my @matches; # for results
# get field definitions hashref (eg CODE => 0, NAME => 1, etc):
my $FIELD = $self->_get_file('fields');
# get array of required field positions (eg 0, 1, 4, 5, etc):
my @field_positions = grep $_, @{$FIELD}{ qw(NAME CODE2 ADDR1 ADDR2 ADDR3 ADDR4 PCODE) };
use Time::HiRes qw(gettimeofday tv_interval);
my $t0 = [gettimeofday];
{ # localise to switch off warnings:
no warnings 'uninitialized'; # for regex on potentially empty fields
# timings($t0);
ROW : while ( my $line = $io->getline ) { # TODO: doesn't work if row contains comma within field
# split line on comma:
my @cells = split ',', $line; # warn Dumper @cells; next;
# get national code, or skip line:
my $code_cell = $cells[$FIELD->{CODE}] || next ROW; # warn Dumper $code_cell; next;
# get rest of required fields (if defined) into array:
my @data_cells = grep $_, @cells[@field_positions]; # warn Dumper \@data_cells; next;
# look for matches in all collected fields, or skip line:
next ROW unless grep { $_ =~ /$term/i } (@data_cells, $code_cell);
push @matches, {
code => $code_cell,
display => join ', ', @data_cells,
};
}
timings($t0);
}
return \@matches;
}

=end comment

=cut
#------------------------------------------------------------------------------
=begin comment

# uses Spreadsheet::ParseExcel_XLHTML - several times faster than Spreadsheet::ParseExcel:
sub _parse_excel_xlhtml {
my $self = shift;
my $term = shift;
my $xls_file = $self->source;
my $path_to_file = $path_to_app_root . '/src/data/' . $xls_file;
my $excel = Spreadsheet::ParseExcel_XLHTML->new;
my $workbook = $excel->Parse($path_to_file . '.xls');
my $worksheet = $workbook->Worksheet(0);
my ( $row_min, $row_max ) = $worksheet->row_range();
my @matches; # results array
my $FIELD_FOR = $self->_get_file('fields'); # hashref
ROW : for my $row ( $row_min .. $row_max ) {
my $code_cell = $worksheet->get_cell( $row, $FIELD_FOR->{CODE} );
next ROW unless $code_cell; # next ROW unless length $code > 3; need parent code for appropriate matching
my @location_cells;
CELL : for my $col ( @{$FIELD_FOR}{ qw(NAME CODE2 ADDR1 ADDR2 ADDR3 ADDR4 PCODE) } ) {
my $cell = $worksheet->get_cell( $row, $col );
next CELL unless $cell;
push @location_cells, $cell;
}
next ROW unless grep $_->value =~ /$term/i, (@location_cells, $code_cell);
push @matches, {
code => $code_cell->value,
display => join ', ', map { $_->value } @location_cells,
};
}
return \@matches;
}

=end comment

=cut
#------------------------------------------------------------------------------
=begin comment

# uses Spreadsheet::ParseExcel::Simple - much slower than Spreadsheet::ParseExcel_XLHTML:
sub _parse_excel {
my $self = shift;
my $term = shift;
my $xls_file = $self->source;
my $path_to_file = $path_to_app_root . '/src/data/' . $xls_file;
my $xls = Spreadsheet::ParseExcel::Simple->read($path_to_file . '.xls');
my @worksheets = $xls->sheets;
my $worksheet = $worksheets[0];
my $FIELD_FOR = $self->_get_file('fields');
my @matches; # results array
while ($worksheet->has_data) {
my @data = $worksheet->next_row; # warn Dumper \@data;
# skip regional entries eg RR8, RAE, etc
next unless length $data[ $FIELD_FOR->{CODE} ] > 3; # warn Dumper $data[ $FIELD_FOR->{CODE} ];
next unless grep $data[ $FIELD_FOR->{$_} ] =~ /$term/i, qw(NAME ADDR CITY);
# warn Dumper join ', ', @data[ @$FIELD_FOR{ qw(NAME ADDR CITY PCODE) } ];
push @matches, {
code => $data[ $FIELD_FOR->{CODE} ],
display => join ', ', @data[ @$FIELD_FOR{ qw(NAME ADDR1 CITY PCODE) } ],
};
}
# return ref to array of hashrefs:
return \@matches;
}

=end comment

=cut
#------------------------------------------------------------------------------
sub _get_file {
my $self = shift;
my $attr = shift || die 'attribute arg required';
my $location_type = $self->{LOCATION_TYPE}
or die 'Cannot get location type'; # warn 'LOCATION TYPE:'.$location_type;
my %xls_file_for = ( # NB: "Fields named 'null' vanish into the ether".
hospital => {
filename => 'etrust',
fields => [ qw( code name null null addr1 null null addr4 null post_code ) ],
},
prison => {
filename => 'eprison',
fields => [ qw( code name null null addr1 addr2 addr3 addr4 null post_code ) ],
},
GP => {
filename => 'egpcur',
fields => [ qw( code name null null addr1 addr2 addr3 addr4 null post_code null
null null null practice_code) ],
},
practice => {
filename => 'epraccur',
fields => [ qw( code name null null addr1 addr2 addr3 addr4 null post_code ) ],
},
independant => {
filename => 'ephpsite',
fields => [ qw( code name null null addr1 addr2 addr3 addr4 null post_code ) ],
},
clinician => {
filename => 'econcur',
fields => [ qw( null code name init sex speciality_code null org_code ) ],
},
);
return $xls_file_for{$location_type}{$attr};
}
#------------------------------------------------------------------------------
sub _read_file {
my $self = shift;
my $term = shift; # optional term to search for
my $xls_file = $self->source;
my $path_to_file = $path_to_app_root . '/src/data/' . $xls_file;
# get field definitions arrayref (eg code name null addr1, etc):
my $fields = $self->_get_file('fields'); # warn Dumper $fields;
my $parser = Text::CSV::Simple->new();
$parser->field_map(@$fields);
if ($term) { # warn Dumper $term;
# get array positions of @$fields non-null vals:
my @indexes = grep { $fields->[$_] ne 'null' } (0 .. @$fields -1); # warn @indexes;
# this is faster than collecting all rows & discarding all not matching $term
$parser->add_trigger( after_parse => sub {
my ($self, $data) = @_; # warn Dumper $data;
die unless grep { defined $data->[$_] && $data->[$_] =~ /$term/i } @indexes; # 'die' is OK - see docs
});
}
my @data = $parser->read_file($path_to_file . '.csv'); # timings($t0);
return \@data;
}
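# Illustrative row shape (sketch, using the 'hospital' field definition from _get_file()):
#   { code => '...', name => '...', addr1 => '...', addr4 => '...', post_code => '...' }
# (the 'null' columns are discarded by field_map - see the note in _get_file)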
#------------------------------------------------------------------------------
# build a code => name lookup hashref from etr.csv (used to expand clinician org codes):
sub _get_locales {
my $self = shift;
my $file = $path_to_app_root . '/src/data/etr.csv';
my $parser = Text::CSV::Simple->new();
$parser->want_fields(0,1);
my @data = $parser->read_file($file);
my %locales;
foreach my $row (@data) { # warn Dumper $row; next;
my ($code,$name) = @$row;
$locales{$code} = $name;
}
return \%locales;
}
#------------------------------------------------------------------------------
# build a code => NAME lookup hashref from specialities.csv (used to expand speciality codes):
sub _get_specialities {
my $self = shift;
my $file = $path_to_app_root . '/src/data/specialities.csv';
my $parser = Text::CSV::Simple->new();
$parser->want_fields(0,1);
my @data = $parser->read_file($file);
my %specialities;
foreach my $row (@data) { # warn Dumper $row; next;
my ($code,$name) = @$row;
$specialities{$code} = uc $name;
}
return \%specialities;
}
#------------------------------------------------------------------------------
# fold a clinician row's name/init/sex into a single display name, and expand its
# org_code and speciality_code via the lookup tables:
sub _manipulate_clinician_data {
my $self = shift;
my $row = shift;
my $id = sprintf '%s %s [%s]', @{$row}{qw(name init sex)};
delete @{$row}{ qw(name init sex) }; # remove the fields just merged into $id
$row->{name} = $id;
my $org_code = $row->{org_code};
$row->{org_code} = $locales->{$org_code} || "UNKNOWN LOCATION [$org_code]";
my $speciality_code = $row->{speciality_code};
$row->{speciality_code} = $specialities->{$speciality_code} || "UNKNOWN SPECIALITY [$speciality_code]";
}
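# Illustrative effect (sketch; all values are invented):
#   before: { name => 'SMITH', init => 'J', sex => 'M', speciality_code => '...', org_code => '...', ... }
#   after:  { name => 'SMITH J [M]', speciality_code => '<speciality name>', org_code => '<location name>', ... }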
#------------------------------------------------------------------------------
# no-op profiling stub: uncomment the Time::HiRes line near the top and the body below to enable timings
sub timings {
# my $t0 = shift;
# warn sprintf "%.4f sec\n", tv_interval $t0, [gettimeofday];
}
1;