package LIMS::Local::ExcelHandler;

# ParseExcel_XLHTML speeds up Spreadsheet::ParseExcel; requires xlhtml package (apt-get install xlhtml)
# IO::All & .csv file *much* faster than ParseExcel_XLHTML
# Text::CSV::Simple - about 50% of speed of IO::All->getline, but handles native .csv files better
# Text::CSV::Slurp in conjunction with Text::CSV_XS >0.80 nearly same speed as IO::All
# now using Text::CSV::Slurp::_from_handle() v0.901 patch (mst) direct in _text_csv_slurp()

use strict;
use warnings;

use IO::File;
use Text::CSV;
use DateTime;              # needed by filedata()
use File::stat;
use Data::Dumper;
use Text::CSV_XS 0.88;     # faster csv parsing
use LIMS::Local::Config;
use LIMS::Local::Utils;    # get_yaml(), get_unique_elements()
use Spreadsheet::WriteExcel;
use Spreadsheet::ParseExcel::Simple;
# use Text::CSV::Slurp 0.901;  # using v0.901 patch direct now in _text_csv_slurp()
# use Text::CSV::Simple;       # replaced with T::C::Slurp - faster & avoids lousy on_failure error
# use Spreadsheet::ParseExcel_XLHTML;
# use Spreadsheet::ParseExcel::Simple; # too slow

=begin
# add a more explicit warning for Text::CSV::Simple errors:
Text::CSV::Simple->add_trigger(on_failure => sub {
    my ($self, $csv) = @_;
    if ( my $err = $csv->error_input ) {
        warn qq!Text::CSV::Simple parser failed on: "$err"\n!;
        # warn Dumper [$csv->error_diag]; # call error_diag() in list context
    }
});
=cut

# use Time::HiRes qw(gettimeofday tv_interval); my $t0 = [gettimeofday];

my $cfg = LIMS::Local::Config->instance;
my $path_to_app_root = $cfg->{path_to_app_root}
    || die "Couldn't find path to application directory";
# warn $path_to_app_root;

my ($specialities, $locales); # lookup hashref tables, used if lookup = clinician

#------------------------------------------------------------------------------
sub new {
    my $class = shift;
    my $self  = {};
    $self->{WORKBOOK}      = undef;
    $self->{LOCATION_TYPE} = undef;
    $self->{FILEHANDLE}    = undef; # for write_excel_file()
    bless ($self, $class);
    return $self;
}

#------------------------------------------------------------------------------
# getter & setter: with an arg (hospital, clinician, etc) sets the location type
# and looks up the source file for that type; with no arg, returns the filename
sub source {
    my $self = shift;
    if (@_) {
        $self->{LOCATION_TYPE} = shift;
        # warn $self->{LOCATION_TYPE};
        $self->{WORKBOOK} = $self->_get_file('filename')
            || die "Cannot find target xls file for location type: $self->{LOCATION_TYPE}";
    }
    else {
        return $self->{WORKBOOK}; # filename from yml file
    }
}

#------------------------------------------------------------------------------
sub filedata {
    my $self = shift;
    my $src_file     = $self->source;
    my $path_to_file = $path_to_app_root . '/src/data/' . $src_file .
        '.csv';
    # warn $path_to_file;
    my $filedate = DateTime->from_epoch( epoch => stat($path_to_file)->mtime );
    my $age_in_months = DateTime->now->subtract_datetime($filedate)->delta_months;
    return { filename => $src_file, age => $age_in_months };
}

#------------------------------------------------------------------------------
# parse source file for $term:
sub parse {
    my $self = shift;
    my $term = shift;
    # my $matches = $self->_parse_excel($term);         # very slow
    # my $matches = $self->_parse_excel_xlhtml($term);   # faster than ParseExcel
    my $matches = $self->_parse_csv_file($term); # massively faster even than ParseExcel_XLHTML
    # return ref to array of hashrefs:
    return $matches;
}

#------------------------------------------------------------------------------
sub parse_xl_file {
    my ($self, $xl_file, $args) = @_; # args is optional
    # warn $xl_file;
    my $sheet_no = $args->{sheet} || 0; # optional - can pass required worksheet no
    # use Spreadsheet::ParseExcel_XLHTML qw/-install/; # speeds up 4x
    -e $xl_file or die "file does not exist: $xl_file";
    my $xls = Spreadsheet::ParseExcel::Simple->read($xl_file) or die $!;
    my @worksheets = $xls->sheets;
    my $worksheet  = $worksheets[$sheet_no]; # defaults to 0 unless specified
    my @data = ();
    # create array of row arrayrefs:
    while ($worksheet->has_data) {
        push @data, [ $worksheet->next_row ]; # next_row() = array mode
    }
    return \@data;
}

#-------------------------------------------------------------------------------
# for write_excel_file(): pass a reference to an empty scalar to open an
# in-memory filehandle; with no arg, returns the stored filehandle
sub initialise_fh {
    my $self = shift;
    if (@_) {
        my $str = shift; # reference to an empty container
        open my $fh, '>', $str or die "failed to open filehandle: $!";
        $self->{FILEHANDLE} = $fh;
        # warn $self->{FILEHANDLE};
    }
    else {
        return $self->{FILEHANDLE};
    }
}

#-------------------------------------------------------------------------------
sub write_excel_file {
    my $self = shift;
    my $data = shift; # arrayref of hashrefs (one per sheet: data, headers & name)
    my $fh   = $self->initialise_fh();
    my $workbook = Spreadsheet::WriteExcel->new($fh);
    for my $sheet (@$data) {
        my $name       = $sheet->{name};          # optional - defaults to 'Sheet'
        my $sheet_data = $sheet->{data};          # arrayref of row arrayrefs
        my $headers    = $sheet->{headers};
        my $columns    = $sheet->{columns} || {}; # optional hashref of col widths
        my $opt_format = $sheet->{format}  || []; # optional arrayref of format flags (eg 'text_wrap')
        my $row = 0; # row counter
        my $worksheet = $workbook->add_worksheet($name);
        { # write headers:
            my $format = $workbook->add_format();
            $format->set_bold();
            $format->set_font("Courier New");
            $format->set_align('center');
            $format->set_align('vcenter');
            $format->set_text_wrap() if ( grep $_ eq 'text_wrap', @$opt_format );
            # format columns if required:
            $worksheet->set_column($_, $_, $columns->{$_}) for keys %$columns;
            $worksheet->write( $row++, 0, $headers, $format );
        }
        { # write rows:
            my $format = $workbook->add_format();
            $format->set_text_wrap() if ( grep $_ eq 'text_wrap', @$opt_format );
            $worksheet->write( $row++, 0, $_, $format ) for @$sheet_data; # $_ is arrayref
        }
    }
    $workbook->close();
}
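
# Illustrative sketch of the structure write_excel_file() expects; the sheet
# name, header labels, row values and column widths below are made up for the
# example - only the key names (name/data/headers/columns/format) come from the
# code above:
#
#   my $xl = LIMS::Local::ExcelHandler->new;
#   my $output;                          # in-memory target for the .xls data
#   $xl->initialise_fh(\$output);
#   $xl->write_excel_file([
#       {
#           name    => 'Results',                           # worksheet name
#           headers => [ 'Code', 'Name', 'Postcode' ],      # row 0, written bold/centred
#           data    => [ [ 'RXF01', 'Example Hospital', 'AB1 2CD' ] ], # one arrayref per row
#           columns => { 0 => 12, 1 => 40 },                # optional column widths
#           format  => [ 'text_wrap' ],                     # optional format flags
#       },
#   ]);
#   # $output now holds the binary spreadsheet, eg for a download response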
#------------------------------------------------------------------------------
sub fetch_all {
    my $self = shift;
    my $data = $self->_read_file;
    return $data;
}

#------------------------------------------------------------------------------
sub fetch_cols {
    my $self = shift;
    my $args = shift;
    # warn Dumper $args;
    my $fields = $args->{fields}; # cols to return data from
    my $select = $args->{select}; # which field to select on eg 'practice_code'
    my $value  = $args->{value};  # value of field eg 'B12345'
    my $file     = $self->source;
    my $src_file = $path_to_app_root . '/src/data/' . $file . '.csv';
    # warn $src_file;
    my $cols = $self->_get_file('fields');
    # warn Dumper $fields;
    my %args = (file => $src_file, field_order => $cols);
    my $rows = $self->_text_csv_slurp(\%args);
    # retrieve required cols for rows where $select field matches $value:
    no warnings 'uninitialized'; # eg undefined cols in src file
    my @data = map  { [ @{$_}{@$fields} ] }
               grep { $_->{$select} eq $value } @$rows;
    # warn Dumper \@data;
    return \@data;
}

#-------------------------------------------------------------------------------
# this is Text::CSV::Slurp::_from_handle() v0.901 patch (allows field_order):
sub _text_csv_slurp {
    my $self = shift;
    my $args = shift;
    # my $io = io($args->{file}); # times out on new Deb6 ??why
    my $io = IO::File->new;       # ok on new Deb6
    open( $io, '<', $args->{file} ) || return $self->error($!);
    my $order = $args->{field_order};
    my $opt   = { binary => 1 };  # recommended to set true
    my $csv   = Text::CSV->new($opt);
    if ( $order ) {
        $csv->column_names($order);
    }
    elsif ( my $head = $csv->getline($io) ) {
        $csv->column_names( $head );
    }
    else {
        return $self->error( $csv->error_diag() );
    }
    my @results;
    while ( my $ref = $csv->getline_hr($io) ) {
        push @results, $ref;
    }
    return \@results;
}

#------------------------------------------------------------------------------
sub _parse_csv_file {
    my $self = shift;
    my $term = shift;
    my $src_file = $self->source;
    # warn $src_file;
    my $data   = $self->_read_file($term);
    my $fields = $self->_get_file('fields');
    if ($src_file eq 'econcur') {
        $specialities = $self->_get_specialities;
        $locales      = $self->_get_locales;
    }
    my @matches; # for results
    ROW:
    foreach my $row (@$data) {
        # warn Dumper $row; # next;
        # next ROW unless grep { $row->{$_} =~ /$term/i } keys %$row; - using add_trigger() now
        if ($src_file eq 'econcur') {
            $self->_manipulate_clinician_data($row);
        }
        my $display = join ', ',
            map  $row->{$_},
            grep $row->{$_},            # if field contains entry
            grep $_ !~ /^(code|null)$/, # skip code field & nulls
            @$fields;
        push @matches, {
            code    => $row->{code},
            display => $display,
        };
    }
    # timings($t0);
    return \@matches;
}
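
# Illustrative lookup sketch; the location types, search term and field names
# below are assumptions - the real types and their field lists come from the
# cfh_data_files YAML config:
#
#   my $xl = LIMS::Local::ExcelHandler->new;
#   $xl->source('hospital');              # select the source file for this type
#   my $matches = $xl->parse('infirmary');
#   # $matches is an arrayref of hashrefs, eg:
#   # [ { code => 'RXF01', display => 'Some Infirmary, High Street, AB1 2CD' }, ... ]
#
#   # fetch_cols() returns chosen columns for rows where one field matches exactly
#   # (select/value examples taken from the comments in fetch_cols() above):
#   $xl->source('practice');
#   my $cols = $xl->fetch_cols({
#       fields => [ qw(name addr1 pcode) ],
#       select => 'practice_code',
#       value  => 'B12345',
#   });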
#------------------------------------------------------------------------------
=begin
# fastest method - reads csv file directly:
sub parse_csv_file {
    my $self = shift;
    my $term = shift;
    my $xls_file = $self->source;
    my $path_to_file = $path_to_app_root . '/src/data/' . $xls_file;
    my $io = io($path_to_file . '.csv');
    my @matches; # for results
    # get field definitions hashref (eg CODE => 0, NAME => 1, etc):
    my $FIELD = $self->_get_file('fields');
    # get array of required field positions (eg 0, 1, 4, 5, etc):
    my @field_positions = grep $_, @{$FIELD}{ qw(NAME CODE2 ADDR1 ADDR2 ADDR3 ADDR4 PCODE) };
    use Time::HiRes qw(gettimeofday tv_interval);
    my $t0 = [gettimeofday];
    { # localise to switch off warnings:
        no warnings 'uninitialized'; # for regex on potentially empty fields
        # timings($t0);
        ROW:
        while ( my $line = $io->getline ) {
            # TODO: doesn't work if row contains comma within field
            # split line on comma:
            my @cells = split ',', $line;
            # warn Dumper @cells; next;
            # get national code, or skip line:
            my $code_cell = $cells[$FIELD->{CODE}] || next ROW;
            # warn Dumper $code_cell; next;
            # get rest of required fields (if defined) into array:
            my @data_cells = grep $_, @cells[@field_positions];
            # warn Dumper \@data_cells; next;
            # look for matches in all collected fields, or skip line:
            next ROW unless grep { $_ =~ /$term/i } (@data_cells, $code_cell);
            push @matches, {
                code    => $code_cell,
                display => join ', ', @data_cells,
            };
        }
        timings($t0);
    }
    return \@matches;
}
=cut

#------------------------------------------------------------------------------
=begin
# use the Spreadsheet::ParseExcel_XLHTML method - several times faster than Spreadsheet::ParseExcel:
sub _parse_excel_xlhtml {
    my $self = shift;
    my $term = shift;
    my $xls_file = $self->source;
    my $path_to_file = $path_to_app_root . '/src/data/' . $xls_file;
    my $excel     = Spreadsheet::ParseExcel_XLHTML->new;
    my $workbook  = $excel->Parse($path_to_file . '.xls');
    my $worksheet = $workbook->Worksheet(0);
    my ( $row_min, $row_max ) = $worksheet->row_range();
    my @matches; # results array
    my $FIELD_FOR = $self->_get_file('fields'); # hashref
    ROW:
    for my $row ( $row_min .. $row_max ) {
        my $code_cell = $worksheet->get_cell( $row, $FIELD_FOR->{CODE} );
        next ROW unless $code_cell;
        # next ROW unless length $code > 3; need parent code for appropriate matching
        my @location_cells;
        CELL:
        for my $col ( @{$FIELD_FOR}{ qw(NAME CODE2 ADDR1 ADDR2 ADDR3 ADDR4 PCODE) } ) {
            my $cell = $worksheet->get_cell( $row, $col );
            next CELL unless $cell;
            push @location_cells, $cell;
        }
        next ROW unless grep $_->value =~ /$term/i, (@location_cells, $code_cell);
        push @matches, {
            code    => $code_cell->value,
            display => join ', ', map { $_->value } @location_cells,
        };
    }
    return \@matches;
}
=cut

#------------------------------------------------------------------------------
=begin
# uses Spreadsheet::ParseExcel::Simple - much slower than Spreadsheet::ParseExcel_XLHTML
sub _parse_excel {
    my $self = shift;
    my $term = shift;
    my $xls_file = $self->source;
    my $path_to_file = $path_to_app_root . '/src/data/' . $xls_file;
    my $xls = Spreadsheet::ParseExcel::Simple->read($path_to_file .
        '.xls');
    my @worksheets = $xls->sheets;
    my $worksheet  = $worksheets[0];
    my $FIELD_FOR  = $self->_get_file('fields');
    my @matches; # results array
    while ($worksheet->has_data) {
        my @data = $worksheet->next_row;
        # warn Dumper \@data;
        # skip regional entries eg RR8, RAE, etc
        next unless length $data[ $FIELD_FOR->{CODE} ] > 3;
        # warn Dumper $data[ $FIELD_FOR->{CODE} ];
        next unless grep $data[ $FIELD_FOR->{$_} ] =~ /$term/i, qw(NAME ADDR CITY);
        # warn Dumper join ', ', @data[ @$FIELD_FOR{ qw(NAME ADDR CITY PCODE) } ];
        push @matches, {
            code    => $data[ $FIELD_FOR->{CODE} ],
            display => join ', ', @data[ @$FIELD_FOR{ qw(NAME ADDR1 CITY PCODE) } ],
        };
    }
    # return ref to array of hashrefs:
    return \@matches;
}
=cut

#------------------------------------------------------------------------------
sub _get_file {
    my $self = shift;
    my $attr = shift || die 'attribute arg required';
    my $location_type = $self->{LOCATION_TYPE} or die 'Cannot get location type';
    # warn 'LOCATION TYPE:' . $location_type;
    my %args = (
        yaml_dir => $cfg->{settings}->{yaml_dir},
        app_root => $path_to_app_root,
        filename => 'cfh_data_files',
    );
    my $cfh_data_file = LIMS::Local::Utils::get_yaml(\%args);
    # warn Dumper $cfh_data_file;
    return $cfh_data_file->{$location_type}->{$attr};
}

#------------------------------------------------------------------------------
# previously used Text::CSV::Simple - faster than ParseExcel, but slower than a
# direct read (IO::All->getline); add_trigger() gave it a speed boost here.
# now uses Text::CSV::Slurp - faster than Text::CSV::Simple & better error handling
sub _read_file {
    my $self = shift;
    my $term = shift; # optional term to search for
    my $file     = $self->source;
    my $src_file = $path_to_app_root . '/src/data/' . $file . '.csv';
    # get field definitions arrayref (eg code name null addr1, etc):
    my $fields = $self->_get_file('fields');
    # warn Dumper $fields;
    my $unique = LIMS::Local::Utils::get_unique_elements($fields);

=begin
    # replaced with Text::CSV::Slurp
    my $parser = Text::CSV::Simple->new();
    $parser->field_map(@$fields);
    if ($term) {
        # warn Dumper $term;
        # get array positions of @$fields non-null vals:
        my @indexes = grep { $fields->[$_] ne 'null' } (0 .. @$fields - 1);
        # warn @indexes;
        # this is faster than collecting all rows & discarding all not matching $term
        $parser->add_trigger( after_parse => sub {
            my ($self, $data) = @_;
            # warn Dumper $data;
            die unless grep $data->[$_] =~ /$term/i, @indexes; # 'die' is OK - see docs
        });
    }
    my @data = $parser->read_file($path_to_file . '.csv');
    # timings($t0);
=cut

    my %args = (file => $src_file, field_order => $fields);
    my $rows = $self->_text_csv_slurp(\%args);
    if (! $term) { return $rows } # return immediately if nothing to search for
    no warnings 'uninitialized'; # possibly empty cells
    my @data = grep { grep /$term/i, @{$_}{@$unique} } @$rows;
    # warn Dumper \@data;
    return \@data;
}

#------------------------------------------------------------------------------
sub _get_locales {
    my $self = shift;
    my $file   = $path_to_app_root . '/src/data/etr.csv';
    my @fields = qw(code organisation);
    my %args = (file => $file, field_order => \@fields);
    my $data = $self->_text_csv_slurp(\%args);
    my %locales = map { $_->{code} => uc $_->{organisation} } @$data;
    return \%locales;
}

#------------------------------------------------------------------------------
sub _get_specialities {
    my $self = shift;
    my $file   = $path_to_app_root .
        '/src/data/specialities.csv';
    my @fields = qw(code speciality);
    my %args = (file => $file, field_order => \@fields);
    my $data = $self->_text_csv_slurp(\%args);
    my %specialities = map { $_->{code} => uc $_->{speciality} } @$data;
    return \%specialities;
}

#------------------------------------------------------------------------------
sub _manipulate_clinician_data {
    my $self = shift;
    my $row  = shift;
    my $id = sprintf '%s %s [%s]', @{$row}{qw(name init sex)};
    delete @{$row}{qw(name init sex)};
    $row->{name} = $id;
    my $org_code = $row->{org_code};
    $row->{org_code} = $locales->{$org_code} || "UNKNOWN LOCATION [$org_code]";
    my $speciality_code = $row->{speciality_code};
    $row->{speciality_code} = $specialities->{$speciality_code}
        || "UNKNOWN SPECIALITY [$speciality_code]";
}

#------------------------------------------------------------------------------
sub timings {
    # my $t0 = shift;
    # warn sprintf "%.4f sec\n", tv_interval $t0, [gettimeofday];
}

1;
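
# Illustrative check of source-file age via filedata(); the 'hospital' location
# type and the 12-month threshold are assumptions for this sketch:
#
#   my $xl = LIMS::Local::ExcelHandler->new;
#   $xl->source('hospital');
#   my $info = $xl->filedata;   # { filename => '...', age => <months> }
#   warn "$info->{filename} is $info->{age} months old - consider refreshing the source data"
#       if $info->{age} > 12;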