# RSS Git Download Clone
# Raw Blame History
package LIMS::Local::ExcelHandler;

# ParseExcel_XLHTML speeds up Spreadsheet::ParseExcel; requires xlhtml package (apt-get install xlhtml)
# IO::All & .csv file *much* faster than ParseExcel_XLHTML
# Text::CSV::Simple - about 50% of speed of IO::All->getline, but handles native .csv files better
# Text::CSV::Slurp in conjunction with Text::CSV_XL >0.80 nearly same speed as IO::All
# now using Text::CSV::Slurp::_from_handle() v0.901 patch (mst) direct in _text_csv_slurp()

use strict;
use warnings;

use DateTime;   # required by filedata() - was previously loaded only indirectly
use IO::File;
use Text::CSV;
use File::stat;
use Data::Dumper;
use Text::CSV_XS 0.88; # faster csv parsing
use LIMS::Local::Config;
use LIMS::Local::Utils; # get_yaml() / get_unique_elements() - was previously loaded only indirectly
use Spreadsheet::WriteExcel;
use Spreadsheet::ParseExcel::Simple;

# use Text::CSV::Slurp 0.901; # using v0.901 patch direct now in _text_csv_slurp()
# use Text::CSV::Simple; replaced with T::C::Slurp - faster & avoids lousy on_failure error
# use Spreadsheet::ParseExcel_XLHTML;
# use Spreadsheet::ParseExcel::Simple; # too slow

=begin # add a more explicit warning for Text::CSV::Simple errors:
Text::CSV::Simple->add_trigger(on_failure => sub { 
	my ($self, $csv) = @_;
    if ( my $err = $csv->error_input ) {
        warn qq!Text::CSV::Simple parser failed on: "$err"\n!;
        # warn Dumper [$csv->error_diag]; # call error_diag() in list context
    }
});
=cut

# use Time::HiRes qw(gettimeofday tv_interval); my $t0 = [gettimeofday];

# singleton application config; path_to_app_root locates the src/data files
my $cfg = LIMS::Local::Config->instance;
my $path_to_app_root = $cfg->{path_to_app_root}
    || die "Couldn't find path to application directory"; # warn $path_to_app_root;

# lazily populated in _parse_csv_file() when the source is the clinician file
my ($specialities, $locales); # for lookup hashref tables, used if lookup = clinician

#------------------------------------------------------------------------------
sub new {
    my $class = shift;

    # every attribute starts out undefined; they are populated later via
    # source() and initialise_fh()
    my $self = {
        WORKBOOK      => undef,
        LOCATION_TYPE => undef,
        FILEHANDLE    => undef, # for write_excel_file()
    };

    return bless $self, $class;
}

#------------------------------------------------------------------------------
# setter: receives target type (hospital, clinician, etc) as arg, sets target xls file for target type:
# getter & setter: returns xls filename
# combined getter/setter for the source data file.
# Getter (no args): returns the xls filename resolved from the yml config.
# Setter (one arg): stores the location type (hospital, clinician, etc) and
# resolves the matching data file for it.
sub source {
    my $self = shift;

    # getter: nothing to set, just hand back the stored filename
    return $self->{WORKBOOK} unless @_;

    $self->{LOCATION_TYPE} = shift; # warn $self->{LOCATION_TYPE};

    $self->{WORKBOOK} = $self->_get_file('filename')
        || die "Cannot find target xls file for location type: $self->{LOCATION_TYPE}";
}

#------------------------------------------------------------------------------
# returns a hashref describing the current source file: its basename and its
# age in whole months, based on the .csv file's modification time.
sub filedata {
    my $self = shift;

    my $src_file     = $self->source;
    my $path_to_file = $path_to_app_root . '/src/data/' . $src_file . '.csv'; # warn $path_to_file;

    # File::stat::stat() returns undef for a missing/unreadable file; the old
    # code then died inside ->mtime with an obscure "undefined value" error
    my $st = stat($path_to_file)
        or die "Cannot stat source file: $path_to_file ($!)";

    my $filedate      = DateTime->from_epoch( epoch => $st->mtime );
    my $age_in_months = DateTime->now->subtract_datetime($filedate)->delta_months;

    return { filename => $src_file, age => $age_in_months };
}

#------------------------------------------------------------------------------
# parse source file for $terms:
# parse the source file for the supplied search term; returns a reference to
# an array of { code, display } hashrefs.
sub parse {
    my ($self, $term) = @_;

    # earlier implementations (_parse_excel, _parse_excel_xlhtml) were far
    # slower and survive only as commented-out code further down the file
    return $self->_parse_csv_file($term);
}

#------------------------------------------------------------------------------
# read an entire worksheet from an .xls file into an arrayref of row
# arrayrefs. $args is optional; $args->{sheet} selects the worksheet number
# (defaults to the first sheet).
sub parse_xl_file {
    my ($self, $xl_file, $args) = @_; # warn $xl_file;

    my $sheet_no = $args->{sheet} || 0;

    -e $xl_file or die "file does not exist: $xl_file";

    # NB: $! is not meaningful after a module-level parse failure, so report
    # the filename instead of the old 'die $!'
    my $xls = Spreadsheet::ParseExcel::Simple->read($xl_file)
        or die "failed to parse spreadsheet: $xl_file";

    my @worksheets = $xls->sheets;

    # guard against a caller asking for a sheet that isn't there - the old
    # code would have died later with an 'undefined value' method-call error
    die "worksheet $sheet_no not found in $xl_file"
        if $sheet_no > $#worksheets;

    my $worksheet = $worksheets[$sheet_no];

    my @data; # array of row arrayrefs
    while ($worksheet->has_data) {
        push @data, [ $worksheet->next_row ]; # next_row() = array mode
    }
    return \@data;
}

#-------------------------------------------------------------------------------
# getter/setter for the filehandle used by write_excel_file().
# Setter: receives a reference to an empty scalar and opens an in-memory
# filehandle onto it. Getter (no args): returns the stored filehandle.
sub initialise_fh {
    my $self = shift;

    if (@_) {
        my $buffer_ref = shift; # reference to an empty container
        open my $fh, '>', $buffer_ref
            or die "failed to open filehandle: $!";
        return $self->{FILEHANDLE} = $fh; # warn $self->{FILEHANDLE};
    }

    return $self->{FILEHANDLE};
}

#-------------------------------------------------------------------------------
# write one or more worksheets to the filehandle set up by initialise_fh().
# $data is an arrayref of sheet hashrefs, each with keys:
#   data    - arrayref of row arrayrefs
#   headers - arrayref of column headings
#   name    - optional sheet name (defaults to 'Sheet<n>')
#   columns - optional hashref of column widths
#   format  - optional arrayref of format flags (e.g. 'text_wrap')
sub write_excel_file {
    my $self = shift;
    my $data = shift;

    my $fh = $self->initialise_fh();

    my $workbook = Spreadsheet::WriteExcel->new($fh);

    for my $sheet (@$data) {
        my $name    = $sheet->{name};
        my $rows    = $sheet->{data};    # renamed: no longer shadows outer $data
        my $headers = $sheet->{headers};

        # default the optional keys: the old code dereferenced them
        # unconditionally and died under strict refs when they were absent
        my $columns    = $sheet->{columns} || {};
        my $opt_format = $sheet->{format}  || [];

        my $wrap = grep { $_ eq 'text_wrap' } @$opt_format;

        my $row = 0; # row counter

        my $worksheet = $workbook->add_worksheet($name);

        { # write headers:
            my $format = $workbook->add_format();
            $format->set_bold();
            $format->set_font("Courier New");
            $format->set_align('center');
            $format->set_align('vcenter');
            $format->set_text_wrap() if $wrap;

            # format columns if required:
            $worksheet->set_column($_, $_, $columns->{$_}) for keys %$columns;

            $worksheet->write( $row++, 0, $headers, $format );
        }
        { # write rows:
            my $format = $workbook->add_format();
            $format->set_text_wrap() if $wrap;

            $worksheet->write( $row++, 0, $_, $format ) for @$rows; # $_ is arrayref
        }
    }

    $workbook->close();
}

#------------------------------------------------------------------------------
# return every row of the current source file (no search term), as an
# arrayref of hashrefs keyed by the configured field names.
sub fetch_all {
    my $self = shift;
    return $self->_read_file;
}

#------------------------------------------------------------------------------
# return selected columns from rows matching a field/value pair.
# $args keys:
#   fields - arrayref of columns whose values are returned
#   select - field to filter on, e.g. 'practice_code'
#   value  - required value of that field, e.g. 'B12345'
# returns an arrayref of arrayrefs (one per matching row).
sub fetch_cols {
    my ($self, $args) = @_; # warn Dumper $args;

    my $fields = $args->{fields};
    my $select = $args->{select};
    my $value  = $args->{value};

    my $file     = $self->source;
    my $src_file = $path_to_app_root . '/src/data/' . $file . '.csv'; # warn $src_file;

    my $cols = $self->_get_file('fields'); # field order from the yml config
    my $rows = $self->_text_csv_slurp({ file => $src_file, field_order => $cols });

    # collect the requested columns (hash slice) from each matching row:
    no warnings 'uninitialized'; # eg undefined cols in src file
    my @data;
    for my $row (@$rows) {
        next unless $row->{$select} eq $value;
        push @data, [ @{$row}{@$fields} ];
    }
    return \@data;
}

#-------------------------------------------------------------------------------
# this is Text::CSV::Slurp::_from_handle() v0.901 patch (allows field_order):
# slurp a CSV file into an arrayref of hashrefs - this is the
# Text::CSV::Slurp::_from_handle() v0.901 patch (allows field_order).
# $args keys: file (path), field_order (optional arrayref of column names;
# when absent the first line of the file is used as the header row).
sub _text_csv_slurp {
    my ($self, $args) = @_;

    # lexical 3-arg open replaces the old indirect-object 'new IO::File'
    # (an io($file) via IO::All timed out on Debian 6)
    open( my $io, '<', $args->{file} ) || return $self->error($!);

    my $order = $args->{field_order};

    my $csv = Text::CSV->new({ binary => 1 }); # binary recommended by the docs

    if ( $order ) {
        # caller supplied the column names explicitly
        $csv->column_names($order);
    }
    elsif ( my $head = $csv->getline($io) ) {
        # fall back to the file's own header row
        $csv->column_names( $head );
    }
    else {
        return $self->error( $csv->error_diag() );
    }

    my @results;
    while ( my $ref = $csv->getline_hr($io) ) {
        push @results, $ref;
    }

    return \@results;
}

#------------------------------------------------------------------------------
# search the source CSV file for $term and build the result set: an arrayref
# of { code, display } hashrefs, where display joins every populated field
# except 'code' and 'null' placeholders.
sub _parse_csv_file {
    my ($self, $term) = @_;

    my $src_file = $self->source; # warn $src_file;
    my $data     = $self->_read_file($term);
    my $fields   = $self->_get_file('fields');

    # the clinician source ('econcur') needs lookup tables to turn codes
    # into display strings:
    my $is_clinician = $src_file eq 'econcur';
    if ($is_clinician) {
        $specialities = $self->_get_specialities;
        $locales      = $self->_get_locales;
    }

    my @matches;

    for my $row (@$data) { # warn Dumper $row;
        $self->_manipulate_clinician_data($row) if $is_clinician;

        # every non-empty field, skipping the code field and null columns:
        my @parts;
        for my $field (@$fields) {
            next if $field =~ /^(code|null)$/;
            push @parts, $row->{$field} if $row->{$field};
        }

        push @matches, {
            code    => $row->{code},
            display => join(', ', @parts),
        };
    }

    return \@matches;
}

#------------------------------------------------------------------------------
=begin # fastest method - reads csv file directly:
sub parse_csv_file {
    my $self = shift;
    my $term = shift;

    my $xls_file = $self->source;

    my $path_to_file = $path_to_app_root . '/src/data/' . $xls_file;

    my $io = io($path_to_file . '.csv');

    my @matches; # for results
    # get field definitions hashref (eg CODE => 0, NAME => 1, etc):
    my $FIELD = $self->_get_file('fields');
    # get array of required field positions (eg 0, 1, 4, 5, etc):
    my @field_positions = grep $_, @{$FIELD}{ qw(NAME CODE2 ADDR1 ADDR2 ADDR3 ADDR4 PCODE) };

    use Time::HiRes qw(gettimeofday tv_interval);
    my $t0 = [gettimeofday];

    {    # localise to switch off warnings:
        no warnings 'uninitialized'; # for regex on potentially empty fields

#       timings($t0);
        ROW : while ( my $line = $io->getline ) { # TODO: doesn't work if row contains comma within field
            # split line on comma:
            my @cells = split ',', $line; # warn Dumper @cells; next;
            # get national code, or skip line:
            my $code_cell = $cells[$FIELD->{CODE}] || next ROW; # warn Dumper $code_cell; next;
            # get rest of required fields (if defined) into array:
            my @data_cells = grep $_, @cells[@field_positions]; # warn Dumper \@data_cells; next;

            # look for matches in all collected fields, or skip line:
            next ROW unless grep { $_ =~ /$term/i } (@data_cells, $code_cell);

            push @matches, {
                code    => $code_cell,
                display => join ', ', @data_cells,
            };
        }
       timings($t0);
    }

    return \@matches;
}
=cut

#------------------------------------------------------------------------------
=begin # use the Spreadsheet::ParseExcel_XLHTML method - several times faster than Spreadsheet::ParseExcel:
sub _parse_excel_xhtml {
    my $self = shift;
    my $term = shift;

    my $xls_file = $self->source;

    my $path_to_file = $path_to_app_root . '/src/data/' . $xls_file;

    my $excel = new Spreadsheet::ParseExcel_XLHTML;

    my $workbook  = $excel->Parse($path_to_file . '.xls');
    my $worksheet = $workbook->Worksheet(0);

    my ( $row_min, $row_max ) = $worksheet->row_range();

    my @matches; # results array
    my $FIELD_FOR = $self->_get_file('fields'); # hashref

    ROW : for my $row ( $row_min .. $row_max ) {
        my $code_cell = $worksheet->get_cell( $row, $FIELD_FOR->{CODE} );
        next ROW unless $code_cell; # next ROW unless length $code > 3; need parent code for appropriate matching

        my @location_cells;
        CELL : for my $col ( @{$FIELD_FOR}{ qw(NAME CODE2 ADDR1 ADDR2 ADDR3 ADDR4 PCODE) } ) {
            my $cell = $worksheet->get_cell( $row, $col );
            next CELL unless $cell;
            push @location_cells, $cell;
        }

        next ROW unless grep $_->value =~ /$term/i, (@location_cells, $code_cell);

        push @matches, {
            code    => $code_cell->value,
            display => join ', ', map { $_->value } @location_cells,
        };
    }

    return \@matches;
}
=cut

#------------------------------------------------------------------------------
=begin # uses Spreadsheet::ParseExcel::Simple - much slower than Spreadsheet::ParseExcel_XLHTML
sub _parse_excel {
    my $self = shift;
    my $term = shift;

    my $xls_file = $self->source_file;

    my $path_to_file = $path_to_app_root . '/src/data/' . $xls_file;

    my $xls = Spreadsheet::ParseExcel::Simple->read($path_to_file . '.xls');

    my @worksheets = $xls->sheets;
    my $worksheet  = $worksheets[0];

    my $FIELD_FOR = $self->_get_file('fields');

    my @matches; # results array

    while ($worksheet->has_data) {
        my @data = $worksheet->next_row; # warn Dumper \@data;

        # skip regional entries eg RR8, RAE, etc
        next unless length $data[ $FIELD_FOR->{CODE} ] > 3; # warn Dumper $data[ $FIELD_FOR->{CODE} ];

        next unless grep $data[ $FIELD_FOR->{$_} ] =~ /$term/i, qw(NAME ADDR CITY);
            # warn Dumper join ', ', @data[ @$FIELD_FOR{ qw(NAME ADDR CITY PCODE) } ];

        push @matches, {
            code    => $data[ $FIELD_FOR->{CODE} ],
            display => join ', ', @data[ @$FIELD_FOR{ qw(NAME ADDR1 CITY PCODE) } ],
        };
    }

    # return ref to array of hashrefs:
    return \@matches;
}
=cut

#------------------------------------------------------------------------------
# look up an attribute ('filename', 'fields', ...) for the current location
# type in the cfh_data_files yml config. Dies if no attribute is supplied or
# no location type has been set via source().
sub _get_file {
    my ($self, $attr) = @_;
    $attr or die 'attribute arg required';

    my $location_type = $self->{LOCATION_TYPE}
        or die 'Cannot get location type'; # warn 'LOCATION TYPE:'.$location_type;

    my $cfh_data_file = LIMS::Local::Utils::get_yaml({
        yaml_dir => $cfg->{settings}->{yaml_dir},
        app_root => $path_to_app_root,
        filename => 'cfh_data_files',
    });

    return $cfh_data_file->{$location_type}->{$attr};
}

#------------------------------------------------------------------------------
# uses Text::CSV::Simple - faster than ParseExcel, but slower than direct read
# (IO::All->getline); but speed boost using add_trigger() in _read_file()
# now uses Text::CSV::Slurp - faster than Text::CSV::Simple & better error handling
# read the whole source CSV file; with an optional $term, return only the
# rows where any (unique) field matches it case-insensitively.
# (An earlier Text::CSV::Simple implementation using an after_parse trigger
# was replaced by the faster Text::CSV::Slurp-style _text_csv_slurp().)
sub _read_file {
    my ($self, $term) = @_; # $term is an optional search string

    my $file     = $self->source;
    my $src_file = $path_to_app_root . '/src/data/' . $file . '.csv';

    # field definitions arrayref (eg code name null addr1, etc):
    my $fields = $self->_get_file('fields'); # warn Dumper $fields;
    my $unique = LIMS::Local::Utils::get_unique_elements($fields);

    my $rows = $self->_text_csv_slurp({ file => $src_file, field_order => $fields });

    # nothing to search for - hand back every row:
    return $rows unless $term;

    # keep rows where at least one of the unique fields matches $term:
    no warnings 'uninitialized'; # possibly empty cells
    my @data = grep { grep { /$term/i } @{$_}{@$unique} } @$rows;
    return \@data;
}

#------------------------------------------------------------------------------
# build a lookup table of organisation code => upper-cased organisation name
# from the etr.csv data file; returns a hashref.
sub _get_locales {
    my $self = shift;

    my $data = $self->_text_csv_slurp({
        file        => $path_to_app_root . '/src/data/etr.csv',
        field_order => [qw(code organisation)],
    });

    my %locales;
    $locales{ $_->{code} } = uc $_->{organisation} for @$data;

    return \%locales;
}

#------------------------------------------------------------------------------
# build a lookup table of speciality code => upper-cased speciality name
# from the specialities.csv data file; returns a hashref.
sub _get_specialities {
    my $self = shift;

    my $data = $self->_text_csv_slurp({
        file        => $path_to_app_root . '/src/data/specialities.csv',
        field_order => [qw(code speciality)],
    });

    my %specialities;
    $specialities{ $_->{code} } = uc $_->{speciality} for @$data;

    return \%specialities;
}

#------------------------------------------------------------------------------
# rewrite a clinician row in place for display: collapse name/init/sex into a
# single 'SURNAME INIT [SEX]' name, and resolve org/speciality codes via the
# $locales / $specialities lookup tables (populated by _parse_csv_file),
# flagging any code not found there.
sub _manipulate_clinician_data {
    my ($self, $row) = @_;

    my $id = sprintf '%s %s [%s]', @{$row}{qw(name init sex)};
    delete @{$row}{qw(name init sex)}; # hash-slice delete (was void-context map)
    $row->{name} = $id;

    my $org_code = $row->{org_code};
    $row->{org_code} = $locales->{$org_code}
        || "UNKNOWN LOCATION [$org_code]";

    my $speciality_code = $row->{speciality_code};
    $row->{speciality_code} = $specialities->{$speciality_code}
        || "UNKNOWN SPECIALITY [$speciality_code]";
}

#------------------------------------------------------------------------------
# disabled profiling hook: when the commented lines (and the Time::HiRes
# import near the top of the file) are re-enabled, it warns the elapsed time
# since the [gettimeofday] timestamp passed in by the caller. Currently a
# deliberate no-op so call sites can stay in place.
sub timings {
#    my $t0 = shift;
#    warn sprintf "%.4f sec\n", tv_interval $t0, [gettimeofday];
}

1;