Source code for sambuca_core.spectra_readers

# -*- coding: utf-8 -*-
""" Contains functions for loading collections of spectra from Sambuca
spectral database directories. """

from __future__ import absolute_import, division, print_function, unicode_literals
from builtins import *

import os

import numpy as np
import pandas as pd
import spectral.io.envi as envi
import spectral.io.spyfile as spyfile
import xlrd

from .exceptions import UnsupportedDataFormatError, DataValidationError
from .utility import list_files, strictly_increasing, merge_dictionary


def _validate_spectra_dataframe(spectra_dataframe):
    """ Internal function to validate a spectra data frame.

    A data frame is accepted only when all of the following hold:

    * it has at least two rows, so that a band spacing exists to check;
    * its index (the band-centre wavelengths) passes the
      ``strictly_increasing`` helper;
    * every difference between consecutive index values is exactly 1.0;
    * every column has a numpy-compatible numeric dtype.

    Args:
        spectra_dataframe (pandas.DataFrame): the spectra to validate. The
            index holds the band-centre wavelengths and each column holds
            one spectrum.

    Returns:
        bool: True if the spectra is valid; otherwise false.
    """

    wavelengths = spectra_dataframe.index

    # With fewer than two bands there is no spacing to inspect, and the
    # min()/max() reductions below would raise ValueError on the empty
    # difference array (e.g. for an empty worksheet). Such frames are
    # reported as invalid rather than crashing the caller.
    if len(wavelengths) < 2:
        return False

    # are the band-centre wavelengths strictly increasing?
    if not strictly_increasing(wavelengths):
        return False

    # Are the wavelength spacings acceptable?
    # For now, only spectra that are specified with exact
    # 1nm bands are supported.
    band_diffs = np.ediff1d(wavelengths)
    if band_diffs.min() < 1.0 or band_diffs.max() > 1.0:
        # TODO: log warning about interpolation/averaging not being supported
        return False

    # The dtype of every column needs to be a numpy-compatible number
    numeric_columns = spectra_dataframe.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) != len(spectra_dataframe.columns):
        return False

    return True


def _add_dataframe_spectra_to_dictionary(dataframe, base_name, dictionary=None):
    """ Adds all spectra from a dataframe to a dictionary, building the spectra
    name as 'base_name:column_name'

    Args:
        dataframe (pandas.DataFrame): spectra to add. The index supplies the
            wavelengths and each column supplies one spectrum.
        base_name (str): prefix for every key. It is converted to lower case;
            the column name is used as-is.
        dictionary (dict): Optional dictionary to add the spectra to. It is
            updated in place. A new dictionary is created only when this
            argument is None.

    Returns:
        dict: The dictionary that received the spectra. Each value is a
            2-tuple of (wavelengths, spectrum values). An existing entry
            with the same key is overwritten.
    """
    # Test against None rather than truthiness, so that an empty dictionary
    # supplied by the caller is filled in place just like a non-empty one.
    if dictionary is None:
        dictionary = {}

    prefix = base_name.lower()
    for column in dataframe:
        key = "{0}:{1}".format(prefix, column)
        # Each entry gets its own copy of the wavelength array.
        dictionary[key] = (np.array(dataframe.index), dataframe[column].values)
    return dictionary


def load_csv_spectral_library(filename, validate=True):
    """ Loads a spectral library from a CSV file.
    The CSV file must have a header row, and the wavelengths must be the first
    column.

    Args:
        filename (str): full path to the CSV file.
        validate (bool): If true, data validation will be performed.

    Returns:
       dict: A dictionary of 2-tuples of numpy.ndarrays.
            The first element contains the band centre wavelengths,
            while the second element contains the spectra.
            The dictionary is keyed by spectra name, formed by concatenation
            of the file and column names, separated by a colon. This allows
            multiple spectra from multiple files to be unambiguously
            collected into a dictionary.
            Note that the filename component is always converted to lower case.
            This is required for consistent results on Linux and Windows.

    Raises:
        DataValidationError: if ``validate`` is true and the data fails
            validation.
    """
    # The first column becomes the index, i.e. the wavelengths.
    dataframe = pd.read_csv(filename, index_col=0)
    if validate and not _validate_spectra_dataframe(dataframe):
        raise DataValidationError("{0} failed validation".format(filename))

    # Key prefix is the file name without directory or extension.
    base_name, _ = os.path.splitext(os.path.basename(filename))
    return _add_dataframe_spectra_to_dictionary(dataframe, base_name)


def load_excel_spectral_library(filename, sheet_names=None, validate=True):
    """ Loads a spectral library from an Excel file. Both new style XLSX and
    old-style XLS formats are supported.

    Worksheets that cannot be parsed, or that fail validation when
    ``validate`` is true, are skipped silently rather than raising.

    Args:
        filename (str): full path to the Excel file.
        sheet_names (list): Optional list of worksheet names to load.
            The default is to attempt to load all worksheets.
        validate (bool): If true, data validation will be performed.

    Returns:
       dict: A dictionary of 2-tuples of numpy.ndarrays.
            The first element contains the band centre wavelengths,
            while the second element contains the spectra.
            The dictionary is keyed by spectra name, formed by concatenation
            of the file and column names, separated by a colon. The sheet
            name is not part of the key, so a column header repeated on a
            later sheet replaces the earlier entry.
            Note that the filename component is always converted to lower case.
            This is required for consistent results on Linux and Windows.
    """
    all_spectra = {}
    with pd.ExcelFile(filename) as excel_file:
        base_name, _ = os.path.splitext(os.path.basename(filename))
        # default is all sheets
        if not sheet_names:
            sheet_names = excel_file.sheet_names

        for sheet in sheet_names:
            # Only the parse call is guarded: it is the step that reads the
            # worksheet. The first column becomes the wavelength index.
            try:
                dataframe = excel_file.parse(sheet, index_col=0)
            except xlrd.biffh.XLRDError:
                # TODO: log warning about invalid sheet
                continue

            if validate and not _validate_spectra_dataframe(dataframe):
                continue

            all_spectra = _add_dataframe_spectra_to_dictionary(
                dataframe, base_name, all_spectra
            )

    return all_spectra


def load_envi_spectral_library(directory, base_filename, validate=True):
    """ Loads spectra from an ENVI spectral library.

    The header is read from ``<base_filename>.hdr`` and the data from
    ``<base_filename>.lib``, both inside ``directory``.

    Args:
        directory (str): Directory containing the spectral library file.
        base_filename (str): The filename without the extension or '.'
            preceding the extension.
        validate (bool): If true, data validation will be performed.

    Returns:
        dict: A dictionary of 2-tuples of numpy.ndarrays.
            The first element contains the band centre wavelengths,
            while the second element contains the spectra.
            The dictionary is keyed by spectra name, formed by concatenation
            of the file and spectrum names, separated by a colon. This allows
            multiple spectra from multiple files to be unambiguously
            collected into a dictionary.
            Note that the filename component is always converted to lower case.
            This is required for consistent results on Linux and Windows.

    Raises:
        IOError: if the spectral library files cannot be found.
        DataValidationError: if ``validate`` is true and the data fails
            validation.
    """

    full_filename = os.path.join(directory, base_filename)
    header_path = "{0}.{1}".format(full_filename, "hdr")
    library_path = "{0}.{1}".format(full_filename, "lib")

    # load the spectral library, reporting a missing file as a plain IOError
    try:
        spectral_library = envi.open(header_path, library_path)
    except spyfile.FileNotFoundError as exception:
        raise IOError(exception)

    # convert to a DataFrame for processing: after the transpose the band
    # centres index the rows and each named spectrum is a column
    dataframe = pd.DataFrame(
        spectral_library.spectra.transpose(), index=spectral_library.bands.centers
    )
    dataframe.columns = spectral_library.names

    if validate and not _validate_spectra_dataframe(dataframe):
        raise DataValidationError(
            "Spectral library {0} failed validation".format(base_filename)
        )

    # merge the spectra into a dictionary
    return _add_dataframe_spectra_to_dictionary(dataframe, base_filename)


def load_all_spectral_libraries(path, validate=True):
    """ Loads all valid spectra from the given location.

    Excel (xls, xlsx), CSV and ENVI (lib) files reported by ``list_files``
    are loaded in that order, and their spectra are merged into a single
    dictionary with ``merge_dictionary``.

    Args:
        path (str): The directory path to scan for supported spectra files.
        validate (bool): If true, data validation will be performed.

    Returns:
        dict: A dictionary of 2-tuples of numpy.ndarrays.
            The first element contains the band centre wavelengths of the input
            bands, while the second element contains the spectra values.
            Dictionary is keyed by spectra name built from the file and
            column/spectrum names, separated by a colon.

            Note that names are not disambiguated here; how clashing names
            are resolved is decided by ``merge_dictionary``.

            Note that the filename component is always converted to lower case.
            This is required for consistent results on Linux and Windows.
    """
    # TODO: add logging

    all_spectra = {}

    # A file is merged only after it loaded successfully. If the load raises
    # UnsupportedDataFormatError the file is skipped, so the previous file's
    # spectra are never merged a second time.
    # NOTE(review): DataValidationError raised by the CSV and ENVI loaders is
    # not caught here and aborts the whole scan - confirm this is intended.

    # excel files
    for excel_path in list_files(path, ["xls", "xlsx"]):
        try:
            new_spectra = load_excel_spectral_library(excel_path, validate=validate)
        except UnsupportedDataFormatError:
            # TODO: logging
            continue
        merge_dictionary(all_spectra, new_spectra)

    # CSV files
    for csv_path in list_files(path, ["csv"]):
        try:
            new_spectra = load_csv_spectral_library(csv_path, validate=validate)
        except UnsupportedDataFormatError:
            # TODO: logging
            continue
        merge_dictionary(all_spectra, new_spectra)

    # Spectral Libraries
    for library_path in list_files(path, ["lib"]):
        # The ENVI loader takes a directory plus an extension-less base name.
        base_name, _ = os.path.splitext(os.path.basename(library_path))
        try:
            new_spectra = load_envi_spectral_library(path, base_name, validate=validate)
        except UnsupportedDataFormatError:
            # TODO: logging
            continue
        merge_dictionary(all_spectra, new_spectra)

    return all_spectra


def load_spectral_library(filename, validate=True):
    """ Loads a single spectral library from the given file name from any
    supported format (selected by file extension).

    The extension is matched case-insensitively: ``xls``/``xlsx`` select the
    Excel loader, ``csv`` the CSV loader, and ``hdr``/``lib`` the ENVI loader.

    Args:
        filename (str): full path to the file.
        validate (bool): If true, data validation will be performed.

    Returns:
        dict: A dictionary of 2-tuples of numpy.ndarrays.
            The first element contains the band centre wavelengths of the input
            bands, while the second element contains the spectra values.
            Dictionary is keyed by spectra name built from the file and
            column/spectrum names, separated by a colon.
            For example: ``moreton_bay_speclib:white_sand``

            Note that the filename component is always converted to lower case.
            This is required for consistent results on Linux and Windows.

    Raises:
        IOError: if ``filename`` is not an existing file.
        UnsupportedDataFormatError: if the file extension is not one of the
            supported formats.
    """
    # TODO: add logging

    if not os.path.isfile(filename):
        raise IOError(filename)

    base_name, extension = os.path.splitext(os.path.basename(filename))
    # drop the leading '.' and ignore case
    extension = extension[1:].lower()

    # excel
    if extension in ("xls", "xlsx"):
        return load_excel_spectral_library(filename, validate=validate)
    # CSV
    if extension == "csv":
        return load_csv_spectral_library(filename, validate=validate)
    # ENVI Spectral Libraries: the loader wants the directory and the
    # extension-less name, and locates the .hdr/.lib pair itself
    if extension in ("hdr", "lib"):
        return load_envi_spectral_library(
            os.path.dirname(filename), base_name, validate=validate
        )

    raise UnsupportedDataFormatError(
        "filename {0} is not a supported format".format(filename)
    )