# Source code for magni.reproducibility.io

"""
..
    Copyright (c) 2014-2017, Magni developers.
    All rights reserved.
    See LICENSE.rst for further information.

Module providing input/output functions to databases containing results from
reproducible research.

Routine listings
----------------
annotate_database(h5file)
    Function for annotating an existing HDF5 database.
chase_database(h5file)
    Function for chasing an existing HDF5 database.
create_database(path, overwrite=True)
    Function for creating a new annotated and chased HDF5 database.
read_annotations(h5file)
    Function for reading annotations in an HDF5 database.
read_chases(h5file)
    Function for reading chases in an HDF5 database.
remove_annotations(h5file)
    Function for removing annotations in an HDF5 database.
remove_chases(h5file)
    Function for removing chases in an HDF5 database.
write_custom_annotation(h5file, annotation_name, annotation_value,
    annotations_sub_group=None)
    Write a custom annotation to an HDF5 database.

See Also
--------
magni.reproducibility._annotation.get_conda_info : Conda annotation
magni.reproducibility._annotation.get_git_revision : Git annotation
magni.reproducibility._annotation.get_platform_info : Platform annotation
magni.reproducibility._annotation.get_datetime : Date and time annotation
magni.reproducibility._annotation.get_magni_config : Magni config annotation
magni.reproducibility._annotation.get_magni_info : Magni info annotation
magni.reproducibility._chase.get_main_file_name : Magni main file name chase
magni.reproducibility._chase.get_main_file_source : Magni source code chase
magni.reproducibility._chase.get_main_source : Magni main source code chase
magni.reproducibility._chase.get_stack_trace : Magni stack trace chase

"""

from __future__ import division
import json
import os

import tables

from magni.reproducibility import _annotation
from magni.reproducibility import _chase
from magni.utils.multiprocessing import File as _File
from magni.utils.validation import decorate_validation as _decorate_validation
from magni.utils.validation import validate_generic as _generic
from magni.utils.validation import validate_numeric as _numeric


def annotate_database(h5file):
    """
    Store Magni and platform information as annotations in an HDF5 database.

    A group named 'annotations' is created in the root of `h5file` and one
    array node is written to it per annotation. Each node holds the encoded
    JSON representation of the value returned by the corresponding function in
    `magni.reproducibility._annotation`.

    Parameters
    ----------
    h5file : tables.file.File
        The handle to the HDF5 database that should be annotated.

    Raises
    ------
    tables.NodeError
        If creating the annotations group or one of its nodes fails with a
        `tables.NodeError`; this is reported as the database already having
        been annotated.

    See Also
    --------
    magni.reproducibility._annotation.get_conda_info : Conda annotation
    magni.reproducibility._annotation.get_git_revision : Git annotation
    magni.reproducibility._annotation.get_platform_info : Platform annotation
    magni.reproducibility._annotation.get_datetime : Date and time annotation
    magni.reproducibility._annotation.get_magni_config : Magni config
        annotation
    magni.reproducibility._annotation.get_magni_info : Magni info annotation

    Notes
    -----
    The following annotations are written:

    * conda_info - Information about the Continuum Anaconda install
    * git_revision - Git revision and tag of Magni
    * platform_info - Information about the current platform (system)
    * datetime - The current date and time
    * magni_config - Information about the current configuration of Magni
    * magni_info - Information from `help(magni)`

    Examples
    --------
    Annotate the database named 'db.hdf5':

    >>> import magni
    >>> from magni.reproducibility.io import annotate_database
    >>> with magni.utils.multiprocessing.File('db.hdf5', mode='a') as h5file:
    ...     annotate_database(h5file)

    """

    @_decorate_validation
    def validate_input():
        _generic('h5file', tables.file.File)

    validate_input()

    # All annotation values are collected and serialised before the database
    # is touched.
    sources = (('conda_info', _annotation.get_conda_info),
               ('git_revision', _annotation.get_git_revision),
               ('platform_info', _annotation.get_platform_info),
               ('datetime', _annotation.get_datetime),
               ('magni_config', _annotation.get_magni_config),
               ('magni_info', _annotation.get_magni_info))

    annotations = {}
    for name, getter in sources:
        annotations[name] = json.dumps(getter())

    try:
        group = h5file.create_group('/', 'annotations')
        for name, payload in annotations.items():
            h5file.create_array(group, name, obj=payload.encode())
        h5file.flush()
    except tables.NodeError:
        message = ('The database has already been annotated. '
                   'Remove the existing annotation prior to '
                   '(re)annotating the database.')
        raise tables.NodeError(message)


def chase_database(h5file):
    """
    Store information about the calling program as chases in an HDF5 database.

    A group named 'chases' is created in the root of `h5file` and one array
    node is written to it per chase. Each node holds the encoded JSON
    representation of the value returned by the corresponding function in
    `magni.reproducibility._chase`, which describes the program execution that
    led to this chase of the database.

    Parameters
    ----------
    h5file : tables.file.File
        The handle to the HDF5 database that should be chased.

    Raises
    ------
    tables.NodeError
        If creating the chases group or one of its nodes fails with a
        `tables.NodeError`; this is reported as the database already having
        been chased.

    See Also
    --------
    magni.reproducibility._chase.get_main_file_name : Name of main file
    magni.reproducibility._chase.get_main_file_source : Main file source code
    magni.reproducibility._chase.get_main_source : Source code around main
    magni.reproducibility._chase.get_stack_trace : Complete stack trace

    Notes
    -----
    The following chases are written:

    * main_file_name - Name of the main file/script that called this function
    * main_file_source - Full source code of the main file/script
    * main_source - Extract of main file source code that called this function
    * stack_trace - Complete stack trace up until the call to this function

    Examples
    --------
    Chase the database named 'db.hdf5':

    >>> import magni
    >>> from magni.reproducibility.io import chase_database
    >>> with magni.utils.multiprocessing.File('db.hdf5', mode='a') as h5file:
    ...     chase_database(h5file)

    """

    @_decorate_validation
    def validate_input():
        _generic('h5file', tables.file.File)

    validate_input()

    # All chase values are collected and serialised before the database is
    # touched.
    sources = (('main_file_name', _chase.get_main_file_name),
               ('main_file_source', _chase.get_main_file_source),
               ('main_source', _chase.get_main_source),
               ('stack_trace', _chase.get_stack_trace))

    chases = {}
    for name, getter in sources:
        chases[name] = json.dumps(getter())

    try:
        group = h5file.create_group('/', 'chases')
        for name, payload in chases.items():
            h5file.create_array(group, name, obj=payload.encode())
        h5file.flush()
    except tables.NodeError:
        message = ('The database has already been chased. '
                   'Remove the existing chase prior to '
                   '(re)chasing the database.')
        raise tables.NodeError(message)


def create_database(path, overwrite=True):
    """
    Create a new HDF database that is both annotated and chased.

    The file at `path` is opened in write mode, annotated using
    `magni.reproducibility.io.annotate_database` and then chased using
    `magni.reproducibility.io.chase_database`. If the `overwrite` flag is
    true, an existing database at `path` is overwritten.

    Parameters
    ----------
    path : str
        The path to the HDF file that is to be created.
    overwrite : bool
        The flag that indicates if an existing database should be overwritten
        (the default is True).

    Raises
    ------
    IOError
        If `overwrite` is false and `path` already exists.

    See Also
    --------
    magni.reproducibility.io.annotate_database : Database annotation
    magni.reproducibility.io.chase_database : Database chase

    Examples
    --------
    Create a new database named 'new_db.hdf5':

    >>> from magni.reproducibility.io import create_database
    >>> create_database('new_db.hdf5')

    """

    @_decorate_validation
    def validate_input():
        _generic('path', 'string')
        _numeric('overwrite', 'boolean')

    validate_input()

    # The filesystem is only consulted when overwriting is not permitted.
    if not overwrite:
        if os.path.exists(path):
            raise IOError('{!r} already exists in filesystem.'.format(path))

    with _File(path, mode='w') as new_database:
        annotate_database(new_database)
        chase_database(new_database)


def read_annotations(h5file):
    """
    Read all annotations stored in an HDF5 database.

    Parameters
    ----------
    h5file : tables.file.File
        The handle to the HDF5 database from which the annotations are read.

    Returns
    -------
    annotations : dict
        The annotations read from the HDF5 database.

    Raises
    ------
    tables.NoSuchNodeError
        If the database has no 'annotations' node in its root.
    ValueError
        If the annotations to the HDF5 database do not conform to the Magni
        annotation standard.

    Notes
    -----
    The returned dict holds a key for each annotation in the database, with
    the JSON-decoded annotation as its value. Groups nested under the
    annotations group appear as nested dicts. See
    `magni.reproducibility.annotate_database` for examples of such annotations.

    Examples
    --------
    Read annotations from the database named 'db.hdf5':

    >>> import magni
    >>> from magni.reproducibility.io import read_annotations
    >>> with magni.utils.multiprocessing.File('db.hdf5', mode='r') as h5file:
    ...    annotations = read_annotations(h5file)

    """

    @_decorate_validation
    def validate_input():
        _generic('h5file', tables.file.File)

    validate_input()

    try:
        annotations_root = h5file.get_node('/', name='annotations')
    except tables.NoSuchNodeError:
        raise tables.NoSuchNodeError('The database has not been annotated.')

    # The recursive reader fills the dict in place, descending into subgroups.
    collected = {}
    _recursive_annotation_read(annotations_root, collected)

    return collected


def read_chases(h5file):
    """
    Read the chases to an HDF5 database.

    Parameters
    ----------
    h5file : tables.file.File
        The handle to the HDF5 database from which the chases are read.

    Returns
    -------
    chases : dict
        The chases read from the HDF5 database.

    Raises
    ------
    tables.NoSuchNodeError
        If the database has no 'chases' node in its root.
    ValueError
        If the chases to the HDF5 database do not conform to the Magni chases
        standard.

    Notes
    -----
    The returned dict holds a key for each leaf node directly under the chases
    group. The value corresponding to a given key is the JSON-decoded content
    of that node. See `magni.reproducibility.chase_database` for examples of
    such chases.

    Examples
    --------
    Read chases from the database named 'db.hdf5':

    >>> import magni
    >>> from magni.reproducibility.io import read_chases
    >>> with magni.utils.multiprocessing.File('db.hdf5', mode='r') as h5file:
    ...    chases = read_chases(h5file)

    """

    @_decorate_validation
    def validate_input():
        _generic('h5file', tables.file.File)

    validate_input()

    try:
        h5_chases = h5file.get_node('/', name='chases')
    except tables.NoSuchNodeError:
        raise tables.NoSuchNodeError('The database has not been chased.')

    chases = dict()
    try:
        for chase, h5_chase in h5_chases._v_leaves.items():
            chases[chase] = json.loads(h5_chase.read().decode())
    except ValueError as e:
        # A ValueError may carry no arguments; indexing `e.args` blindly would
        # replace the real problem with an IndexError.
        reason = e.args[0] if e.args else str(e)
        raise ValueError('Unable to read the {!r} chase. '.format(chase) +
                         'It seems that the chase does not conform to the ' +
                         'Magni chase standard ({!r}).'.format(reason))

    return chases


def remove_annotations(h5file):
    """
    Delete the annotations group from an HDF5 database.

    The 'annotations' node in the root of `h5file` is removed recursively. A
    database without such a node is left untouched and no error is raised.

    Parameters
    ----------
    h5file : tables.file.File
        The handle to the HDF5 database from which the annotations are removed.

    Examples
    --------
    Remove annotations from the database named 'db.hdf5':

    >>> import magni
    >>> from magni.reproducibility.io import remove_annotations
    >>> with magni.utils.multiprocessing.File('db.hdf5', mode='a') as h5file:
    ...    remove_annotations(h5file)

    """

    @_decorate_validation
    def validate_input():
        _generic('h5file', tables.file.File)

    validate_input()

    try:
        h5file.remove_node('/', name='annotations', recursive=True)
        h5file.flush()
    except tables.NoSuchNodeError:
        # Removing annotations that do not exist is deliberately a no-op.
        return


def remove_chases(h5file):
    """
    Delete the chases group from an HDF5 database.

    The 'chases' node in the root of `h5file` is removed recursively. A
    database without such a node is left untouched and no error is raised.

    Parameters
    ----------
    h5file : tables.file.File
        The handle to the HDF5 database from which the chases are removed.

    Examples
    --------
    Remove chases from the database named 'db.hdf5':

    >>> import magni
    >>> from magni.reproducibility.io import remove_chases
    >>> with magni.utils.multiprocessing.File('db.hdf5', mode='a') as h5file:
    ...    remove_chases(h5file)

    """

    @_decorate_validation
    def validate_input():
        _generic('h5file', tables.file.File)

    validate_input()

    try:
        h5file.remove_node('/', name='chases', recursive=True)
        h5file.flush()
    except tables.NoSuchNodeError:
        # Removing chases that do not exist is deliberately a no-op.
        return


def write_custom_annotation(h5file, annotation_name, annotation_value,
                            annotations_sub_group=None):
    """
    Write a custom annotation to an HDF5 database.

    The JSON representation of `annotation_value` is stored in `h5file` as an
    array node named `annotation_name`, placed under "/annotations" or under
    one of its sub groups. Missing parent groups are created.

    Parameters
    ----------
    h5file : tables.file.File
        The handle to the HDF5 database to which the annotation is written.
    annotation_name : str
        The name of the annotation to write.
    annotation_value : a JSON serialisable object
        The annotation value to write.
    annotations_sub_group : str
        The group node under "/annotations" to which the custom annotation is
        written (the default is None which implies that the custom annotation
        is written directly under "/annotations").

    Raises
    ------
    TypeError
        If `json.dumps` rejects `annotation_value` with a TypeError.
    tables.NodeError
        If creating the node fails with a `tables.NodeError`; this is reported
        as the annotation already existing.

    Notes
    -----
    The `annotation_value` must be a JSON serialisable object.

    Examples
    --------
    Write a custom annotation to an HDF5 database.

    >>> import magni
    >>> from magni.reproducibility.io import write_custom_annotation
    >>> annotation_name = 'custom_annotation'
    >>> annotation_value = 'the value'
    >>> with magni.utils.multiprocessing.File('db.hdf5', mode='a') as h5file:
    ...    write_custom_annotation(h5file, annotation_name, annotation_value)
    ...    annotations = magni.reproducibility.io.read_annotations(h5file)
    >>> str(annotations['custom_annotation'])
    'the value'

    """

    @_decorate_validation
    def validate_input():
        _generic('h5file', tables.file.File)
        _generic('annotation_name', 'string')
        _generic('annotations_sub_group', 'string', ignore_none=True)

    validate_input()

    # Resolve the group path that receives the new node.
    if annotations_sub_group is None:
        target_group = '/annotations'
    else:
        target_group = '/annotations' + '/' + annotations_sub_group

    try:
        serialised = json.dumps(annotation_value)
    except TypeError:
        raise TypeError('The annotation value does not have a valid JSON ' +
                        'representation. It may not be used as an annotation.')

    try:
        h5file.create_array(target_group, annotation_name,
                            obj=serialised.encode(), createparents=True)
        h5file.flush()
    except tables.NodeError:
        message = (
            'The annotation "{!r}" already exists '.format(annotation_name) +
            'in the database. Remove the old annotation before placing a ' +
            'new one.')
        raise tables.NodeError(message)


def _recursive_annotation_read(h5_annotations, out_annotations_dict):
    """
    Recursively read annotations from an annotation group.

    Every leaf of `h5_annotations` is read, decoded and parsed as JSON, and
    stored in `out_annotations_dict` under the leaf's name. Every subgroup is
    stored as a nested dict which is filled by a recursive call.

    Parameters
    ----------
    h5_annotations : tables.group.Group
        The group to read annotations from.
    out_annotations_dict : dict
        The dictionary to store the read annotations in. It is modified in
        place.

    Raises
    ------
    ValueError
        If reading, decoding or JSON parsing of a leaf raises a ValueError,
        i.e. the annotation does not conform to the Magni annotation standard.

    """

    leaves = h5_annotations._v_leaves
    subgroups = h5_annotations._v_groups

    # Read leaves
    try:
        for annotation_name, annotation_value in leaves.items():
            out_annotations_dict[annotation_name] = json.loads(
                annotation_value.read().decode())
    except ValueError as e:
        # A ValueError may carry no arguments; indexing `e.args` blindly would
        # replace the real problem with an IndexError.
        reason = e.args[0] if e.args else str(e)
        raise ValueError('Unable to read the {!r} '.format(annotation_name) +
                         'annotation. It seems that the annotation ' +
                         'does not conform to the Magni annotation ' +
                         'standard ({!r}).'.format(reason))

    # Recursively handle subgroups
    for subgroup_name, subgroup in subgroups.items():
        nested_annotations = dict()
        out_annotations_dict[subgroup_name] = nested_annotations
        _recursive_annotation_read(subgroup, nested_annotations)