Commit d4a86a9b authored by brivas's avatar brivas
Browse files

Merge branch 'docs' into 'master'

merging Docs to master

See merge request !2
parents 8a70a6ca 5f8f12b2
notebooks/.ipynb_checkpoints/
!.gitignore
!User_manual.docx
*.swp
/_build
/doctrees
# Minimal makefile for Sphinx documentation
#
# Every target is forwarded to sphinx-build's "make mode" (-M), so
# "make html", "make clean", etc. are all handled by the catch-all rule at
# the bottom of this file.

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
# NOTE: recipe lines must start with a hard TAB, otherwise make aborts with
# "missing separator".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# "Makefile" is declared phony so that the catch-all rule below is not applied
# to the makefile itself, and so that, as a prerequisite, it makes every
# catch-all target count as out of date (the recipe always runs).
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
docs/_static/images/elements.png

74.6 KB

docs/_static/images/fig1.png

159 KB

docs/_static/images/logos_c3s/LOGO_2020_-_NOC_1_COLOUR.png

122 KB

docs/_static/images/logos_c3s/copernicus.png

141 KB

docs/_static/images/logos_c3s/icoadsLogo.png

24.4 KB

docs/_static/images/logos_c3s/logo_c3s-392x154.png

30.3 KB

<svg id="mermaid-1620374813353" width="100%" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" height="524.193359375" style="max-width: 852.703125px;" viewBox="0 0 852.703125 524.193359375"><style>#mermaid-1620374813353{font-family:"trebuchet ms",verdana,arial,sans-serif;font-size:16px;fill:#333;}#mermaid-1620374813353 .error-icon{fill:#552222;}#mermaid-1620374813353 .error-text{fill:#552222;stroke:#552222;}#mermaid-1620374813353 .edge-thickness-normal{stroke-width:2px;}#mermaid-1620374813353 .edge-thickness-thick{stroke-width:3.5px;}#mermaid-1620374813353 .edge-pattern-solid{stroke-dasharray:0;}#mermaid-1620374813353 .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-1620374813353 .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-1620374813353 .marker{fill:#333333;stroke:#333333;}#mermaid-1620374813353 .marker.cross{stroke:#333333;}#mermaid-1620374813353 svg{font-family:"trebuchet ms",verdana,arial,sans-serif;font-size:16px;}#mermaid-1620374813353 .label{font-family:"trebuchet ms",verdana,arial,sans-serif;color:#333;}#mermaid-1620374813353 .cluster-label text{fill:#333;}#mermaid-1620374813353 .cluster-label span{color:#333;}#mermaid-1620374813353 .label text,#mermaid-1620374813353 span{fill:#333;color:#333;}#mermaid-1620374813353 .node rect,#mermaid-1620374813353 .node circle,#mermaid-1620374813353 .node ellipse,#mermaid-1620374813353 .node polygon,#mermaid-1620374813353 .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-1620374813353 .node .label{text-align:center;}#mermaid-1620374813353 .node.clickable{cursor:pointer;}#mermaid-1620374813353 .arrowheadPath{fill:#333333;}#mermaid-1620374813353 .edgePath .path{stroke:#333333;stroke-width:1.5px;}#mermaid-1620374813353 .flowchart-link{stroke:#333333;fill:none;}#mermaid-1620374813353 .edgeLabel{background-color:#e8e8e8;text-align:center;}#mermaid-1620374813353 .edgeLabel rect{opacity:0.5;background-color:#e8e8e8;fill:#e8e8e8;}#mermaid-1620374813353 .cluster 
rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-1620374813353 .cluster text{fill:#333;}#mermaid-1620374813353 .cluster span{color:#333;}#mermaid-1620374813353 div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:"trebuchet ms",verdana,arial,sans-serif;font-size:12px;background:hsl(80,100%,96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-1620374813353:root{--mermaid-font-family:"trebuchet ms",verdana,arial,sans-serif;}#mermaid-1620374813353 flowchart{fill:apa;}</style><g><g class="output"><g class="clusters"><g class="cluster" id="flowchart-data_models-9898" transform="translate(304.6796875,126)" style="opacity: 1;"><rect style="fill:#ffffff;stroke:#333;stroke-width:1px;font-size:20px;font-weight:500;" width="593.359375" height="236" x="-296.6796875" y="-118"></rect><g class="label" transform="translate(0, -104)" id="mermaid-1620374813353Text"><g style="text-align: center;" transform="translate(-45.5859375,-9.5)"><foreignObject width="91.171875" height="19"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">data_models</div></foreignObject></g></g></g><g class="cluster" id="flowchart-Data-9899" transform="translate(164.4765625,398.193359375)" style="opacity: 1;"><rect style="fill:#ffffff;stroke:#333;stroke-width:1px;font-size:20px;font-weight:500;" width="312.953125" height="236" x="-156.4765625" y="-118"></rect><g class="label" transform="translate(0, -104)" id="mermaid-1620374813353Text"><g style="text-align: center;" transform="translate(-16.484375,-9.5)"><foreignObject width="32.96875" height="19"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">Data</div></foreignObject></g></g></g></g><g class="edgePaths"><g class="edgePath LS-B LE-H" id="L-B-H" style="opacity: 1;"><path class="path" 
d="M213.90458079592597,134.806640625L231.7460048299383,124.3388671875C249.58742886395066,113.87109375,285.27027693197533,92.935546875,307.27836763265435,82.4677734375C329.2864583333333,72,337.6197916666667,72,345.953125,72C354.2864583333333,72,362.6197916666667,72,366.7864583333333,72L370.953125,72" marker-end="url(#arrowhead7605)" style="fill:none"></path><defs><marker id="arrowhead7605" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" markerWidth="8" markerHeight="6" orient="auto"><path d="M 0 0 L 10 5 L 0 10 z" class="arrowheadPath" style="stroke-width: 1; stroke-dasharray: 1, 0;"></path></marker></defs></g><g class="edgePath LS-B LE-E" id="L-B-E" style="opacity: 1;"><path class="path" d="M229.671875,170.55353682472915L244.88541666666666,172.12794735394095C260.0989583333333,173.70235788315276,290.5260416666667,176.85117894157636,309.90625,178.4255894707882C329.2864583333333,180,337.6197916666667,180,349.2421875,180C360.8645833333333,180,375.7760416666667,180,383.2317708333333,180L390.6875,180" marker-end="url(#arrowhead7606)" style="fill:none"></path><defs><marker id="arrowhead7606" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" markerWidth="8" markerHeight="6" orient="auto"><path d="M 0 0 L 10 5 L 0 10 z" class="arrowheadPath" style="stroke-width: 1; stroke-dasharray: 1, 0;"></path></marker></defs></g><g class="edgePath LS-one LE-mdf_reader" id="L-one-mdf_reader" style="opacity: 1;"><path class="path" d="M223.96875,344.193359375L240.1328125,344.193359375C256.296875,344.193359375,288.625,344.193359375,308.9557291666667,344.193359375C329.2864583333333,344.193359375,337.6197916666667,344.193359375,351.1156654719489,345.4491131738844C364.6115392772311,346.7048669727689,383.26995355446223,349.21637457053777,392.5991606930777,350.4721283694223L401.92836783169327,351.72788216830673" marker-end="url(#arrowhead7607)" style="fill:none"></path><defs><marker id="arrowhead7607" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" 
markerWidth="8" markerHeight="6" orient="auto"><path d="M 0 0 L 10 5 L 0 10 z" class="arrowheadPath" style="stroke-width: 1; stroke-dasharray: 1, 0;"></path></marker></defs></g><g class="edgePath LS-two LE-mdf_reader" id="L-two-mdf_reader" style="opacity: 1;"><path class="path" d="M295.953125,452.193359375L300.1197916666667,452.193359375C304.2864583333333,452.193359375,312.6197916666667,452.193359375,320.953125,452.193359375C329.2864583333333,452.193359375,337.6197916666667,452.193359375,355.2623201419845,442.6487133711512C372.9048486173024,433.1040673673024,399.85657223460476,414.01477535960476,413.33243404325594,404.470129355756L426.8082958519072,394.9254833519072" marker-end="url(#arrowhead7608)" style="fill:none"></path><defs><marker id="arrowhead7608" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" markerWidth="8" markerHeight="6" orient="auto"><path d="M 0 0 L 10 5 L 0 10 z" class="arrowheadPath" style="stroke-width: 1; stroke-dasharray: 1, 0;"></path></marker></defs></g><g class="edgePath LS-B LE-mdf_reader" id="L-B-mdf_reader" style="opacity: 1;"><path class="path" d="M229.671875,183.88620445666282L244.88541666666666,188.57183704721902C260.0989583333333,193.25746963777522,290.5260416666667,202.62873481888764,309.90625,207.3143674094438C329.2864583333333,212,337.6197916666667,212,356.8795177109419,229.52412812239137C376.13924375521725,247.04825624478272,406.32536251043456,282.0965124895655,421.41842188804316,299.62064061195684L436.51148126565175,317.14476873434825" marker-end="url(#arrowhead7609)" style="fill:none"></path><defs><marker id="arrowhead7609" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" markerWidth="8" markerHeight="6" orient="auto"><path d="M 0 0 L 10 5 L 0 10 z" class="arrowheadPath" style="stroke-width: 1; stroke-dasharray: 1, 0;"></path></marker></defs></g><g class="edgePath LS-mdf_reader LE-D" id="L-mdf_reader-D" style="opacity: 1;"><path class="path" 
d="M555.54296875,360.88671875L563.1790364583334,360.8033854166667C570.8151041666666,360.7200520833333,586.0872395833334,360.5533854166667,597.8899739583334,360.4700520833333C609.6927083333334,360.38671875,618.0260416666666,360.38671875,626.359375,360.38671875C634.6927083333334,360.38671875,643.0260416666666,360.38671875,647.1927083333334,360.38671875L651.359375,360.38671875" marker-end="url(#arrowhead7610)" style="fill:none"></path><defs><marker id="arrowhead7610" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" markerWidth="8" markerHeight="6" orient="auto"><path d="M 0 0 L 10 5 L 0 10 z" class="arrowheadPath" style="stroke-width: 1; stroke-dasharray: 1, 0;"></path></marker></defs></g></g><g class="edgeLabels"><g class="edgeLabel" transform="" style="opacity: 1;"><g transform="translate(0,0)" class="label"><rect rx="0" ry="0" width="0" height="0"></rect><foreignObject width="0" height="0"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;"><span id="L-L-B-H" class="edgeLabel L-LS-B' L-LE-H"></span></div></foreignObject></g></g><g class="edgeLabel" transform="" style="opacity: 1;"><g transform="translate(0,0)" class="label"><rect rx="0" ry="0" width="0" height="0"></rect><foreignObject width="0" height="0"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;"><span id="L-L-B-E" class="edgeLabel L-LS-B' L-LE-E"></span></div></foreignObject></g></g><g class="edgeLabel" transform="" style="opacity: 1;"><g transform="translate(0,0)" class="label"><rect rx="0" ry="0" width="0" height="0"></rect><foreignObject width="0" height="0"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;"><span id="L-L-one-mdf_reader" class="edgeLabel L-LS-one' L-LE-mdf_reader"></span></div></foreignObject></g></g><g class="edgeLabel" transform="" style="opacity: 1;"><g transform="translate(0,0)" class="label"><rect rx="0" ry="0" width="0" 
height="0"></rect><foreignObject width="0" height="0"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;"><span id="L-L-two-mdf_reader" class="edgeLabel L-LS-two' L-LE-mdf_reader"></span></div></foreignObject></g></g><g class="edgeLabel" transform="" style="opacity: 1;"><g transform="translate(0,0)" class="label"><rect rx="0" ry="0" width="0" height="0"></rect><foreignObject width="0" height="0"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;"><span id="L-L-B-mdf_reader" class="edgeLabel L-LS-B' L-LE-mdf_reader"></span></div></foreignObject></g></g><g class="edgeLabel" transform="" style="opacity: 1;"><g transform="translate(0,0)" class="label"><rect rx="0" ry="0" width="0" height="0"></rect><foreignObject width="0" height="0"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;"><span id="L-L-mdf_reader-D" class="edgeLabel L-LS-mdf_reader' L-LE-D"></span></div></foreignObject></g></g></g><g class="nodes"><g class="node default" id="flowchart-H-9887" transform="translate(473.65625,72)" style="opacity: 1;"><rect rx="0" ry="0" x="-102.703125" y="-29" width="205.40625" height="58" class="label-container" style="fill:#e8eaf6;stroke:#333;stroke-width:1px;font-size:20px;font-weight:500;"></rect><g class="label" transform="translate(0,0)"><g style="text-align: center;" transform="translate(-92.703125,-19)"><foreignObject width="185.40625" height="38"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">code_tables <br/> default: ICOADS.keycodes</div></foreignObject></g></g></g><g class="node default" id="flowchart-B-9886" transform="translate(164.4765625,163.806640625)" style="opacity: 1;"><rect rx="0" ry="0" x="-65.1953125" y="-29" width="130.390625" height="58" class="label-container" style="fill:#e8eaf6;stroke:#333;stroke-width:1px;font-size:20px;font-weight:500;"></rect><g class="label" 
transform="translate(0,0)"><g style="text-align: center;" transform="translate(-55.1953125,-19)"><foreignObject width="110.390625" height="38"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">Schema <br/> default: imma1</div></foreignObject></g></g></g><g class="node default" id="flowchart-E-9889" transform="translate(473.65625,180)" style="opacity: 1;"><rect rx="0" ry="0" x="-82.96875" y="-29" width="165.9375" height="58" class="label-container" style="fill:#e8eaf6;stroke:#333;stroke-width:1px;font-size:20px;font-weight:500;"></rect><g class="label" transform="translate(0,0)"><g style="text-align: center;" transform="translate(-72.96875,-19)"><foreignObject width="145.9375" height="38"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">schema.json <br/> default: imma1.json</div></foreignObject></g></g></g><g class="node default" id="flowchart-one-9884" transform="translate(164.4765625,344.193359375)" style="opacity: 1;"><rect rx="0" ry="0" x="-59.4921875" y="-29" width="118.984375" height="58" class="label-container" style="fill:#e8eaf6;stroke:#333;stroke-width:1px;font-size:20px;font-weight:500;"></rect><g class="label" transform="translate(0,0)"><g style="text-align: center;" transform="translate(-49.4921875,-19)"><foreignObject width="98.984375" height="38"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">ICOADS files <br/> .imma format</div></foreignObject></g></g></g><g class="node default" id="flowchart-two-9885" transform="translate(164.4765625,452.193359375)" style="opacity: 1;"><rect rx="0" ry="0" x="-131.4765625" y="-29" width="262.953125" height="58" class="label-container" style="fill:#e8eaf6;stroke:#333;stroke-width:1px;font-size:20px;font-weight:500;"></rect><g class="label" transform="translate(0,0)"><g style="text-align: center;" transform="translate(-121.4765625,-19)"><foreignObject width="242.953125" 
height="38"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">any other data stored <br/> in a fix-width or delimited format</div></foreignObject></g></g></g><g class="node default" id="flowchart-mdf_reader-9891" transform="translate(473.65625,360.38671875)" style="opacity: 1;"><polygon points="81.38671875,0 162.7734375,-81.38671875 81.38671875,-162.7734375 0,-81.38671875" transform="translate(-81.38671875,81.38671875)" class="label-container" style="fill:#fcc679;stroke:#333;stroke-width:1px;font-size:20px;font-weight:100;"></polygon><g class="label" transform="translate(0,0)"><g style="text-align: center;" transform="translate(-60.9296875,-9.5)"><foreignObject width="121.859375" height="19"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">mdf_reader.read</div></foreignObject></g></g></g><g class="node default" id="flowchart-D-9897" transform="translate(748.03125,360.38671875)" style="opacity: 1;"><rect rx="0" ry="0" x="-96.671875" y="-57.5" width="193.34375" height="115" class="label-container" style="fill:#e8eaf6;stroke:#333;stroke-width:1px;font-size:20px;font-weight:500;"></rect><g class="label" transform="translate(0,0)"><g style="text-align: center;" transform="translate(-86.671875,-47.5)"><foreignObject width="173.34375" height="95"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">Output: <br/> <br/> pandas.Dataframe <br/> structured according to <br/> the schema</div></foreignObject></g></g></g></g></g></g></svg>
\ No newline at end of file
docs/_static/images/new_schema.png

34.1 KB

docs/_static/images/schema.png

12.8 KB

#!/bin/bash
# Stop at the first failing command: without this, a failed Sphinx build or a
# failed rsync/pushd would still be followed by the force-push below, replacing
# the published site with an (almost) empty branch.
set -e
# Trace every command in the job log.
set -x
################################################################################
# File: buildDocs.sh
# Purpose: Script that builds our documentation using sphinx and updates GitHub
# Pages. This script is executed by:
# .github/workflows/docs_pages_workflow.yml
#
# Expects the following environment variables to be set by the caller:
# GITHUB_ACTOR, GITHUB_TOKEN, GITHUB_REPOSITORY, GITHUB_SHA, GITHUB_REF
#
# Authors: Beatriz Recinos <beatriz.recinos.rivas@noc.ac.uk>
# Created: 2021-06-24
# Updated: 2021-06-24
# Version: 0.1
################################################################################

###################
# INSTALL DEPENDS #
###################
apt-get update
apt-get -y install git rsync python3-sphinx python3-sphinx-rtd-theme

#####################
# DECLARE VARIABLES #
#####################
pwd
ls -lah
# timestamp of the last commit; also used in the commit message further down
export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)

##############
# BUILD DOCS #
##############
# build our documentation with sphinx (see docs/conf.py)
# * https://www.sphinx-doc.org/en/master/usage/quickstart.html#running-the-build
make -C docs clean
make -C docs html

#######################
# Update GitHub Pages #
#######################
git config --global user.name "${GITHUB_ACTOR}"
git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com"

# stage the generated html in a scratch directory outside the working tree
docroot=$(mktemp -d)
rsync -av "docs/_build/html/" "${docroot}/"

pushd "${docroot}"

# don't bother maintaining history; just generate fresh
git init
# pause command tracing so the access token is not echoed into the job log
set +x
git remote add deploy "https://token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git"
set -x
git checkout -b gh-pages

# add .nojekyll to the root so that github won't 404 on content added to dirs
# that start with an underscore (_), such as our "_content" dir..
touch .nojekyll

# Add README
cat > README.md <<EOF
# GitHub Pages Cache
Nothing to see here. The contents of this branch are essentially a cache that's not intended to be viewed on github.com.
If you're looking to update our documentation, check the relevant development branch's 'docs/' dir.
EOF

# copy the resulting html pages built from sphinx above to our new git repo
git add .

# commit all the new files
msg="Updating Docs for commit ${GITHUB_SHA} made on $(date -d "@${SOURCE_DATE_EPOCH}" --iso-8601=seconds) from ${GITHUB_REF} by ${GITHUB_ACTOR}"
git commit -am "${msg}"

# overwrite the contents of the gh-pages branch on our github.com repo
git push deploy gh-pages --force

popd # return to main repo sandbox root

# exit cleanly
exit 0
# Sphinx build configuration for the mdf_reader documentation.
#
# Only the commonly used options are set here; the complete list is at
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------
# Nothing is added to sys.path in this file. If modules documented with
# autodoc live in another directory, insert their absolute path, e.g.:
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

# -- Project information -----------------------------------------------------
project = "mdf_reader"
author = "David Berry, Irene Perez Gonzalez and Beatriz Recinos"
copyright = "2021, " + author

# Full version string, including alpha/beta/rc tags.
release = "v1.3"

# -- General configuration ---------------------------------------------------
# Sphinx extensions to load: the built-in 'sphinx.ext.*' ones plus the
# third-party autoapi and typehints extensions.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.viewcode",
    "autoapi.extension",
    "sphinx.ext.napoleon",
    "sphinx_autodoc_typehints",
]

# Settings for the autoapi extension and for autodoc.
autoapi_type = "python"
autoapi_dirs = ["../"]
autoapi_keep_files = False
autoapi_options = [
    "members",
    "undoc-members",
    "private-members",
    "show-inheritance",
    "show-module-summary",
    "special-members",
    "imported-members",
]
autoapi_ignore = ["*mymodel*", "*conf*", "*gather_stats_c99.py*"]
add_module_names = False
autodoc_typehints = "description"

# Template directories, relative to this file.
templates_path = ["_templates"]

# Patterns (relative to the source directory) for files and directories that
# are skipped when looking for sources. The same patterns also apply to
# html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

pygments_style = "sphinx"

# -- Options for HTML output -------------------------------------------------
# Theme used for HTML and HTML Help pages.
html_theme = "sphinx_rtd_theme"
html_theme_options = dict(
    logo_only=True,
    display_version=False,
    collapse_navigation=False,
)

# Directories with custom static files (style sheets etc.), relative to this
# file. They are copied after the built-in static files, so a file named
# "default.css" here overrides the built-in "default.css".
html_static_path = ["_static"]
\ No newline at end of file
.. mdf_reader documentation master file, created by
sphinx-quickstart on Fri Apr 16 14:18:24 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
.. _data-models:
===========
Data Models
===========
Schema
======
The schema file gathers a collection of descriptors that enables the mdf_reader to access and extract meaningful units of information for each element.
Valid schema files are JSON files that the tool accesses and stores internally as dictionaries. The basename of the schema file must be the same as the name of the data model directory, and its extension must be ``.json``.
.. figure:: _static/images/schema.png
:width: 45%
Data model directory
There are two levels of information in the schema:
1. **General** information on the data format layout, that helps the tool decide which approach to follow in order to access the data content. This information is included in the **header block** at the top of the schema (see figure below).
2. **Specific** information on the data elements and, optionally, on the sections, in the case that the data model has its report elements organised in one or multiple sections (as shown in the figure below). This information is included in the **elements block** of the schema.
.. figure:: _static/images/new_schema.png
:width: 80%
Content inside a ``schema.json`` file.
The mdf_reader supports reading and validation of both internal and external schemas:
- An **internal data model** has its schema registered within the tool. To read and validate data from these models, we only need to pass its reference name to the reader and validation modules, using the argument ``data_model``. A list of the reference names for internally supported data models can be accessed via the tool's function::
import mdf_reader
mdf_reader.properties.supported_data_models()
- An **external data model** is a data format that is unknown to the tool. If the data model meets the specifications for which the tool was built, then a model can be built externally and fed into the tool for both functions, data reading and model validation, using the argument ``data_model_path``::
model_path = '~/mdf_reader/data_models/lib/imma1_d701'
data_file_path = '~/mdf_reader/tests/data/069-701_1845-04_subset.imma'
data = mdf_reader.read(data_file_path, data_model_path= model_path)
.. _code-tables:
Code tables
===========
.. figure:: _static/images/elements.png
:width: 80%
Element content inside a ``schema.json`` file.
Elements defined in the data model ``schema.json`` with an element attribute ``"column_type": "key"`` are linked to a code table in the data model through a codetable descriptor in the schema (e.g. ``"codetable": "ICOADS.C99.FORM"``). Code tables contain the ``key:value`` pairs and are stored as individual ``.json`` files in the ``data_models/schema/code_tables`` subdirectory.
The content of a code table translating a ship-log report type into its real meaning (``ICOADS.C99.FORM.json``) can be seen in the text below::
{
" 1": "daily",
" 2": "reports more than once a day"
}
This code table is part of the ``imma1_d701`` data model included in this tool.
The following range of code table structures are currently supported:
- Simple code tables: code tables with a list of ``key:value`` pairs.
- Nested code tables: code tables with multiple (2 or more) keys mapping to a value ``-> key(1):…:key(n):value.``
- Range-keyed code tables: code tables (simple or multi-keyed) where one or more keys is a (integer) range of values.
Code tables can be imported as python dictionaries directly using the json package. To be fully read by the tool, however, keys in **range-keyed code tables** need to be expanded and access to all code tables is managed in the application through a **code table manager module**.
The following commands typed in a python console, show how to access code table templates to create new code tables::
template_names = mdf_reader.code_tables.templates()
To copy a template to edit::
mdf_reader.code_tables.copy_template(template_name,out_path=file_path)
or::
mdf_reader.code_tables.copy_template(template_name,out_dir=dir_path)
Common features
---------------
As code tables are stored as ``.json`` files, the json syntax rules must be met when they are generated. See the following `link <https://www.w3schools.com/js/js_json_syntax.asp>`_ to a basic introduction to json syntax.
To create code tables it is important to highlight that:
- String values must be written with double quotes
- Keys must be strings
- Values can be strings, numbers, objects (JSON objects), arrays, booleans (``true|false``) or ``null``.
- Due to the way range keyed tables are parsed, keys cannot have the string ``range_key`` as initial substring (unless they are range keys).
Simple code tables
------------------
Simple code tables are built using a single json object (enclosed in curly braces) with the ``key:value`` pairs separated by commas like the following example for a weather visibility indicator, the file name is ``visibility_ind.json``::
{
" ": "Not measured",
"0": "Measured",
"1": "Fog present"
}
Nested code tables
------------------
Nested code tables are included to deal with situations in which a coded element's encoding varies according to an indicator (contained in a different element in the data) and/or changes over time (different code table versions). Instead of storing these tables in separate files, the tool allows nested code tables to be created.
The following ``.json`` file example shows a code table with 2 levels of indexing. It is built as a single **json object** in which the values of the ``key:value`` pairs of the outer indexing level are simple code tables, instead of individual values.
Nested table (named: ``visibility.json``) example::
{
"0":
{"90":"<0.05 km",
"91":"0.05 km",
"92":"0.2 km",
"93":"0.5 km",
"94":"1 km",
"95":"2 km",
"96":"4 km",
"97":"10 km",
"98":"20 km",
"99":"50 km or more"},
"1":
{"90":"<0.05 km",
"91":"0.05 km",
"92":"0.2 km",
"93":"Fog present, no visibility reported",
"94":"1 km",
"95":"2 km",
"96":"4 km",
"97":"10 km",
"98":"20 km",
"99":"50 km or more"}
}
This type of nested code table requires an additional ``.keys`` (named: ``visibility.keys``) file with the following format::
{
"('core1','VIS')" : ["('core1','VIS I')","('core1','VIS')"]
}
This **code_table** can be called from the ``schema.json`` by setting the element descriptor ``column_type`` to ``key`` in the following way::
"VIS": {
"description": "Visibility",
"field_length": 2,
"column_type": "key",
"codetable": "visibility"
}
Note that only the **nested code table** ``visibility`` is called, not the ``.keys`` file, and that the ``.json`` extension is not required.
The data file schema provides the ``element:codetable`` correspondence. However, to map the element to its value in the code table, it is necessary to know the elements in the data file from which the outer keys are derived. Each nested table ``table_name.json`` has a companion ``.json`` file ``table_name.keys`` with a set of ``key:value`` pairs. The key is the actual element the table decodes and the value is a list with the complete set of key elements, from outer to inner.
As a single table can potentially be used to code different data file elements, a key must be provided for every element that is to be decoded with a nested table (even if it is unique).
Range-keyed code tables
-----------------------
Range-keyed code tables can be either a simple or a nested type of code table. This term applies if any of the keys in its ``key:value`` pairs is a range, like a period of years (1910-1945) or simply an integer interval (1-10).
Instead of building the table by repeating each of the ``key:value`` pairs for every value in the range, the corresponding range key pairs are defined as ``range_key(init, end [, step]):value`` in the json file. The code table manager will identify this special type of key and will expand the keys in the dictionary as it is read internally.
Range keys rules and use:
- Only integer ranges are currently supported
- Parameter step is optional. Defaults to 1.
- In ranges that apply to a range of years, the keyword yyyy can be used in the place of the end parameter. It will expand the period to the current year.
Example of a Range-key nested table named: ``ICOADS.CO.VS.json`` is shown below::
{
"range_key(1750,1967)":
{
"0":"0 knots;[0.0,0.0,0.0] ms-1",
"1":"1-3 knots;[0.51444,1.02888,1.54332] ms-1",
"2":"4-6 knots;[2.05776,2.5722,3.08664] ms-1",
"3":"7-9 knots;[3.60108,4.11552,4.62996] ms-1",
"4":"10-12 knots;[5.1444,5.65884,6.17328] ms-1",
"5":"13-15 knots;[6.68772,7.20216,7.7166] ms-1",
"6":"16-18 knots;[8.23104,8.74548,9.25992] ms-1",
"7":"19-21 knots;[9.77436,10.2888,10.8032] ms-1",
"8":"22-24 knots;[11.3177,11.8321,12.3466] ms-1",
"9":"over 24 knots;[12.3466,12.861,null] ms-1"
},
"range_key(1968,yyyy)":
{
"0":"0 knots;[0.0,0.0,0.0] ms-1",
"1":"1-5 knots;[0.51444,1.54332,2.5722] ms-1",
"2":"6-10 knots;[3.08664,4.11552,5.1444] ms-1",
"3":"11-15 knots;[5.65884,6.68772,7.7166] ms-1",
"4":"16-20 knots;[8.23104,9.25992,10.2888] ms-1",
"5":"21-25 knots;[10.8032,11.8321,12.861] ms-1",
"6":"26-30 knots;[13.3754,14.4043,15.4332] ms-1",
"7":"31-35 knots;[15.9476,16.9765,18.0054] ms-1",
"8":"36-40 knots;[18.5198,19.5487,20.5776] ms-1",
"9":"over 40 knots;[21.092,22.1209,null] ms-1"
}
}
As the table is nested, the corresponding ``ICOADS.CO.VS.keys`` file looks as follows::
{
"('core','VS')" : ["('core','YR')","('core','VS')"]
}
.. mdf_reader documentation master file, created by
sphinx-quickstart on Fri Apr 16 14:18:24 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
.. _getting-started:
Getting started
===============
1. Test the tool
You can test the tool very easily by using a sample data set that comes with the repository. For this you need to run the following code::
import sys
sys.path.append('/path_to_folder_directory_containing_the_mdf_reader_folder/')
import mdf_reader
import matplotlib.pyplot as plt
data = mdf_reader.tests.read_imma1_buoys_nosupp()
2. Read an IMMA file
Read a sample ``.imma`` file from the folder ``~/mdf_reader/test/data/`` via the following code::
filepath = '~/mdf_reader/test/data/069-701_1845-04_subset.imma'
imma_data = mdf_reader.read(filepath, data_model = 'imma1',sections = ['core','c1','c98'])
For more details on how to run this in your python session see :py:func:`mdf_reader.read.main()`
3. To call the function from a terminal type::
$ python mdf_reader_dir/read.py source data_model data_model_path sections chunksize skiprows out_path
For more details and an overview of the tool check out the following python notebook:
- `Test and overview of the mdf_reader tool <https://git.noc.ac.uk/brecinosrivas/mdf_reader/-/blob/master/docs/notebooks/mdf_reader_test_overview.ipynb>`_
This diff is collapsed.
.. mdf_reader documentation master file, created by
sphinx-quickstart on Fri Apr 16 14:18:24 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Data reader toolbox documentation
---------------------------------
The **mdf_reader** is a `python3 <https://www.python.org/download/releases/3.0/>`_ tool designed to read data files compliant with a user specified data model.
It was developed with the initial idea of reading data from the `International Comprehensive Ocean-Atmosphere Data Set (ICOADS) <https://icoads.noaa.gov/>`_ stored in the `International Maritime Meteorological Archive (IMMA) data format <https://icoads.noaa.gov/e-doc/imma/R3.0-imma1.pdf>`_.
The tool has been further enhanced to account for any marine meteorological data format, provided that this data meets the following specifications:
- Data is stored in a human-readable manner: `ASCII <https://en.wikipedia.org/wiki/ASCII>`_.
- Data is organized in single line reports (e.g. rows of observations separated by a delimiter like .csv).
- Reports have a coherent internal structure that can be modelized.
- Reports are fixed width or field delimited types.
- Reports can be organized in sections, in which case each section can be of different types (fixed width or delimited).
The mdf_reader uses the information provided in a `data model <https://en.wikipedia.org/wiki/Data>`_ to read meteorological data into a python `pandas.DataFrame <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html>`_, with the column names and data types set according to each data element’s description specified in the data model or **schema**. In addition to reading, the mdf_reader validates data elements against the **schema** provided.
This tool outputs a python object with the following attributes:
1. A `pandas.DataFrame <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html>`_ (DF) with the data values.
2. A `boolean pandas <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.bool.html>`_ DF with the data validation mask.
3. A `dictionary <https://realpython.com/python-dicts/>`_ with a simplified version of the input data model.
The reader allows for basic transformations of the data. This feature includes `basic numeric data decoding <https://realpython.com/python-encodings-guide/#enter-unicode>`_ (base36, signed_overpunch) and numeric data conversion (scale and offset).
Several data models have been added to the tool including the IMMA schema: ``~/mdf_reader/data_models/lib/imma1``.
.. note:: **Data from data models other than those already available can be read, provided that this data meets the basic specifications listed above. A data model can be built externally and fed into the tool.**
.. toctree::
:maxdepth: 2
:glob:
:hidden:
:caption: Guide
tool-set-up.rst
tool-overview.rst
getting-started.rst
data-models.rst
how-to-build-a-data-model.rst
About
-----
:Version:
:Citation:
:License:
:Authors:
David Berry, Irene Perez Gonzalez and Beatriz Recinos
.. image:: _static/images/logos_c3s/logo_c3s-392x154.png
:width: 25%
:target: https://climate.copernicus.eu/
.. image:: _static/images/logos_c3s/LOGO_2020_-_NOC_1_COLOUR.png
:width: 25%
:target: https://noc.ac.uk/
.. image:: _static/images/logos_c3s/icoadsLogo.png
:width: 20%
:target: https://icoads.noaa.gov/
\ No newline at end of file
@ECHO OFF

REM Work from the directory that contains this script; undone by popd on every exit path.
pushd %~dp0

REM Command file for Sphinx documentation
REM Usage: <this script> [target]
REM   The target is passed to "sphinx-build -M"; with no target the Sphinx
REM   help listing is shown instead.

REM Use the caller's SPHINXBUILD if set, otherwise default to "sphinx-build".
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

REM Probe for sphinx-build first, so that the no-argument (help) path also
REM gets the explanatory message below when the command cannot be run.
%SPHINXBUILD% >NUL 2>NUL
REM Errorlevel 9009 or higher is treated as "command not found".
REM popd runs before exiting so the caller is not left in this directory.
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	popd
	exit /b 1
)

if "%1" == "" goto help

REM Route the requested target to Sphinx; SPHINXOPTS and O carry extra options.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
.. mdf_reader documentation master file, created by
sphinx-quickstart on Fri Apr 16 14:18:24 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Tool overview
=============
In the tool's context, a data model is the combination of a **schema file** with information on the file format and its contents and, optionally, the data model contains a set of code tables with ``key:value`` pairs, to translate encoded information in some data elements:
e.g. Temperature units might be stored as numeric values 1 or 2 and this translates to ``1:Celsius`` and ``2:Fahrenheit``.
Workflow
--------
.. figure:: _static/images/mdf_reader_diagram.svg
:width: 100%
Simplified workflow of the main function in the tool
Input data: ``.imma`` files and schemas
---------------------------------------
The tool has been created to read meteorological data from `ICOADS <https://icoads.noaa.gov/r3.html>`_ stored in the ``.imma`` format, please read the `following guide <https://icoads.noaa.gov/e-doc/imma/R3.0-imma1.pdf>`_ to know more details regarding the database and the data format.
Each meteorological report in ICOADS can come from multiple countries, sources and platforms and each report has a source ID (SID) and a deck (DCK) number assigned. “Deck” originally referred to a punched card deck, but is now used as the primary field to track ICOADS data **collections**. Each deck may contain a single Source ID (SID) or a mixture of SIDs.
The data stored in the ``.imma`` format is stored as a fixed width and/or a field delimited file. The mdf_reader reads the data, organises it into sections and validates them against a declared data model (also referred to here as **schema**) which can be source ID and deck dependent.
The **core** meteorological variables stored in the ``.imma`` format can be read by using the general ``imma1`` schema included in this tool.
**Supplemental metadata attachments** require a specific **schema** customized to read supplemental metadata from a specific source and deck ("collection"). Several **schemas** are already included in this tool in order to read 18th century ship meteorological metadata.
All schemas are located under the following directory: ``~/mdf_reader/data_models/lib/``
.. note:: For each SID-DCK number the data model or schema used to read supplemental metadata will be different, e.g. to read metadata from the `US Maury <https://icoads.noaa.gov/maury.html>`_ Ship data collection (SID 69 and DCK 701), we will use the schema ``imma_d701``.
Output:
-------
The output of the mdf_reader is a python object with three attributes:
• **data**: python `pandas.DataFrame <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html>`_ with data values.
• **atts**: `python dictionary <https://docs.python.org/3/tutorial/datastructures.html#dictionaries>`_ with attributes of each of the output elements inherited from the input data model **schema**.
• **mask**: boolean DF with the results of the validation of each of the data model elements in its columns.
Processing of the data elements
-------------------------------
The individual data element definitions in the schema determine how each element is extracted, transformed and validated within the tool. If the data model or schema has its data elements organised in sections, the reader first identifies the string chunks corresponding to the different sections.
If the data model has no sections, the reader works with the full report as a single chunk.
Afterwards, data elements are extracted from each of these chunks, as shown in the figure below, where each element in the input dataframe is linked to its attributes (orange text) defined within the data model/schema (e.g. elements encoding type, bytes length, etc).
.. figure:: _static/images/fig1.png
:width: 100%
Schematic representation of the integral process of reading, transforming and validating a data element.
Data elements extraction and transformation
-------------------------------------------
The data element extraction and transformation from the initial string to the output dataframe occurs mainly in 3 steps:
1. **Elements extraction and missing data tagging**:
Done using `mdf_reader.import_data.main() <https://mdf-reader.readthedocs.io/en/mdf_reader/autoapi/mdf_reader/reader/import_data/index.html#module-mdf_reader.reader.import_data>`_, where individual data elements are extracted as 'objects' from the full report string and missing data is recognised as ``NA/NaN`` values in the resulting dataframe.
Strings that are recognised as missing from the source are `pandas` defaults, plus:
* Those defined in the data model/schema as NaN by making use of the ``missing_value`` attribute.
* Those defined as blanks if ``disable_white_strip`` is not set to ``True``
2. **Unpacking of encoded elements**:
Data elements with encoding defined in the schema element attributes are decoded and cast to their declared ``column_type`` [#f1]_. Elements where the decoding fails or is not recognised by the tool are marked as ``NA/NaN`` values in the resulting dataframe.
3. **Element conversion**:
Data elements are converted (and optionally transformed) to their final data types (and units) if specified in the data model/schema.
*Numeric* type elements:
* Safe conversion to numeric; ``NaN`` where conversion is not possible.
* There is the option of applying to each element a *scale* and an *offset*: ``offset + scale*i``
* Safe conversion of ``column_type``
*object*, *string* and *key* type elements:
Leading and trailing whitespace stripping unless otherwise declared in ``disable_white_strip`` (disable all, leading or trailing blank stripping).
*datetime* type elements:
Safe parsing to datetime objects with `pandas.to_datetime() <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html>`_, assigning `NaT` where the conversion is not possible.
Validation of elements against the schema or data model
-------------------------------------------------------
Data model validation is initiated after each element unpacking and conversion. New ``NA/NaN`` values in the data (not identified as missing values during extraction) are understood by the tool to have failed unpacking or conversion, and thus are not validated against the data model. The resulting preliminary validation mask values are:
* ``False``: invalid decoding, conversion
* ``True``: missing data, rest
Once elements are in the final form, *numeric* and *key* elements are validated against their corresponding attributes in the schema (``valid_max|valid_min`` and ``codetable``, respectively), with the final values in the validation mask being:
* ``False``: invalid decoding, conversion, data model values
* ``True``: missing data, rest
Overall, the validation process exception handling is:
* Missing values: ``True``
* Numeric type elements where either upper|lower bound is missing: ``False``
* key type elements where no codetable is found (or defined in the data model): ``False``
* Rest: ``True``
.. rubric:: Footnotes
.. [#f1] If ``NaN`` values are present and ``column_type`` is integer, conversion to ``column_type`` will not be possible and the data type will follow the pandas casting rules (`Missing data casting rules and indexing <https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html>`_).
.. mdf_reader documentation master file, created by
sphinx-quickstart on Fri Apr 16 14:18:24 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Tool set up
===========
The mdf_reader is a pure Python package, but it has a few dependencies that rely on specific Python and module versions. The tool has been tested with Python version 3.7 on Linux and Mac OS systems.
1. Clone the repository
~~~~~~~~~~~~~~~~~~~~~~~~
Clone the latest version via::
$ git clone git@git.noc.ac.uk:brecinosrivas/mdf_reader.git
.. _git: https://git-scm.com/book/en/v2/Getting-Started-Installing-Git
2. Install a python environment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For this you can install and use `pyenv <https://github.com/pyenv/pyenv>`_ and create a new virtual environment
with the python version needed (**3.7.3**) using `pyenv-virtualenv <https://github.com/pyenv/pyenv-virtualenv>`_.
If you install pyenv and pyenv-virtualenv you can create an environment with a fixed python version::
$ pyenv install 3.7.3
$ pyenv virtualenv 3.7.3 mdfreader_env
$ pyenv activate mdfreader_env
As another option you can use conda. See the `conda docs <https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands>`_
for more information about how to create an environment from the command line.
Or you can do what I usually do (much faster), install `mamba <https://github.com/mamba-org/mamba>`_.
3. Install dependencies
~~~~~~~~~~~~~~~~~~~~~~~
If you used **pyenv** for your environment, once activated you can install the dependencies using `pip <https://pip.pypa.io/en/stable/>`_::
$ pip install numpy==1.16.2 pandas==0.24.2 matplotlib==3.0.3
Check the conda or mamba documentation to install dependencies via those tools.
.. warning:: **The pandas version is particularly important since it needs to be compatible with the way of importing the json module used in the code.**
4. Optional step: install jupyter notebook
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Install `jupyter notebook <https://jupyter.org/install>`_ and `IPython <https://jupyter.readthedocs.io/en/latest/install.html>`_ for an easy overview of the tool and to make use of the tutorials under ``~/mdf_reader/docs/notebooks``::
$ pip install notebook
$ pip install ipykernel
Check the libraries' documentation in the links above to install them via conda or mamba.
To add a new kernel that loads your notebooks with the right environment (``mdfreader_env``), run::
$ python -m ipykernel install --user --name=mdfreader_env
$ jupyter notebook
When you open the notebook, make sure you select the kernel or environment with the name ``mdfreader_env``. You can also
test the notebook by adding and executing the following code in a jupyter-notebook cell::
from platform import python_version
import sys
print(python_version())
print(sys.executable)
print(sys.version)
print(sys.version_info)
And you should see the following information for your ``mdfreader_env``::
/Users/username/.pyenv/versions/3.7.3/envs/mdfreader_env/bin/python
3.7.3 (default, Feb 4 2021, 14:32:54)
[Clang 12.0.0 (clang-1200.0.32.28)]
sys.version_info(major=3, minor=7, micro=3, releaselevel='final', serial=0)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment