schemas.py 9.23 KB
Newer Older
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
1 2 3 4
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""

This module has functions to manage data model
schema files and objects according to the
requirements of the data reader tool

"""

import os
import sys
import json
import logging
import shutil
from copy import deepcopy
import glob
iregon's avatar
iregon committed
18

Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
19 20 21 22 23 24 25 26 27 28 29 30 31 32
from .. import properties

# True when running under Python 3 or newer, False otherwise
py3 = sys.version_info[0] >= 3


# Directory holding this module, the schema library shipped next to it and
# the directory with the schema file templates inside that library
toolPath = os.path.dirname(os.path.abspath(__file__))
schema_lib = os.path.join(toolPath, 'lib')
templates_path = os.path.join(toolPath, 'lib', 'templates', 'schemas')


def read_schema(schema_name = None, ext_schema_path = None):
    """

    Reads a data model schema file to a dictionary and
    completes it by adding explicitly information the
    reader tool needs

    Keyword Arguments
    -----------------
    schema_name : str, optional
        The name of data model to read. This is for
        data models included in the tool
    ext_schema_path : str, optional
        The path to the external data model schema directory.
        The schema file is expected inside it, named after the
        directory: <ext_schema_path>/<basename>.json


    Either schema_name or ext_schema_path must be provided.
    If both are given, schema_name takes precedence.


    Returns
    -------
    dict
        Data model schema, or None (after logging an error) if the
        input is invalid, the schema file cannot be found, the schema
        is incomplete or it declares multiple reports per line

    """

    # 1. Validate input
    if schema_name:
        if schema_name not in properties.supported_data_models:
            logging.error('Input data model "{}" not supported. See mdf_reader.properties.supported_data_models for supported data models'.format(schema_name))
            return
        schema_path = os.path.join(schema_lib, schema_name)
    elif ext_schema_path is not None:
        schema_path = os.path.abspath(ext_schema_path)
        schema_name = os.path.basename(schema_path)
    else:
        # Neither argument given: report it instead of failing inside os.path
        logging.error('Either schema_name or ext_schema_path must be provided')
        return

    schema_file = os.path.join(schema_path, schema_name + '.json')
    if not os.path.isfile(schema_file):
        logging.error('Can\'t find input schema file {}'.format(schema_file))
        return

    # 2. Get schema
    with open(schema_file) as fileObj:
        schema = json.load(fileObj)

    # 3. Expand schema
    # Fill in the initial schema to "full complexity": to homogeneize schema,
    # explicitly add info that is implicit to given situations/data models

    # One report per record: make sure later changes are reflected in MULTIPLE
    # REPORTS PER RECORD case below if we ever use it!
    # Currently only supported case: one report per record (line)
    # 3.1. First check for no header case: sequential sections
    # (.get so that a schema with no 'header'/'sections' key at all is
    # handled like one where the key is present but empty)
    if not schema.get('header'):
        if not schema.get('sections'):
            logging.error('\'sections\' block needs to be defined in a schema with no header. Error in data model schema file {}'.format(schema_file))
            return
        schema['header'] = dict()
    header = schema['header']

    if header.get('multiple_reports_per_line'):
        # 1X: MULTIPLE REPORTS PER RECORD
        # !!!! NEED TO ADD SECTION LENS TO THE REPORT'S SECTION'S HEADER!!!
        # CAN INFER FROM ELEMENTS LENGHT AND ADD, OR MAKE AS REQUIREMENT TO BE GIVEN
        # global name_report_section
        # Have to assess how the section splitting works when x sequential
        # sections are declared, and only x-y are met.
        #if not schema['header'].get('reports_per_line'):
        #    schema['header']['reports_per_line'] = 24
        #if not schema.get('sections'):
        #    schema['sections'] = dict()
        #    schema['header']['parsing_order'] = [{'s':[]}]
        #    for i in range(1,schema['header']['reports_per_line'] + 1):
        #        schema['sections'].update({str(i):{'header':{},'elements':deepcopy(schema.get('elements'))}})
        #else:
        #    name_report_section = list(schema['sections'].keys())[-1]
        #    schema['header']['name_report_section'] == name_report_section
        #    schema['header']['parsing_order'] = [{'s':list(schema['sections'].keys())[:-1]}]
        #    for i in range(1,schema['header']['reports_per_line'] + 1):
        #        schema['sections'].update({str(i):schema['sections'].get(name_report_section)})
        #    schema['sections'].pop(name_report_section,None)
        #for i in range(1,schema['header']['reports_per_line'] + 1):
        #    schema['header']['parsing_order'][0]['s'].append(str(i))
        #return schema
        logging.error('Multiple reports per line data model: not yet supported')
        return

    # 3.2. Make no section formats be internally treated as 1 section format
    if not schema.get('sections'):
        if not schema.get('elements'):
            logging.error('Data elements not defined in data model schema file {} under key \'elements\' '.format(schema_file))
            return
        # Move the top level elements, delimiter and field layout into a
        # single dummy section; absent delimiter/field_layout are kept as None
        schema['sections'] = {
            properties.dummy_level: {
                'header': {
                    'delimiter': header.pop('delimiter', None),
                    'field_layout': header.pop('field_layout', None),
                },
                'elements': schema.pop('elements'),
            }
        }
        header['parsing_order'] = [{'s': [properties.dummy_level]}]

    # 3.3. Make parsing order explicit
    if not header.get('parsing_order'):  # assume sequential
        header['parsing_order'] = [{'s': list(schema['sections'].keys())}]

    # 3.4. Make disable_read and field_layout explicit: this is ruled by delimiter being set,
    # unless explicitly set
    for section_def in schema['sections'].values():
        # A section may come without a header: give it an empty one so the
        # defaults below can be made explicit
        section_header = section_def.setdefault('header', {})
        if section_header.get('disable_read'):
            # Sections that are not read need no further expansion
            continue
        section_header['disable_read'] = False
        if not section_header.get('field_layout'):
            section_header['field_layout'] = 'delimited' if section_header.get('delimiter') else 'fixed_width'
        # Replace the integer column types listed in properties.numpy_integers
        # by their counterpart in properties.pandas_nan_integers
        for element_def in section_def['elements'].values():
            column_type = element_def.get('column_type')
            if column_type in properties.numpy_integers:
                element_def['column_type'] = properties.pandas_nan_integers.get(column_type)

    return schema
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
150

iregon's avatar
iregon committed
151
def df_schema(df_columns, schema):
    """

    Builds a flat attribute dictionary for the data elements
    found in a dataframe, taken from its data model schema

    Parameters
    ----------
    df_columns : list
        The columns in the data frame (data elements from
        the data model)
    schema : dict
        The data model schema

    Returns
    -------
    dict
        Data elements attributes, restricted to the elements
        present in df_columns. Keys are element names for the
        dummy section and (section, element) tuples otherwise

    """
    sections = schema.get('sections')

    # Flatten main model schema
    attributes = dict()
    for name in sections:
        section_def = sections.get(name)
        if name == properties.dummy_level:
            # Single section data model: elements keep their plain name
            attributes.update(section_def.get('elements'))
            continue
        if section_def.get('header').get('disable_read'):
            # Unread sections are kept as one object column named after the section
            attributes[(name, name)] = {'column_type': 'object'}
            continue
        elements = section_def.get('elements')
        for element in elements:
            attributes[(name, element)] = elements.get(element)

    # Could optionally add cleaning of element descriptors that only apply
    # to the initial reading of the data model: field_length, etc....
    # Keep only the elements that are actually in the data frame
    return {key: attributes[key] for key in attributes if key in df_columns}
194 195

def templates():
    """

    Lists the name of the available schema file templates

    The name of a template is its file name without the '.json'
    extension, so that copy_template can rebuild the file name
    from it even when the name itself contains dots

    Returns
    -------
    list
        Schema file templates alias, sorted alphabetically

    """
    schema_files = glob.glob(os.path.join(templates_path, '*.json'))
    # splitext only drops the trailing '.json'; sorting makes the listing
    # independent of the arbitrary order in which glob returns the files
    return sorted(os.path.splitext(os.path.basename(path))[0] for path in schema_files)

def copy_template(schema, out_dir = None,out_path = None):
    """

    Copies a schema file template to an output
    file or path

    Parameters
    ----------
    schema : str
        Schema template name to copy

    Keyword Arguments
    -----------------
    out_dir : str, opt
        Directory to copy schema file template to. The copy
        is named <schema>.json
    out_path : str, opt
        Full filename to copy schema file template to

    Either out_dir or out_path must be provided. If both are
    given, out_path takes precedence.

    Returns
    -------
    None
        The outcome is reported with messages printed to standard
        output. Errors raised by shutil.copyfile are not caught.

    """
    schemas = templates()
    if schema not in schemas:
        print('copy_template ERROR:')
        print('\tRequested template {} must be a valid name.'.format(schema))
        print('\tValid names are: {}'.format(", ".join(schemas)))
        return

    if not out_path and out_dir is None:
        # No destination at all: report it instead of failing inside os.path.join
        print('copy_template ERROR:')
        print('\tEither out_dir or out_path must be provided')
        return

    schema_path = os.path.join(templates_path, schema + '.json')
    schema_out = out_path if out_path else os.path.join(out_dir, schema + '.json')
    shutil.copyfile(schema_path, schema_out)

    # Confirm the copy by checking that the output file now exists
    if os.path.isfile(schema_out):
        print('Schema template {0} copied to {1}'.format(schema, schema_out))
        return

    print('copy_template ERROR:')
    print('\tError copying schema template {0} copied to {1}'.format(schema, schema_out))
    return