#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 13 15:14:51 2018

Read data file format json schema to dictionary

"""

import os
import sys
import json
import logging
import shutil
from copy import deepcopy
import glob
iregon's avatar
iregon committed
17

Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
from .. import properties

# Flag telling whether the interpreter is Python 3 or newer.
py3 = sys.version_info[0] >= 3


# Directory holding this module; the schema library and the schema templates
# are resolved relative to it.
toolPath = os.path.dirname(os.path.abspath(__file__))
schema_lib = os.path.join(toolPath, 'lib')
templates_path = os.path.join(schema_lib, 'templates', 'schemas')


def read_schema(schema_name = None, ext_schema_path = None):
    """
    Read a data model schema (json) into a dictionary and make it explicit.

    The schema file is expected at ``<schema_path>/<schema_name>.json``, where
    ``schema_path`` is either ``schema_lib/<schema_name>`` (when ``schema_name``
    is given) or ``ext_schema_path`` (in which case the schema name is the
    basename of that directory).

    After loading, information that is implicit in the file is filled in:

    * a schema with no sections is wrapped in a single section named
      ``properties.dummy_level``, which inherits ``delimiter`` and
      ``field_layout`` from the schema header;
    * ``parsing_order`` defaults to all the sections, sequentially;
    * every section that is read gets an explicit ``disable_read`` (False)
      and ``field_layout`` ('delimited' if the section header sets a
      delimiter, 'fixed_width' otherwise).

    Parameters
    ----------
    schema_name : str, optional
        Name of a data model listed in ``properties.supported_data_models``.
    ext_schema_path : str, optional
        Path to a directory holding an external schema. Only used when
        ``schema_name`` is not given.

    Returns
    -------
    dict or None
        The expanded schema, or None if an error was reported (unsupported
        model, missing arguments, missing file, incomplete schema or
        multiple reports per line requested).
    """
    if schema_name:
        if schema_name not in properties.supported_data_models:
            print('ERROR: \n\tInput data model "{}" not supported. See mdf_reader.properties.supported_data_models for supported data models'.format(schema_name))
            return
        else:
            schema_path = os.path.join(schema_lib,schema_name)
    elif ext_schema_path is not None:
        schema_path = os.path.abspath(ext_schema_path)
        schema_name = os.path.basename(schema_path)
    else:
        # Neither argument given: report it instead of failing in os.path
        logging.error('Either a data model name or an external schema path must be provided')
        return

    schema_file = os.path.join(schema_path, schema_name + '.json')
    if not os.path.isfile(schema_file):
        logging.error('Can\'t find input schema file {}'.format(schema_file))
        return
    with open(schema_file) as fileObj:
        schema = json.load(fileObj)

    #   ---------------------------------------------------------------------------
    #   FILL IN THE INITIAL SCHEMA TO "FULL COMPLEXITY" TO HOMOGENIZE
    #   EXPLICITLY ADD INFO THAT IS IMPLICIT TO GIVEN SITUATIONS/SUBFORMATS
    #   ---------------------------------------------------------------------------
    # One report per record: make sure later changes are reflected in MULTIPLE
    # REPORTS PER RECORD case below if we ever use it!
    # Currently only supported case: one report per record (line)
    # First check for no header case: sequential sections.
    # A missing 'header' key is handled like an empty header.
    if not schema.get('header'):
        if not schema.get('sections'):
            logging.error('\'sections\' block needs to be defined in a schema with no header. Error in data model schema file {}'.format(schema_file))
            return
        schema['header'] = dict()
    if not schema['header'].get('multiple_reports_per_line'):
        # Make no section formats be 1 section format
        if not schema.get('sections'):
            if not schema.get('elements'):
                logging.error('Data elements not defined in data model schema file {} under key \'elements\' '.format(schema_file))
                return
            # The single dummy section takes over the layout information
            # declared at schema level
            dummy_header = {
                'delimiter': schema['header'].pop('delimiter', None),
                'field_layout': schema['header'].pop('field_layout', None),
            }
            schema['sections'] = {properties.dummy_level:{'header':dummy_header,'elements':schema.pop('elements')}}
            schema['header']['parsing_order'] = [{'s':[properties.dummy_level]}]
        # Make parsing order explicit
        if not schema['header'].get('parsing_order'):# assume sequential
            schema['header']['parsing_order'] = [{'s':list(schema['sections'].keys())}]
        # Make disable_read and field_layout explicit: this is ruled by delimiter or length being set,
        # unless explicitly set
        for section in schema['sections']:
            # A section declared without a header gets an empty one
            section_header = schema['sections'][section].setdefault('header', {})
            if section_header.get('disable_read'):
                continue
            section_header['disable_read'] = False
            if not section_header.get('field_layout'):
                delimiter = section_header.get('delimiter')
                section_header['field_layout'] = 'delimited' if delimiter else 'fixed_width'
        return schema
    else:
        logging.error('Multile reports per line data model: not yet supported')
        return
        # 1X: MULTIPLE REPORTS PER RECORD
        # !!!! NEED TO ADD SECTION LENS TO THE REPORT'S SECTION'S HEADER!!!
        # CAN INFER FROM ELEMENTS LENGTH AND ADD, OR MAKE AS REQUIREMENT TO BE GIVEN
        # global name_report_section
        # Have to assess how the section splitting works when x sequential
        # sections are declared, and only x-y are met.
        #if not schema['header'].get('reports_per_line'):
        #    schema['header']['reports_per_line'] = 24
        #if not schema.get('sections'):
        #    schema['sections'] = dict()
        #    schema['header']['parsing_order'] = [{'s':[]}]
        #    for i in range(1,schema['header']['reports_per_line'] + 1):
        #        schema['sections'].update({str(i):{'header':{},'elements':deepcopy(schema.get('elements'))}})
        #else:
        #    name_report_section = list(schema['sections'].keys())[-1]
        #    schema['header']['name_report_section'] == name_report_section
        #    schema['header']['parsing_order'] = [{'s':list(schema['sections'].keys())[:-1]}]
        #    for i in range(1,schema['header']['reports_per_line'] + 1):
        #        schema['sections'].update({str(i):schema['sections'].get(name_report_section)})
        #    schema['sections'].pop(name_report_section,None)
        #for i in range(1,schema['header']['reports_per_line'] + 1):
        #    schema['header']['parsing_order'][0]['s'].append(str(i))
        #return schema
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
116

iregon's avatar
iregon committed
117 118
def df_schema(df_columns, schema):
    """
    Flatten a data model schema and keep only the requested columns.

    The elements of every section are merged into a single dictionary:

    * elements of the ``properties.dummy_level`` section are keyed by their
      element name;
    * a section whose header sets ``disable_read`` contributes one entry,
      keyed ``(section, section)``, with ``{'column_type':'object'}``;
    * elements of any other section are keyed ``(section, element)``.

    Parameters
    ----------
    df_columns : container
        Column names to keep; any object supporting the ``in`` operator.
    schema : dict
        Schema with a 'sections' block. A schema without sections yields
        an empty result.

    Returns
    -------
    dict
        Flattened schema restricted to the keys found in ``df_columns``.
    """
    flat_schema = dict()
    # Flatten main model schema
    sections = schema.get('sections') or {}
    for section, content in sections.items():
        if section == properties.dummy_level:
            flat_schema.update(content.get('elements'))
        elif content.get('header').get('disable_read'):
            flat_schema.update( { (section, section): {'column_type':'object'} })
        else:
            elements = content.get('elements')
            flat_schema.update( { (section, x): elements.get(x) for x in elements })

    # Could optionally add cleaning of element descriptors that only apply
    # to the initial reading of the data model: field_length, etc....
    return { element: descriptor for element, descriptor in flat_schema.items() if element in df_columns }
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163


def templates():
    """
    List the names of the schema templates found in ``templates_path``.

    Returns
    -------
    list of str
        File names of the ``*.json`` files in ``templates_path`` with the
        '.json' extension removed, sorted alphabetically. Only the final
        extension is stripped, so that ``name + '.json'`` always maps back
        to the template file, also for names containing dots.
    """
    schemas = glob.glob(os.path.join(templates_path,'*.json'))
    return sorted(os.path.splitext(os.path.basename(x))[0] for x in schemas)

def copy_template(schema, out_dir = None,out_path = None):
    """
    Copy a schema template to a user location.

    Outcome and errors are reported with print; nothing is returned.

    Parameters
    ----------
    schema : str
        Name of the template; must be one of the names given by templates().
    out_dir : str, optional
        Directory to copy the template to, as ``<out_dir>/<schema>.json``.
        Only used when ``out_path`` is not given.
    out_path : str, optional
        Full path of the output file. Takes precedence over ``out_dir``.
    """
    schemas = templates()
    if schema not in schemas:
        print('copy_template ERROR:')
        print('\tRequested template {} must be a valid name.'.format(schema))
        print('\tValid names are: {}'.format(", ".join(schemas)))
        return
    if not out_path and out_dir is None:
        # No destination at all: report it instead of failing in os.path.join
        print('copy_template ERROR:')
        print('\tEither out_dir or out_path must be provided.')
        return

    schema_path = os.path.join(templates_path,schema + '.json')
    schema_out = out_path if out_path else os.path.join(out_dir,schema + '.json')
    shutil.copyfile(schema_path,  schema_out)
    if os.path.isfile( schema_out):
        print('Schema template {0} copied to {1}'.format(schema, schema_out))
    else:
        print('copy_template ERROR:')
        print('\tError copying schema template {0} copied to {1}'.format(schema, schema_out))
    return