schemas.py 6.74 KB
Newer Older
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
1 2 3 4 5 6 7
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 13 15:14:51 2018

Read a data file format JSON schema into a dictionary.

Add schema validation:
    - check that mandatory fields are not null
    - check fixed options
...and return None if it does not validate.

"""


import os
import sys
import json
import logging
import shutil
from copy import deepcopy
import glob
iregon's avatar
iregon committed
24

Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
from .. import properties

# Flag telling whether the interpreter is Python 3 or newer
py3 = sys.version_info[0] >= 3


# Directory holding this module, the schema library found next to it and
# the schema templates stored inside that library
toolPath = os.path.dirname(os.path.abspath(__file__))
schema_lib = os.path.join(toolPath, 'lib')
templates_path = os.path.join(schema_lib, 'templates', 'schemas')

def templates():
    """
    List the names of the schema templates available in templates_path.

    Returns
    -------
    list of str
        Base names, without the '.json' extension, of every '*.json' file
        found in templates_path, sorted alphabetically.
    """
    schemas = glob.glob(os.path.join(templates_path,'*.json'))
    # splitext only strips the final extension, so a template whose name
    # contains a dot keeps its full name and can still be located afterwards
    # as <name>.json. Sorting makes the listing independent of the arbitrary
    # order glob returns.
    return sorted(os.path.splitext(os.path.basename(x))[0] for x in schemas)

def copy_template(schema, out_dir = None,out_path = None):
    """
    Copy a schema template file to a user supplied location.

    Outcome and errors are reported with print(); nothing is returned.

    Parameters
    ----------
    schema : str
        Name of the template to copy. Must be one of the names returned
        by templates().
    out_dir : str, optional
        Directory to copy the template to, as <out_dir>/<schema>.json.
        Only used when out_path is not given.
    out_path : str, optional
        Full path of the output file. Takes precedence over out_dir.
    """
    schemas = templates()
    if schema not in schemas:
        print('copy_template ERROR:')
        print('\tRequested template {} must be a valid name.'.format(schema))
        print('\tValid names are: {}'.format(", ".join(schemas)))
        return

    # Without a destination os.path.join would fail with a TypeError:
    # report it in the same way as the rest of the errors instead
    if not out_path and out_dir is None:
        print('copy_template ERROR:')
        print('\tEither out_dir or out_path must be provided.')
        return

    schema_path = os.path.join(templates_path,schema + '.json')
    schema_out = out_path if out_path else os.path.join(out_dir,schema + '.json')
    try:
        shutil.copyfile(schema_path, schema_out)
    except (IOError, OSError, shutil.Error) as error:
        # copyfile raises on failure (missing directory, permissions, same
        # file...), so the error has to be reported from here
        print('copy_template ERROR:')
        print('\tError copying schema template {0} to {1}: {2}'.format(schema, schema_out, error))
        return

    if os.path.isfile(schema_out):
        print('Schema template {0} copied to {1}'.format(schema, schema_out))
    else:
        print('copy_template ERROR:')
        print('\tError copying schema template {0} to {1}'.format(schema, schema_out))
    return

def read_schema(schema_name = None, ext_schema_path = None):
    """
    Read a schema json file into a dictionary and make explicit the
    information that is implicit in the file.

    The schema file is looked for as <schema directory>/<schema name>.json,
    where the directory is either schema_lib/<schema_name> or the external
    directory given in ext_schema_path.

    Parameters
    ----------
    schema_name : str, optional
        Name of a schema in the schema library. Must be listed in
        properties.supported_file_formats. Takes precedence over
        ext_schema_path.
    ext_schema_path : str, optional
        Path to an external schema directory. The schema name is taken
        from the last component of the path.

    Returns
    -------
    dict or None
        The completed schema. None if no schema is requested, the schema
        name is not supported or the schema file cannot be found.
    """
    if schema_name:
        if schema_name not in properties.supported_file_formats:
            print('ERROR: \n\tInput schema "{}" not supported. See mdf_reader.properties.supported_file_formats for supported file format schemas'.format(schema_name))
            return
        else:
            schema_path = os.path.join(schema_lib,schema_name)
    elif ext_schema_path is not None:
        schema_path = os.path.abspath(ext_schema_path)
        schema_name = os.path.basename(schema_path)
    else:
        # Nothing to read: report it instead of failing on abspath(None)
        logging.error('No input schema: either schema_name or ext_schema_path must be provided')
        return

    schema_file = os.path.join(schema_path, schema_name + '.json')
    if not os.path.isfile(schema_file):
        logging.error('Can\'t find input schema file {}'.format(schema_file))
        return
    with open(schema_file) as fileObj:
        schema = json.load(fileObj)
    #   ---------------------------------------------------------------------------
    #   FILL IN THE INITIAL SCHEMA TO "FULL COMPLEXITY"
    #   EXPLICITLY ADD INFO THAT IS IMPLICIT TO GIVEN SITUATIONS/SUBFORMATS
    #   ---------------------------------------------------------------------------
    header = schema['header']

    # One report per record: make sure later changes are reflected in MULTIPLE
    # REPORTS PER RECORD case below if we ever use it!
    if not header.get('multiple_reports_per_line'):
        # Make no section formats be 1 section format
        if not schema.get('sections'):
            dummy_header = {}
            schema['sections'] = {properties.dummy_level:{'header':dummy_header,'elements':schema.get('elements')}}
            header['parsing_order'] = [{'s':[properties.dummy_level]}]
            schema.pop('elements',None)
            # Layout info moves from the main header to the only section
            dummy_header['delimiter'] = header.pop('delimiter',None)
            dummy_header['field_layout'] = header.pop('field_layout',None)
        # Make parsing order explicit
        if not header.get('parsing_order'):# assume sequential
            header['parsing_order'] = [{'s':list(schema['sections'].keys())}]
        # Make disable_read and field_layout explicit: this is ruled by delimiter or length being set,
        # unless explicitly set
        for section in schema['sections'].values():
            section_header = section['header']
            if section_header.get('disable_read'):
                # Sections that are not read are left untouched
                continue
            section_header['disable_read'] = False
            if not section_header.get('field_layout'):
                delimiter = section_header.get('delimiter')
                section_header['field_layout'] = 'delimited' if delimiter else 'fixed_width'
        return schema

    # 1X: MULTIPLE REPORTS PER RECORD
    # !!!! NEED TO ADD SECTION LENS TO THE REPORT'S SECTION'S HEADER!!!
    # CAN INFER FROM ELEMENTS LENGTH AND ADD, OR MAKE AS REQUIREMENT TO BE GIVEN
    # Have to assess how the section splitting works when x sequential
    # sections are declared, and only x-y are met.
    if not header.get('reports_per_line'):
        header['reports_per_line'] = 24
    # Report sections are named after their position in the line: '1', '2'...
    report_names = [ str(i) for i in range(1,header['reports_per_line'] + 1) ]
    if not schema.get('sections'):
        schema['sections'] = dict()
        header['parsing_order'] = [{'s':[]}]
        for report_name in report_names:
            schema['sections'][report_name] = {'header':{},'elements':deepcopy(schema.get('elements'))}
    else:
        # The last section declared is the one repeated for every report
        name_report_section = list(schema['sections'].keys())[-1]
        # Assignment (was a no-op '==' comparison that raised KeyError
        # whenever the key was not already in the header)
        header['name_report_section'] = name_report_section
        header['parsing_order'] = [{'s':list(schema['sections'].keys())[:-1]}]
        report_section = schema['sections'].pop(name_report_section)
        for report_name in report_names:
            # Independent copy per report, as in the no sections case, so
            # that editing one report section does not alter the others
            schema['sections'][report_name] = deepcopy(report_section)
    header['parsing_order'][0]['s'].extend(report_names)
    return schema


iregon's avatar
iregon committed
133 134
def df_schema(df_columns, schema):
    """
    Flatten the sections of a schema into a single dictionary of element
    descriptors, restricted to the elements listed in df_columns.

    Keys of the output are:
        - the element name, for elements in the properties.dummy_level section
        - (section, section), for sections with disable_read set; these get
          {'column_type':'object'} as descriptor
        - (section, element name), for elements in any other section

    Parameters
    ----------
    df_columns : collection supporting the `in` operator
        Keys to keep in the output.
    schema : dict
        Schema with a 'sections' entry.

    Returns
    -------
    dict
        The flattened schema, without the keys not found in df_columns.
    """
    flattened = dict()
    # Flatten main model schema
    for section in schema.get('sections'):
        content = schema['sections'].get(section)
        if section == properties.dummy_level:
            flattened.update(content.get('elements'))
            continue
        if content.get('header').get('disable_read'):
            flattened[(section, section)] = {'column_type':'object'}
            continue
        elements = content.get('elements')
        for name in elements:
            flattened[(section, name)] = elements.get(name)

    # Could optionally add cleaning of element descriptors that only apply
    # to the initial reading of the data model: field_length, etc....
    return { key: descriptor for key, descriptor in flattened.items() if key in df_columns }