Commit dc6a92f7 authored by iregon's avatar iregon
Browse files

New comments

parent 5578e606
...@@ -3,7 +3,14 @@ ...@@ -3,7 +3,14 @@
""" """
Created on Thu Sep 13 15:14:51 2018 Created on Thu Sep 13 15:14:51 2018
Read data file format json schema to dictionary .read_schema: read data model json schema to dictionary
.df_schema: create a simple version of the schema reflecting only relevant attributes
of the data elements after being read into a dataframe
.templates: get list of available schema file templates
.copy_templates: get a copy of a schema file template
""" """
...@@ -29,7 +36,8 @@ templates_path = os.path.join(schema_lib,'templates','schemas') ...@@ -29,7 +36,8 @@ templates_path = os.path.join(schema_lib,'templates','schemas')
def read_schema(schema_name = None, ext_schema_path = None): def read_schema(schema_name = None, ext_schema_path = None):
# 1. Validate input
if schema_name: if schema_name:
if schema_name not in properties.supported_data_models: if schema_name not in properties.supported_data_models:
print('ERROR: \n\tInput data model "{}" not supported. See mdf_reader.properties.supported_data_models for supported data models'.format(schema_name)) print('ERROR: \n\tInput data model "{}" not supported. See mdf_reader.properties.supported_data_models for supported data models'.format(schema_name))
...@@ -44,24 +52,27 @@ def read_schema(schema_name = None, ext_schema_path = None): ...@@ -44,24 +52,27 @@ def read_schema(schema_name = None, ext_schema_path = None):
if not os.path.isfile(schema_file): if not os.path.isfile(schema_file):
logging.error('Can\'t find input schema file {}'.format(schema_file)) logging.error('Can\'t find input schema file {}'.format(schema_file))
return return
# 2. Get schema
with open(schema_file) as fileObj: with open(schema_file) as fileObj:
schema = json.load(fileObj) schema = json.load(fileObj)
# 3. Expand schema
# Fill in the initial schema to "full complexity": to homogenize schema,
# explicitly add info that is implicit to given situations/data models
# ---------------------------------------------------------------------------
# FILL IN THE INITIAL SCHEMA TO "FULL COMPLEXITY" TO HOMOGEINIZE
# EXPLICITY ADD INFO THAT IS IMPLICIT TO GIVEN SITUATIONS/SUBFORMATS
# ---------------------------------------------------------------------------
# One report per record: make sure later changes are reflected in MULTIPLE # One report per record: make sure later changes are reflected in MULTIPLE
# REPORTS PER RECORD case below if we ever use it! # REPORTS PER RECORD case below if we ever use it!
# Currently only suppoerted case: one report per record (line) # Currently only supported case: one report per record (line)
# First check for no header case: sequential sections # 3.1. First check for no header case: sequential sections
if not schema['header']: if not schema['header']:
if not schema['sections']: if not schema['sections']:
logging.error('\'sections\' block needs to be defined in a schema with no header. Error in data model schema file {}'.format(schema_file)) logging.error('\'sections\' block needs to be defined in a schema with no header. Error in data model schema file {}'.format(schema_file))
return return
schema['header'] = dict() schema['header'] = dict()
if not schema['header'].get('multiple_reports_per_line'): if not schema['header'].get('multiple_reports_per_line'):
# Make no section formats be 1 section format # 3.2. Make no section formats be internally treated as 1 section format
if not schema.get('sections'): if not schema.get('sections'):
if not schema.get('elements'): if not schema.get('elements'):
logging.error('Data elements not defined in data model schema file {} under key \'elements\' '.format(schema_file)) logging.error('Data elements not defined in data model schema file {} under key \'elements\' '.format(schema_file))
...@@ -73,10 +84,10 @@ def read_schema(schema_name = None, ext_schema_path = None): ...@@ -73,10 +84,10 @@ def read_schema(schema_name = None, ext_schema_path = None):
schema['header'].pop('delimiter',None) schema['header'].pop('delimiter',None)
schema['sections'][properties.dummy_level]['header']['field_layout'] = schema['header'].get('field_layout') schema['sections'][properties.dummy_level]['header']['field_layout'] = schema['header'].get('field_layout')
schema['header'].pop('field_layout',None) schema['header'].pop('field_layout',None)
# Make parsing order explicit # 3.3. Make parsing order explicit
if not schema['header'].get('parsing_order'):# assume sequential if not schema['header'].get('parsing_order'):# assume sequential
schema['header']['parsing_order'] = [{'s':list(schema['sections'].keys())}] schema['header']['parsing_order'] = [{'s':list(schema['sections'].keys())}]
# Make disable_read and field_layout explicit: this is ruled by delimiter or length being set, # 3.4. Make disable_read and field_layout explicit: this is ruled by delimiter being set,
# unless explicitly set # unless explicitly set
for section in schema['sections'].keys(): for section in schema['sections'].keys():
if schema['sections'][section]['header'].get('disable_read'): if schema['sections'][section]['header'].get('disable_read'):
...@@ -138,7 +149,6 @@ def df_schema(df_columns, schema): ...@@ -138,7 +149,6 @@ def df_schema(df_columns, schema):
return flat_schema return flat_schema
def templates(): def templates():
schemas = glob.glob(os.path.join(templates_path,'*.json')) schemas = glob.glob(os.path.join(templates_path,'*.json'))
return [ os.path.basename(x).split(".")[0] for x in schemas ] return [ os.path.basename(x).split(".")[0] for x in schemas ]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment