From df5666640e9e36db32897f47b81e56f283e1caa3 Mon Sep 17 00:00:00 2001 From: perezgonzalez-irene <iregon@noc.ac.uk> Date: Thu, 27 Feb 2020 08:51:42 +0000 Subject: [PATCH] Adapted to renamed schemas package (data_models) --- read.py | 158 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 139 insertions(+), 19 deletions(-) diff --git a/read.py b/read.py index f5938a1..7bfc50a 100644 --- a/read.py +++ b/read.py @@ -1,21 +1,22 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Created on Tue Apr 30 09:38:17 2019 + +Manages the integral sequence in data file reading +from a data model: + - Access to data model + - Data file import + - Data file reading + - Data validation + - Output -Reads a data file to a pandas DataFrame using a pre-defined data model. +Contains the following functions: -The data model needs to be input to the module as a named model (included in the module) or as the path to a valid data model. - -Data elements are validated against its data model after reading, producing a boolean mask. - -Uses submodules: -- schemas -- reader -- valiate - -@author: iregon + * ERV - does the actual extraction, read and validation of input data + * main - the main function of the script + """ + import os import sys import pandas as pd @@ -24,20 +25,45 @@ import json import copy from io import StringIO as StringIO -from . import schemas +from .data_models import schemas from . 
import properties from .common import pandas_TextParser_hdlr from .reader import import_data from .reader import get_sections -from .reader import read_sections +from .reader.read_sections import main as read_sections from .validate import validate toolPath = os.path.dirname(os.path.abspath(__file__)) -schema_lib = os.path.join(toolPath,'schemas','lib') +schema_lib = os.path.join(toolPath,'data_models','lib') # AUX FUNCTIONS --------------------------------------------------------------- def ERV(TextParser,read_sections_list, schema, code_tables_path): + """ + + Extracts, reads and validates input data. + + + Parameters + ---------- + TextParser : list or pandas.io.parsers.TextFileReader + The data to extract and read + read_sections_list : list + List with subset of data model sections to output + schema : dict + Data model schema + code_tables_path : str + Path to data model code tables + + Returns + ------- + data : pandas.DataFrame, pandas.io.parsers.TextFileReader + Contains the input data extracted and read + valid : pandas.DataFrame, pandas.io.parsers.TextFileReader + Contains a boolean mask with the data validation output + + """ + data_buffer = StringIO() valid_buffer = StringIO() @@ -54,7 +80,7 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path): # may vary if gaps, keep track of data types: add Intxx pandas classes rather than intxx to avoid this! 
# Sections are parsed in the same order as sections_df.columns - [data_df, valid_df, out_dtypesi ] = read_sections.read_sections(sections_df, schema) + [data_df, valid_df, out_dtypesi ] = read_sections(sections_df, schema) if i_chunk == 0: out_dtypes = copy.deepcopy(out_dtypesi) @@ -94,6 +120,22 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path): return data, valid def validate_arg(arg_name,arg_value,arg_type): + """ + + Validates input argument is of the expected type + + Parameters + ---------- + arg_name : str + arg_value : arg_type + arg_type : python type + + Returns + ------- + True,False + + """ + if arg_value and not isinstance(arg_value,arg_type): logging.error('Argument {0} must be {1}, input type is {2}'.format(arg_name,arg_type,type(arg_value))) return False @@ -101,6 +143,21 @@ def validate_arg(arg_name,arg_value,arg_type): return True def validate_path(arg_name,arg_value): + """ + + Validates input argument is an existing directory + + Parameters + ---------- + arg_name : str + arg_value : str + + Returns + ------- + True,False + + """ + if arg_value and not os.path.isdir(arg_value): logging.error('{0} could not find path {1}'.format(arg_name,arg_value)) return False @@ -110,9 +167,56 @@ def validate_path(arg_name,arg_value): # END AUX FUNCTIONS ----------------------------------------------------------- -def read(source, data_model = None, data_model_path = None, sections = None,chunksize = None, +def main(source, data_model = None, data_model_path = None, sections = None,chunksize = None, skiprows = None, out_path = None ): - + """ + + Reads a data file to a pandas DataFrame using a pre-defined data model. + Read data is validated against its data model producing a boolean mask + on output. + + The data model needs to be input to the module as a named model + (included in the module) or as the path to a valid data model. 
+ + Arguments + --------- + source : str + The file path to read + + Keyword Arguments + ----------------- + data_model : str, optional + Name of internally available data model + data_model_path : str, optional + Path to external data model + sections : list, optional + List with subset of data model sections to output (default is + all) + chunksize : int, optional + Number of reports per chunk (default is + no chunking) + skiprows : int, optional + Number of initial rows to skip from file (default is 0) + out_path : str, optional + Path to output data, valid mask and attributes (default is + no output) + + Returns + ------- + output : object + Attributes data, mask and atts contain the corresponding + information from the data file. + + Note + ---- + + This module can also be run as a script, with the keyword arguments + as name_arg=arg + + + + + """ logging.basicConfig(format='%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s', level=logging.INFO,datefmt='%Y%m%d %H:%M:%S',filename=None) @@ -209,6 +313,22 @@ def read(source, data_model = None, data_model_path = None, sections = None,chun # 5. RETURN DATA class output(): + """ Class to represent reader output + + + Attributes + ---------- + data : str + a pandas.DataFrame or pandas.io.parsers.TextFileReader + with the output data + atts : dict + a dictionary with the output data elements attributes + mask : str + a pandas.DataFrame or pandas.io.parsers.TextFileReader + with the output data validation mask + + """ + def __init__(self): self.data = data self.atts = out_atts @@ -221,4 +341,4 @@ if __name__=='__main__': kwargs = dict(arg.split('=') for arg in sys.argv[2:]) if 'sections' in kwargs.keys(): kwargs.update({ 'sections': [ x.strip() for x in kwargs.get('sections').split(",")] }) - read(sys.argv[1], **kwargs) # kwargs + main(sys.argv[1], **kwargs) # kwargs -- GitLab