read.py 11.6 KB
Newer Older
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
1 2 3
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Manages the integral sequence in data file reading
from a data model:
    - Access to data model
    - Data file import
    - Data file reading
    - Data validation
    - Output

Contains the following functions:

    * ERV - does the actual extraction, read and validation of input data
    * main - the main function of the script

Can be run as a script with:
    python -m mdf_reader data_file **kwargs

"""
22

Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
23 24 25
import os
import sys
import pandas as pd
iregon's avatar
iregon committed
26 27
import logging
import json
iregon's avatar
iregon committed
28 29
import copy
from io import StringIO as StringIO
iregon's avatar
iregon committed
30

31
from .data_models import schemas
iregon's avatar
Cleaned  
iregon committed
32 33
from . import properties
from .common import pandas_TextParser_hdlr
iregon's avatar
iregon committed
34 35
from .reader import import_data
from .reader import get_sections
36
from .reader.read_sections import main as read_sections
37
from .validator import validate
iregon's avatar
iregon committed
38

iregon's avatar
iregon committed
39
toolPath = os.path.dirname(os.path.abspath(__file__))
40
schema_lib = os.path.join(toolPath,'data_models','lib')
iregon's avatar
iregon committed
41

42
# AUX FUNCTIONS ---------------------------------------------------------------
iregon's avatar
iregon committed
43
def ERV(TextParser,read_sections_list, schema, code_tables_path):
    """

    Extracts, reads and validates input data.

    Iterates over the data chunks in TextParser; for each chunk the
    requested sections are extracted, their elements read and validated
    against the data model. Results are accumulated in in-memory CSV
    buffers and finally re-read into pandas objects (chunked again if the
    input was chunked).

    Parameters
    ----------
    TextParser : list or pandas.io.parsers.TextFileReader
        The data to extract and read
    read_sections_list : list
        List with subset of data model sections to output
    schema : dict
        Data model schema
    code_tables_path : str
        Path to data model code tables

    Returns
    -------
    data : pandas.DataFrame, pandas.io.parsers.TextFileReader
        Contains the input data extracted and read
    valid : pandas.DataFrame, pandas.io.parsers.TextFileReader
        Contains a boolean mask with the data validation output

    """
    # NOTE(review): assumes TextParser yields at least one chunk; with an
    # empty iterator, out_dtypes and data_df below would be unbound.
    data_buffer = StringIO()
    valid_buffer = StringIO()

    for i_chunk, string_df in enumerate(TextParser):

        # 1. Get a DF with 1 column per section:
        # - only sections requested, ignore rest
        # - requested NA sections as NaN columns
        # - columns(sections) order as in read_sections_list
        sections_df = get_sections.get_sections(string_df, schema, read_sections_list)

        # 2. Read elements from sections: along data chunks, resulting data types
        # may vary if gaps, keep track of data types: add Intxx pandas classes rather than intxx to avoid this!
        # Sections are parsed in the same order as sections_df.columns
        [data_df, valid_df, out_dtypesi ] = read_sections(sections_df, schema)
        if i_chunk == 0:
            out_dtypes = copy.deepcopy(out_dtypesi)

        # Keep the float dtype when any chunk reports one for an element
        # (integer columns with gaps are read back as floats).
        for k in out_dtypesi:
            # BUGFIX: the original tested the dict itself (out_dtypesi) for
            # membership in properties.numpy_floats, which can never match;
            # the element's dtype must be tested instead.
            if out_dtypesi.get(k) in properties.numpy_floats:
                out_dtypes.update({ k:out_dtypesi.get(k) })

        # 3. Validate data elements
        valid_df = validate.validate(data_df, valid_df, schema, code_tables_path)

        # 4. Save to buffer
        data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
        valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)

    # Create the output
    # WE'LL NEED TO POSPROCESS THIS WHEN READING MULTIPLE REPORTS PER LINE, IF EVER...
    data_buffer.seek(0)
    valid_buffer.seek(0)
    logging.info("Wrapping output....")
    # Chunksize from the imported TextParser if it is a pd.io.parsers.TextFileReader
    # (source is either pd.io.parsers.TextFileReader or a file with chunksize specified on input):
    # This way it supports direct chunksize property inheritance if the input source was a pd.io.parsers.TextFileReader
    chunksize = TextParser.orig_options['chunksize'] if isinstance(TextParser,pd.io.parsers.TextFileReader) else None

    # 'datetime' is not a valid pandas dtype: only on output (on reading) will it be converted
    # (via parse_dates) to datetime64[ns] type; cannot specify 'datetime' (of any kind) here:
    # would fail, need to change to 'object' and tell the date parser where it is
    date_columns = [] # Needs to be the numeric index of the column, as seems not to be able to work with tupples....
    for i,element in enumerate(list(out_dtypes)):
        if out_dtypes.get(element) == 'datetime':
            date_columns.append(i)
            out_dtypes.update({element:'object'})

    data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns)
    valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)

    return data, valid

iregon's avatar
iregon committed
125
def validate_arg(arg_name,arg_value,arg_type):
    """

    Validate that an input argument has the expected python type.

    A falsy value (e.g. ``None``, the default for all optional arguments
    checked with this helper) is always accepted.

    Parameters
    ----------
    arg_name : str
        Name of the argument, used in the error message
    arg_value : arg_type
        Value of the argument
    arg_type : python type
        Expected type of the argument

    Returns
    -------
    True,False

    """

    # Nothing to check for unset/falsy values: optional argument not given
    if not arg_value:
        return True
    if isinstance(arg_value,arg_type):
        return True
    logging.error('Argument {0} must be {1}, input type is {2}'.format(arg_name,arg_type,type(arg_value)))
    return False
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
147

iregon's avatar
iregon committed
148
def validate_path(arg_name,arg_value):
    """

    Validate that an input argument points to an existing directory.

    A falsy value (e.g. ``None``, the default for the optional path
    arguments checked with this helper) is always accepted.

    Parameters
    ----------
    arg_name : str
        Name of the argument, used in the error message
    arg_value : str
        Path to validate

    Returns
    -------
    True,False

    """

    # Unset/falsy paths are fine: optional argument not given
    if not arg_value:
        return True
    if os.path.isdir(arg_value):
        return True
    logging.error('{0} could not find path {1}'.format(arg_name,arg_value))
    return False

170 171 172
# END AUX FUNCTIONS -----------------------------------------------------------
        

173
def main(source, data_model = None, data_model_path = None, sections = None,chunksize = None,
         skiprows = None, out_path = None ):
    """

    Reads a data file to a pandas DataFrame using a pre-defined data model.
    Read data is validated against its data model producing a boolean mask
    on output.

    The data model needs to be input to the module as a named model
    (included in the module) or as the path to a valid data model.

    Arguments
    ---------
    source : str
        The file path to read

    Keyword Arguments
    -----------------
    data_model : str, optional
        Name of internally available data model
    data_model_path : str, optional
        Path to external data model
    sections : list, optional
        List with subset of data model sections to output (default is
        all)
    chunksize : int, optional
        Number of reports per chunk (default is
        no chunking)
    skiprows : int, optional
        Number of initial rows to skip from file (default is 0)
    out_path : str, optional
        Path to output data, valid mask and attributes (default is
        no output)

    Returns
    -------
    output : object
        Attributes data, mask and atts contain the corresponding
        information from the data file.

    Note
    ----

    This module can also be run as a script, with the keyword arguments
    as name_arg=arg

    """
    logging.basicConfig(format='%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s',
                    level=logging.INFO,datefmt='%Y%m%d %H:%M:%S',filename=None)

    # 0. VALIDATE INPUT
    # Invalid input logs an error and returns None (no exception raised).
    if not data_model and not data_model_path:
        logging.error('A valid data model name or path to data model must be provided')
        return
    if not os.path.isfile(source):
        logging.error('Can\'t find input data file {}'.format(source))
        return
    if not validate_arg('sections',sections,list):
        return
    if not validate_arg('chunksize',chunksize,int):
        return
    if not validate_arg('skiprows',skiprows,int):
        return
    if not validate_path('data_model_path',data_model_path):
        return
    if not validate_path('out_path',out_path):
        return

    # 1. GET DATA MODEL
    # Schema reader will return empty if cannot read schema or is not valid
    # and will log the corresponding error
    # multiple_reports_per_line error also while reading schema
    logging.info("READING DATA MODEL SCHEMA FILE...")
    schema = schemas.read_schema( schema_name = data_model, ext_schema_path = data_model_path)
    if not schema:
        return
    # Code tables live under the model directory: either the internal
    # library (named model) or the external path given by the caller.
    if data_model:
        model_path = os.path.join(schema_lib,data_model)
    else:
        model_path = data_model_path
    code_tables_path = os.path.join(model_path,'code_tables')


    # 2. READ AND VALIDATE DATA
    imodel = data_model if data_model else data_model_path
    logging.info("EXTRACTING DATA FROM MODEL: {}".format(imodel))

    # 2.1. Subset data model sections to requested sections
    # Default (no sections given) is every section named in the schema's
    # parsing_order; the nested comprehensions flatten that structure.
    parsing_order = schema['header'].get('parsing_order')
    if not sections:
        sections = [ x.get(y) for x in parsing_order for y in x ]
        read_sections_list = [y for x in sections for y in x]
    else:
        read_sections_list = sections

    # 2.2 Homogeneize input data to an iterable with dataframes:
    # a list with a single dataframe or a pd.io.parsers.TextFileReader
    logging.info("Getting data string from source...")
    TextParser = import_data.import_data(source, chunksize = chunksize, skiprows = skiprows)

    # 2.3. Extract, read and validate data in same loop
    logging.info("Extracting and reading sections")
    data,valid = ERV(TextParser,read_sections_list, schema, code_tables_path)

    # 3. CREATE OUTPUT DATA ATTRIBUTES
    # Column names come straight from the DataFrame, or from the reader's
    # orig_options when the output is chunked (TextFileReader).
    logging.info("CREATING OUTPUT DATA ATTRIBUTES FROM DATA MODEL")
    data_columns = [ x for x in data ] if isinstance(data,pd.DataFrame) else data.orig_options['names']
    out_atts = schemas.df_schema(data_columns, schema)

    # 4. OUTPUT TO FILES IF REQUESTED
    if out_path:
        # Wrap a plain DataFrame in a list so the write loop below can
        # treat chunked and unchunked output uniformly.
        enlisted = False
        if not isinstance(data,pd.io.parsers.TextFileReader):
            data = [data]
            valid = [valid]
            enlisted = True
        logging.info('WRITING DATA TO FILES IN: {}'.format(out_path))

        for i, (data_df,valid_df) in enumerate(zip(data,valid)):
            # First chunk creates the files and writes the header; later
            # chunks append without one. Tuple (multi-level) column names
            # are flattened to "a:b" strings for the CSV/JSON output.
            header = False
            mode = 'a'
            if i == 0:
                mode = 'w'
                cols = [ x for x in data_df ]
                if isinstance(cols[0],tuple):
                    header = [":".join(x) for x in cols]
                    out_atts_json = { ":".join(x):out_atts.get(x) for x in out_atts.keys() }
                else:
                    header = cols
                    out_atts_json = out_atts
            data_df.to_csv(os.path.join(out_path,'data.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index')
            valid_df.to_csv(os.path.join(out_path,'mask.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index')
        if enlisted:
            # Undo the list wrapping so callers get the DataFrame back
            data = data[0]
            valid = valid[0]
        else:
            # The TextFileReaders were consumed by the write loop:
            # rebuild fresh ones from their source and original options
            data = pandas_TextParser_hdlr.restore(data.f,data.orig_options)
            valid = pandas_TextParser_hdlr.restore(valid.f,valid.orig_options)
        with open(os.path.join(out_path,'atts.json'),'w') as fileObj:
            json.dump(out_atts_json,fileObj,indent=4)

    # 5. RETURN DATA
    class output():
        """ Class to represent reader output


        Attributes
        ----------
        data : str
            a pandas.DataFrame or pandas.io.parsers.TextFileReader
            with the output data
        atts : dict
            a dictionary with the output data elements attributes
        mask : str
            a pandas.DataFrame or pandas.io.parsers.TextFileReader
            with the output data validation mask

        """

        def __init__(self):
            # Captures data/out_atts/valid from the enclosing scope
            self.data = data
            self.atts = out_atts
            self.mask = valid


    return output()
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
342