#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Manages the complete sequence of reading a data file
using a data model:
    - Access to data model
    - Data file import
    - Data file reading
    - Data validation
    - Output

Contains the following functions:
    * ERV - does the actual extraction, reading and validation of the input data
    * main - the main function of the script
    
Can be run as a script with:
    python -m mdf_reader data_file **kwargs
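
A hypothetical invocation (the file and data model names are placeholders):
    python -m mdf_reader ./my_data_file.imma data_model=imma1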
        
"""

import os
import sys
import pandas as pd
import logging
import json
import copy
from io import StringIO as StringIO
import csv

from .data_models import schemas
from . import properties
from .common import pandas_TextParser_hdlr
from mdf_reader.reader import import_data, get_sections, read_sections
from .validator import validate

toolPath = os.path.dirname(os.path.abspath(__file__))
schema_lib = os.path.join(toolPath,'data_models','lib')

# AUX FUNCTIONS ---------------------------------------------------------------
def ERV(TextParser,read_sections_list, schema, code_tables_path):
    """
    
    Extracts, reads and validates the input data.

    
    Parameters
    ----------
    TextParser : list or pandas.io.parsers.TextFileReader
        The data to extract and read
    read_sections_list : list
        List with subset of data model sections to output 
    schema : dict
        Data model schema
    code_tables_path : str
        Path to data model code tables


    Returns
    -------
    data : pandas.DataFrame or pandas.io.parsers.TextFileReader
        Contains the input data extracted and read
    valid : pandas.DataFrame or pandas.io.parsers.TextFileReader
        Contains a boolean mask with the data validation output
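
    Notes
    -----
    A minimal consumption sketch, assuming the source was read in chunks
    (the section name is illustrative)::

        data, valid = ERV(TextParser, ['core'], schema, code_tables_path)
        for data_df, valid_df in zip(data, valid):
            clean_df = data_df.where(valid_df)  # mask out invalid elements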
        
    """
    
    data_buffer = StringIO()
    valid_buffer = StringIO()

    for i_chunk, string_df in enumerate(TextParser):
        
        # 1. Get a DF with one column per section:
        # - only sections requested, ignore rest
        # - requested NA sections as NaN columns
        # - column (section) order as in read_sections_list
        
        sections_df = get_sections.main(string_df, schema, read_sections_list)
        # 2. Read elements from sections
        # Along data chunks, the resulting data types may vary if there are gaps;
        # v1.0 kept track of data dtypes. This has now been solved by working
        # with pandas Intxx dtypes (nullable integers)
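        # (e.g. dtype 'Int64' can hold missing values, unlike plain 'int64')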
        # Sections are parsed in the same order as sections_df.columns
        
        [data_df, valid_df, out_dtypes ] = read_sections.main(sections_df, schema)
        # 3. Validate data elements
        
        valid_df = validate.validate(data_df, valid_df, schema, code_tables_path)
        # 4. Save to buffer
        # Writing options (quoting and escapechar) prevent data with special characters, like commas, from being quoted:
        # https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
        data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
        valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
        
    # Create the output
    # WE'LL NEED TO POSTPROCESS THIS WHEN READING MULTIPLE REPORTS PER LINE, IF EVER...
    data_buffer.seek(0)
    valid_buffer.seek(0)
    logging.info("Wrapping output....")
    # Take the chunksize from the imported TextParser if it is a pd.io.parsers.TextFileReader
    # (the source is either a pd.io.parsers.TextFileReader or a file with chunksize specified on input):
    # this way the chunksize property is inherited directly when the input source was a pd.io.parsers.TextFileReader
    chunksize = TextParser.orig_options['chunksize'] if isinstance(TextParser,pd.io.parsers.TextFileReader) else None
    
    # 'datetime' is not a valid pandas dtype: only on output (on reading) is it converted
    # (via parse_dates) to the datetime64[ns] type. We cannot specify 'datetime' (of any kind) here:
    # it would fail, so we change it to 'object' and tell the date parser where it is
    date_columns = [] # Needs to be the numeric index of the column, as parse_dates does not seem to work with tuples...
    for i,element in enumerate(list(out_dtypes)):
        if out_dtypes.get(element) == 'datetime':
            date_columns.append(i)
            out_dtypes.update({element:'object'})

    data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns,delimiter=properties.internal_delimiter)
    valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)

    return data, valid

def validate_arg(arg_name,arg_value,arg_type):
    """
    
    Validates that an input argument is of the expected type
    
    Parameters
    ----------
    arg_name : str
    arg_value : arg_type
    arg_type : python type

    Returns
    -------
    bool
        True if arg_value is of type arg_type or is not provided, False otherwise
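
    Examples
    --------
    Illustrative argument values:

    >>> validate_arg('chunksize', 10000, int)
    True
    >>> validate_arg('sections', 'core', list)
    False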
        
    """
    
    if arg_value and not isinstance(arg_value,arg_type):
        logging.error('Argument {0} must be {1}, input type is {2}'.format(arg_name,arg_type,type(arg_value)))
        return False
    else:
        return True

def validate_path(arg_name,arg_value):
    """
    
    Validates that an input argument is an existing directory
    
    Parameters
    ----------
    arg_name : str
    arg_value : str

    Returns
    -------
    bool
        True if arg_value is an existing directory or is not provided, False otherwise
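
    Examples
    --------
    Illustrative argument values:

    >>> validate_path('out_path', None)
    True
    >>> validate_path('out_path', '/path/that/does/not/exist')
    False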
        
    """
    
    if arg_value and not os.path.isdir(arg_value):
        logging.error('{0} could not find path {1}'.format(arg_name,arg_value))
        return False
    else:
        return True

# END AUX FUNCTIONS -----------------------------------------------------------
        

def main(source, data_model = None, data_model_path = None, sections = None,chunksize = None,
         skiprows = None, out_path = None ):
    """
    
    Reads a data file into a pandas DataFrame using a pre-defined data model.
    The data read is validated against its data model, producing a boolean mask
    on output.
    
    The data model needs to be input to the module as a named model 
    (included in the module) or as the path to a valid data model.
    
    Arguments
    ---------
    source : str
        The file path to read
        
    Keyword Arguments
    -----------------
    data_model : str, optional
        Name of internally available data model 
    data_model_path : str, optional
        Path to external data model 
    sections : list, optional
        List with subset of data model sections to output (default is
        all)
    chunksize : int, optional
        Number of reports per chunk (default is
        no chunking)    
    skiprows : int, optional
        Number of initial rows to skip from file (default is 0)
    out_path : str, optional
        Path to output data, valid mask and attributes (default is
        no output)
    
    Returns
    -------
    output : object
        Attributes data, mask and atts contain the corresponding
        information from the data file.
       
    Note
    ----
    
    This module can also be run as a script, with the keyword arguments
    as name_arg=arg
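
    Examples
    --------
    A minimal sketch, assuming the package is imported as mdf_reader and that
    'imma1' stands for any named data model available to it (file and model
    names below are illustrative)::

        from mdf_reader import read
        out = read.main('my_data_file.imma', data_model='imma1')
        out.data   # data as pandas.DataFrame (or TextFileReader if chunked)
        out.mask   # boolean validation mask with the same layout
        out.atts   # dictionary with the data elements attributes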
    
    
        
    
    """
    logging.basicConfig(format='%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s',
                    level=logging.INFO,datefmt='%Y%m%d %H:%M:%S',filename=None)

    # 0. VALIDATE INPUT
    if not data_model and not data_model_path:
        logging.error('A valid data model name or path to data model must be provided')
        return
    if not os.path.isfile(source):
        logging.error('Can\'t find input data file {}'.format(source))
        return
    if not validate_arg('sections',sections,list):
        return
    if not validate_arg('chunksize',chunksize,int):
        return
    if not validate_arg('skiprows',skiprows,int):
        return
    if not validate_path('data_model_path',data_model_path):
        return
    if not validate_path('out_path',out_path):
        return

    # 1. GET DATA MODEL
    # The schema reader returns empty if the schema cannot be read or is not valid,
    # and logs the corresponding error
    # (the multiple_reports_per_line error is also raised while reading the schema)
    logging.info("READING DATA MODEL SCHEMA FILE...")
    schema = schemas.read_schema( schema_name = data_model, ext_schema_path = data_model_path)
    if not schema:
        return
    if data_model:
        model_path = os.path.join(schema_lib,data_model)
    else:
        model_path = data_model_path
    code_tables_path = os.path.join(model_path,'code_tables')


    # 2. READ AND VALIDATE DATA
    imodel = data_model if data_model else data_model_path
    logging.info("EXTRACTING DATA FROM MODEL: {}".format(imodel))

    # 2.1. Subset data model sections to requested sections
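    # parsing_order in the schema header is expected to be a list of dictionaries
    # whose values are lists of section names, e.g. (names illustrative):
    #   [{'s': ['core']}, {'o': ['c1', 'c5']}]  ->  read_sections_list = ['core', 'c1', 'c5']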
    parsing_order = schema['header'].get('parsing_order')
    if not sections:
        sections = [ x.get(y) for x in parsing_order for y in x ]
        read_sections_list = [y for x in sections for y in x]
    else:
        read_sections_list = sections

    # 2.2. Homogenize input data to an iterable of dataframes:
    # a list with a single dataframe or a pd.io.parsers.TextFileReader
    logging.info("Getting data string from source...")
    TextParser = import_data.main(source, chunksize = chunksize, skiprows = skiprows)

    # 2.3. Extract, read and validate data in the same loop
    logging.info("Extracting and reading sections")
    data,valid = ERV(TextParser,read_sections_list, schema, code_tables_path)

    # 3. CREATE OUTPUT DATA ATTRIBUTES 
    logging.info("CREATING OUTPUT DATA ATTRIBUTES FROM DATA MODEL")
    data_columns = [ x for x in data ] if isinstance(data,pd.DataFrame) else data.orig_options['names']
    out_atts = schemas.df_schema(data_columns, schema)

    # 4. OUTPUT TO FILES IF REQUESTED
    if out_path:
        enlisted = False
        if not isinstance(data,pd.io.parsers.TextFileReader):
            data = [data]
            valid = [valid]
            enlisted = True
        logging.info('WRITING DATA TO FILES IN: {}'.format(out_path))

        for i, (data_df,valid_df) in enumerate(zip(data,valid)):
            header = False
            mode = 'a'
            if i == 0:
                mode = 'w'
                cols = [ x for x in data_df ]
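                # Multi-section output carries tuple column names; flatten them
                # for the csv/json headers, e.g. ('core', 'YR') -> 'core:YR'
                # (section and element names illustrative)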
                if isinstance(cols[0],tuple):
                    header = [":".join(x) for x in cols]
                    out_atts_json = { ":".join(x):out_atts.get(x) for x in out_atts.keys() }
                else:
                    header = cols
                    out_atts_json = out_atts
            data_df.to_csv(os.path.join(out_path,'data.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index')
            valid_df.to_csv(os.path.join(out_path,'mask.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index')
        if enlisted:
            data = data[0]
            valid = valid[0]
        else:
            data = pandas_TextParser_hdlr.restore(data.f,data.orig_options)
            valid = pandas_TextParser_hdlr.restore(valid.f,valid.orig_options)
        with open(os.path.join(out_path,'atts.json'),'w') as fileObj:
            json.dump(out_atts_json,fileObj,indent=4)

    # 5. RETURN DATA
    class output():
        """ Class to represent reader output
    
    
        Attributes
        ----------
        data : pandas.DataFrame or pandas.io.parsers.TextFileReader
            the output data
        atts : dict
            a dictionary with the output data elements attributes
        mask : pandas.DataFrame or pandas.io.parsers.TextFileReader
            the output data validation mask
        
        """
        
330 331 332 333 334 335 336
        def __init__(self):
            self.data = data
            self.atts = out_atts
            self.mask = valid
        
    
    return output()