#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""

Manages the integral sequence in data file reading
from a data model:
    - Access to data model
    - Data file import
    - Data file reading
    - Data validation
    - Output

Contains the following functions:
    * ERV - does the actual extraction, reading and validation of the input data
    * main - the main function of the script
    
Can be run as a script with:
    python -m mdf_reader data_file **kwargs
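
or used from Python; a minimal, illustrative sketch (the file path and the
'imma1' model name below are placeholders, not guaranteed to be available):

    from mdf_reader import read

    output = read.main('path/to/data_file', data_model='imma1')
    data, mask, atts = output.data, output.mask, output.atts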
        
"""

import os
import sys
import pandas as pd
import logging
import json
import copy
from io import StringIO as StringIO
import csv

from .data_models import schemas
from . import properties
from .common import pandas_TextParser_hdlr
#from .reader import import_data
#from .reader import get_sections
from mdf_reader.reader import import_data, get_sections, read_sections
from .validator import validate

toolPath = os.path.dirname(os.path.abspath(__file__))
schema_lib = os.path.join(toolPath,'data_models','lib')

# AUX FUNCTIONS ---------------------------------------------------------------
def ERV(TextParser,read_sections_list, schema, code_tables_path):
    """
    
    Extracts, reads and validates the input data.

    
    Parameters
    ----------
    TextParser : list or pandas.io.parsers.TextFileReader
        The data to extract and read
    read_sections_list : list
        List with the subset of data model sections to output
    schema : dict
        Data model schema
    code_tables_path : str
        Path to data model code tables


    Returns
    -------
    data : pandas.DataFrame or pandas.io.parsers.TextFileReader
        Contains the input data extracted and read
    valid : pandas.DataFrame or pandas.io.parsers.TextFileReader
        Contains a boolean mask with the data validation output
        
    """
    
    data_buffer = StringIO()
    valid_buffer = StringIO()

    for i_chunk, string_df in enumerate(TextParser):
        
        # 1. Get a DataFrame with one column per section:
        # - only sections requested, ignore rest
        # - requested NA sections as NaN columns
        # - columns(sections) order as in read_sections_list
        
        sections_df = get_sections.main(string_df, schema, read_sections_list)
        # 2. Read elements from sections
        # Along data chunks, the resulting data types may vary if there are gaps.
        # Tracking dtypes across chunks (v1.0) is now handled by working with pandas nullable integer (Intxx) dtypes.
        # Sections are parsed in the same order as sections_df.columns
        
        data_df, valid_df, out_dtypes = read_sections.main(sections_df, schema)
        # 3. Validate data elements
        
        valid_df = validate.validate(data_df, valid_df, schema, code_tables_path)
        # 4. Save to buffer
        # Quoting options prevent data with special characters (commas, etc.) from being quoted:
        # https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
        data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',
                       index = False,quoting=csv.QUOTE_NONE, sep=properties.internal_delimiter,
                       quotechar='\0',escapechar='\0')
        valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
        
    # Create the output
    # WE'LL NEED TO POSTPROCESS THIS WHEN READING MULTIPLE REPORTS PER LINE, IF EVER...
    data_buffer.seek(0)
    valid_buffer.seek(0)
    logging.info("Wrapping output....")
    # Take the chunksize from the imported TextParser if it is a pd.io.parsers.TextFileReader
    # (the source is either a pd.io.parsers.TextFileReader or a file with chunksize specified on input):
    # this way the chunksize property is inherited directly from the input source
    chunksize = TextParser.orig_options['chunksize'] if isinstance(TextParser,pd.io.parsers.TextFileReader) else None
    
    # 'datetime' is not a valid pandas dtype: it is only converted (via parse_dates) to datetime64[ns] on output (on reading).
    # We cannot specify 'datetime' (of any kind) here: it would fail, so change it to 'object' and tell the date parser where it is.
    date_columns = [] # Needs to be the numeric index of the column, as parse_dates does not seem to work with tuples....
    for i,element in enumerate(list(out_dtypes)):
        if out_dtypes.get(element) == 'datetime':
            date_columns.append(i)
            out_dtypes.update({element:'object'})

    data = pd.read_csv(data_buffer,names = data_df.columns, 
                       chunksize = chunksize, dtype = out_dtypes, 
                       parse_dates = date_columns,
                       delimiter=properties.internal_delimiter,
                       quotechar='\0',escapechar='\0')
    valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)

    return data, valid

def validate_arg(arg_name,arg_value,arg_type):
    """
    
    Validates that an input argument is of the expected type.
    
    Parameters
    ----------
    arg_name : str
    arg_value : arg_type
    arg_type : python type

    Returns
    -------
    True,False
        
    """
    
    if arg_value and not isinstance(arg_value,arg_type):
        logging.error('Argument {0} must be {1}, input type is {2}'.format(arg_name,arg_type,type(arg_value)))
        return False
    else:
        return True

def validate_path(arg_name,arg_value):
    """
    
    Validates that an input argument is an existing directory.
    
    Parameters
    ----------
    arg_name : str
    arg_value : str

    Returns
    -------
    True,False
        
    """
    
    if arg_value and not os.path.isdir(arg_value):
        logging.error('{0} could not find path {1}'.format(arg_name,arg_value))
        return False
    else:
        return True

# END AUX FUNCTIONS -----------------------------------------------------------
        

def main(source, data_model = None, data_model_path = None, sections = None,chunksize = None,
         skiprows = None, out_path = None ):
    """
    
    Reads a data file to a pandas DataFrame using a pre-defined data model.
    Read data is validated against its data model, producing a boolean mask
    on output.
    
    The data model needs to be input to the module as a named model 
    (included in the module) or as the path to a valid data model.
    
    Arguments
    ---------
    source : str
        The file path to read
        
    Keyword Arguments
    -----------------
    data_model : str, optional
        Name of an internally available data model
    data_model_path : str, optional
        Path to an external data model
    sections : list, optional
        List with the subset of data model sections to output (default is
        all)
    chunksize : int, optional
        Number of reports per chunk (default is
        no chunking)
    skiprows : int, optional
        Number of initial rows to skip from the file (default is 0)
    out_path : str, optional
        Path to output data, valid mask and attributes (default is
        no output)
    
    Returns
    -------
    output : object
        Attributes data, mask and atts contain the corresponding
        information from the data file.
       
    Note
    ----
    This module can also be run as a script, with the keyword arguments
    as name_arg=arg

    """
    logging.basicConfig(format='%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s',
                    level=logging.INFO,datefmt='%Y%m%d %H:%M:%S',filename=None)

    # 0. VALIDATE INPUT
    if not data_model and not data_model_path:
        logging.error('A valid data model name or path to data model must be provided')
        return
    if not os.path.isfile(source):
        logging.error('Can\'t find input data file {}'.format(source))
        return
    if not validate_arg('sections',sections,list):
        return
    if not validate_arg('chunksize',chunksize,int):
        return
    if not validate_arg('skiprows',skiprows,int):
        return
    if not validate_path('data_model_path',data_model_path):
        return
    if not validate_path('out_path',out_path):
        return

    # 1. GET DATA MODEL
    # The schema reader will return empty if it cannot read the schema or the schema is not valid,
    # and will log the corresponding error;
    # the multiple_reports_per_line error is also raised while reading the schema
    logging.info("READING DATA MODEL SCHEMA FILE...")
    schema = schemas.read_schema( schema_name = data_model, ext_schema_path = data_model_path)
    if not schema:
        return
    if data_model:
        model_path = os.path.join(schema_lib,data_model)
    else:
        model_path = data_model_path
    code_tables_path = os.path.join(model_path,'code_tables')


    # 2. READ AND VALIDATE DATA
    imodel = data_model if data_model else data_model_path
    logging.info("EXTRACTING DATA FROM MODEL: {}".format(imodel))

    # 2.1. Subset data model sections to requested sections
    encoding = schema['header'].get('encoding')
    parsing_order = schema['header'].get('parsing_order')
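    # parsing_order is expected to be a list of single-key dictionaries mapping a
    # parsing rule to a list of section names, e.g. (illustrative values only):
    #   [{'s': ['core']}, {'o': ['c1', 'c98']}]
    # The two comprehensions below flatten it to ['core', 'c1', 'c98'].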
    if not sections:
        sections = [ x.get(y) for x in parsing_order for y in x ]
        read_sections_list = [y for x in sections for y in x]
    else:
        read_sections_list = sections

    # 2.2. Homogenize the input data to an iterable of DataFrames:
    # a list with a single dataframe or a pd.io.parsers.TextFileReader
    logging.info("Getting data string from source...")
    TextParser = import_data.main(source, encoding=encoding, chunksize = chunksize, skiprows = skiprows)

    # 2.3. Extract, read and validate data in same loop
    logging.info("Extracting and reading sections")
    data,valid = ERV(TextParser,read_sections_list, schema, code_tables_path)

    # 3. CREATE OUTPUT DATA ATTRIBUTES 
    logging.info("CREATING OUTPUT DATA ATTRIBUTES FROM DATA MODEL")
    data_columns = [ x for x in data ] if isinstance(data,pd.DataFrame) else data.orig_options['names']
    out_atts = schemas.df_schema(data_columns, schema)

    # 4. OUTPUT TO FILES IF REQUESTED
    if out_path:
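        # Writes data.csv, mask.csv and atts.json to out_path; data and mask rows are appended chunk by chunk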
        enlisted = False
        if not isinstance(data,pd.io.parsers.TextFileReader):
            data = [data]
            valid = [valid]
            enlisted = True
        logging.info('WRITING DATA TO FILES IN: {}'.format(out_path))

        for i, (data_df,valid_df) in enumerate(zip(data,valid)):
            header = False
            mode = 'a'
            if i == 0:
                mode = 'w'
                cols = [ x for x in data_df ]
                if isinstance(cols[0],tuple):
                    header = [":".join(x) for x in cols]
                    out_atts_json = { ":".join(x):out_atts.get(x) for x in out_atts.keys() }
                else:
                    header = cols
                    out_atts_json = out_atts
            kwargs = {'header' : header, 'mode' : mode, 'encoding' : 'utf-8','index' : True, 'index_label' : 'index','quotechar':'\0','escapechar':'\0'}
            data_df.to_csv(os.path.join(out_path,'data.csv'), **kwargs)
            valid_df.to_csv(os.path.join(out_path,'mask.csv'), **kwargs)
        if enlisted:
            data = data[0]
            valid = valid[0]
        else:
            data = pandas_TextParser_hdlr.restore(data.f,data.orig_options)
            valid = pandas_TextParser_hdlr.restore(valid.f,valid.orig_options)
        with open(os.path.join(out_path,'atts.json'),'w') as fileObj:
            json.dump(out_atts_json,fileObj,indent=4)

    # 5. RETURN DATA
    class output():
        """ Class to represent reader output
    
    
        Attributes
        ----------
        data : pandas.DataFrame or pandas.io.parsers.TextFileReader
            the output data
        atts : dict
            a dictionary with the output data elements attributes
        mask : pandas.DataFrame or pandas.io.parsers.TextFileReader
            the output data validation mask
        
        """
        
        def __init__(self):
            self.data = data
            self.atts = out_atts
            self.mask = valid
        
    
    return output()
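

# A minimal sketch of the command-line entry point implied by the docstrings above
# ("python -m mdf_reader data_file" with keyword arguments given as name_arg=arg).
# Illustrative only: the packaged entry point may live elsewhere (e.g. in __main__.py)
# and differ in detail; the key=value parsing below is an assumption.
if __name__ == '__main__':
    kwargs = dict(arg.split('=', 1) for arg in sys.argv[2:])
    if 'sections' in kwargs:
        kwargs['sections'] = [x.strip() for x in kwargs['sections'].split(',')]
    for int_arg in ('chunksize', 'skiprows'):
        if int_arg in kwargs:
            kwargs[int_arg] = int(kwargs[int_arg])
    main(sys.argv[1], **kwargs)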