read_sections.py 9.14 KB
Newer Older
iregon's avatar
iregon committed
1 2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
iregon's avatar
iregon committed
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
#"""
#Created on Fri Jan 10 13:17:43 2020
#
#Extracts and reads (decodes, scales, etc...) the elements of data sections.
#Each column of the input dataframe is a section with all its elements stored
#as a single string.
#
#Working on a section by section basis, this module uses the data model
#information provided in the schema to split the elements, decode and scale them
#where appropriate and ensure its data type consistency.
#
#Output is a dataframe with columns as follows depending on the data model
#structure:
#    1) Data model with sections (1 or more): [(section0,element0),.......(sectionN,elementM)]
#    2) Data model with no sections: [element0,.......elementM]
#
#
#DEV NOTES:
#1) the 'quoted' issue: in version 1.0:
# # Writing options from quoting on, to prevent supp buoy data from being quoted:
# # maybe this happened because buoy data has commas, and pandas makes its own decision about
# # how to write that.....
# #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
# # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
# sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t")
#
# But we were still experiencing problems when reading fully empty sections, now
# we only write to the section buffer reports that are not empty. We afterwards
# recover the indexes....
#
#@author: iregon
#"""
iregon's avatar
iregon committed
35 36 37

import pandas as pd
from io import StringIO as StringIO
iregon's avatar
iregon committed
38
import csv
iregon's avatar
iregon committed
39 40 41 42

from .. import properties
from ..common.converters import converters
from ..common.decoders import decoders
iregon's avatar
iregon committed
43 44 45 46 47 48 49 50

def extract_fixed_width(section_serie_bf,section_schema):
    """
    Split the fixed-width strings of a section into its elements.

    Parameters
    ----------
    section_serie_bf : file-like object
        Text buffer with one section string per line, passed as is to
        pandas.read_fwf
    section_schema : dict
        Schema of the section. Its 'elements' entry maps the element names
        to their descriptors, from which 'field_length', 'missing_value'
        and 'disable_white_strip' are read here

    Returns
    -------
    section_elements : pandas.DataFrame
        Dataframe with a column per element, read as dtype object, where
        the values declared as missing for each element are NaN
    """
    elements = section_schema['elements']
    section_names = list(elements)
    # Elements with no usable field_length (not set, null or 0) take the
    # maximum report width
    section_widths = [ elements[i].get('field_length') or properties.MAX_FULL_REPORT_WIDTH for i in section_names ]
    section_missing = dict()
    for name, width in zip(section_names, section_widths):
        missing_value = elements[name].get('missing_value')
        if elements[name].get('disable_white_strip') == True:
            section_missing[name] = missing_value
        else:
            # The all-blank field is built with the same width used to read
            # the element: a field_length explicitly set to null in the
            # schema no longer ends up in " "*None
            section_missing[name] = [missing_value, " "*width]
    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths,
                                   header = None, names = section_names,
                                   na_values = section_missing, encoding = 'utf-8',
                                   dtype = 'object', skip_blank_lines = False,
                                   quotechar='\0',escapechar='\0')
    return section_elements

58
def extract_delimited(section_serie_bf,section_schema):
    """
    Split the delimited strings of a section into its elements.

    Parameters
    ----------
    section_serie_bf : file-like object
        Text buffer with one section string per line, passed as is to
        pandas.read_csv
    section_schema : dict
        Schema of the section. The delimiter is read from its 'header'
        entry and the element names and their 'missing_value' from its
        'elements' entry

    Returns
    -------
    section_elements : pandas.DataFrame
        Dataframe with a column per element, read as dtype object, where
        the values declared as missing for each element are NaN
    """
    elements = section_schema['elements']
    column_names = elements.keys()
    na_by_column = dict()
    for name in column_names:
        na_by_column[name] = elements[name].get('missing_value')
    # NUL as quote and escape characters, same as in the fixed width reader
    read_options = {
        'header': None,
        'delimiter': section_schema['header'].get('delimiter'),
        'encoding': 'utf-8',
        'dtype': 'object',
        'skip_blank_lines': False,
        'names': column_names,
        'na_values': na_by_column,
        'quotechar': '\0',
        'escapechar': '\0',
    }
    return pd.read_csv(section_serie_bf, **read_options)

69
def read_data(section_df,section_schema):
    """
    Decode the elements of a section and convert them to their data types.

    The input dataframe is modified in place: every column is replaced by
    its decoded (only if the element has an 'encoding' in the schema) and
    converted values.

    Parameters
    ----------
    section_df : pandas.DataFrame
        Dataframe with a column per section element
    section_schema : dict
        Schema of the section. 'column_type', 'encoding' and the conversion
        arguments of every element are read from its 'elements' entry

    Returns
    -------
    section_df : pandas.DataFrame
        The input dataframe, with its columns decoded and converted
    section_valid : pandas.DataFrame
        Dataframe with the index and columns of section_df. True where the
        element was missing before the conversion or is not missing
        after it
    """
    elements_schema = section_schema['elements']
    columns = section_df.columns
    column_types = { name:elements_schema[name]['column_type'] for name in columns }
    encodings = { name:elements_schema[name]['encoding'] for name in columns if 'encoding' in elements_schema[name] }
    section_valid = pd.DataFrame(index = section_df.index, columns = columns)

    for name, column_type in column_types.items():
        # Remember what was already missing before decoding and converting
        was_missing = section_df[name].isna()
        if name in encodings:
            decode = decoders.get(encodings[name]).get(column_type)
            section_df[name] = decode(section_df[name])
        # Conversion arguments applicable to this data type, as set in the schema
        arg_names = properties.data_type_conversion_args.get(column_type)
        convert_args = { arg:elements_schema[name].get(arg) for arg in arg_names }
        convert = converters.get(column_type)
        section_df[name] = convert(section_df[name], **convert_args)

        section_valid[name] = was_missing | section_df[name].notna()
    return section_df,section_valid
iregon's avatar
iregon committed
86

87
def main(sections_df, schema):
    """
    Split the report sections into their elements and read them to their
    data types.

    Every column of the input is processed on its own. Unless reading is
    disabled in the section header ('disable_read'), its block strings are
    split into elements according to the section 'field_layout'
    ('fixed_width' or 'delimited'), the elements flagged as 'ignore' in the
    schema are dropped and the remaining ones are passed to read_data.
    Sections with reading disabled are output as a single column holding
    the block strings. The input dataframe is not modified.

    Parameters
    ----------
    sections_df : pandas.DataFrame
        Pandas dataframe with a column per report section.
        The sections in the columns as block strings.
    schema : dict
        Data source data model schema

    Returns
    -------
    data : pandas.DataFrame
        Dataframe with the report section elements split
        along the columns. Columns are (section, element) tuples, unless
        the only input column is properties.dummy_level, in which case
        they are the plain element names
    mask : pandas.DataFrame
        Dataframe with the index and columns of data, holding the
        validity flags returned by read_data (all True for sections with
        reading disabled)
    dtypes : dict
        Dictionary with pandas data types for each of the
        output elements, with the same keys as the data columns

    Raises
    ------
    ValueError
        If the 'field_layout' of a section to read is not 'fixed_width'
        or 'delimited'

    """
    section_names = sections_df.columns
    # len != 1 (and not len > 1) so that an input with no columns does not
    # try to access the first column name
    multiindex = len(section_names) != 1 or section_names[0] != properties.dummy_level
    data_df = pd.DataFrame(index = sections_df.index)
    valid_df = pd.DataFrame(index = sections_df.index)
    out_dtypes = dict()

    for section in section_names:
        print('Reading section {}'.format(section))
        section_schema = schema['sections'].get(section)
        header = section_schema.get('header')
        disable_read = header.get('disable_read')

        if not disable_read:
            field_layout = header.get('field_layout')
            # Fail here: otherwise the elements of the previous section
            # would be silently reused (or a NameError raised in the first one)
            if field_layout not in ('fixed_width', 'delimited'):
                raise ValueError('Section {}: unsupported field_layout {}'.format(section, field_layout))
            elements_schema = section_schema['elements']
            ignore = [ i for i in elements_schema.keys() if elements_schema[i].get('ignore') ] # evals to True if set and true, evals to False if not set or set and false
            # Work on a local series so that the caller's dataframe is left untouched
            section_strings = sections_df[section]
            # Get rid of false delimiters in fixed_width. regex = False: the
            # delimiter is a literal string, not a pattern ('|', '.', ...)
            delimiter = header.get('delimiter')
            if delimiter and field_layout == 'fixed_width':
                section_strings = section_strings.str.replace(delimiter, '', regex = False)

            section_buffer = StringIO()
            # Here indices are lost, have to give the real ones back afterwards.
            # Only pass records with data to avoid the hassle of dealing with
            # how the NaN rows are written and then read!
            notna_idx = section_strings[section_strings.notna()].index
            section_strings.loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,quotechar='\0',escapechar='\0',sep=properties.internal_delimiter)
            section_buffer.seek(0)
            # Get the individual elements as objects
            if field_layout == 'fixed_width':
                section_elements_obj = extract_fixed_width(section_buffer,section_schema)
            else:
                section_elements_obj = extract_delimited(section_buffer,section_schema)

            section_elements_obj.drop(ignore, axis = 1, inplace = True)

            # Read the objects to their data types and apply decoding, scaling and so on...
            section_elements, section_valid = read_data(section_elements_obj,section_schema)

            # Give them their actual indexes back
            section_elements.index = notna_idx
            section_valid.index = notna_idx

            # Do the dtypes after removing unwanted elements and before
            # the columns are renamed to (section, element)
            section_dtypes = { i:properties.pandas_dtypes.get(elements_schema[i].get('column_type')) for i in section_elements.columns }
        else:
            section_elements = pd.DataFrame(sections_df[section],columns = [section])
            section_valid = pd.DataFrame(index = section_elements.index,data = True, columns = [section])
            section_dtypes = { section:'object' }

        if multiindex:
            section_elements.columns = [ (section, x) for x in section_elements.columns ]
            section_dtypes = { (section, i):dtype for i, dtype in section_dtypes.items() }
        section_valid.columns = section_elements.columns
        out_dtypes.update(section_dtypes)

        data_df = pd.concat([data_df,section_elements],sort = False,axis=1)
        valid_df = pd.concat([valid_df,section_valid],sort = False,axis=1)

    return data_df, valid_df, out_dtypes