read_sections.py 8.41 KB
Newer Older
iregon's avatar
iregon committed
1 2 3 4 5
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:17:43 2020

Extracts and reads (decodes, scales, etc...) the elements of data sections.
Each column of the input dataframe is a section with all its elements stored
as a single string.

Working on a section by section basis, this module uses the data model
information provided in the schema to split the elements, decode and scale them
where appropriate and ensure their data type consistency.

Output is a dataframe with columns as follows depending on the data model
structure:
    1) Data model with sections (1 or more): [(section0,element0),.......(sectionN,elementN)]
    2) Data model with no sections: [element0...elementN]


DEV NOTES:
1) the 'quoted' issue: in version 1.0:
 # Writing options from quoting on to prevent supp buoy data to be quoted:
 # maybe this happened because buoy data has commas, and pandas makes its own decision about
 # how to write that.....
 #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
 # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
 sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t")   

 But we were still experiencing problems when reading fully empty sections, so now
 we only write to the section buffer the reports that are not empty. Afterwards
 we recover the indexes....
    
@author: iregon
"""

import pandas as pd
from io import StringIO as StringIO
iregon's avatar
iregon committed
38 39 40 41

from .. import properties
from ..common.converters import converters
from ..common.decoders import decoders
iregon's avatar
iregon committed
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67

def extract_fixed_width(section_serie_bf,section_schema):
    """
    Split the strings of a fixed-width section into its elements.

    Parameters
    ----------
    section_serie_bf : file-like
        Text buffer with one section string per line.
    section_schema : dict
        Schema of the section. ``section_schema['elements']`` maps every
        element name to its descriptors; the ones read here are
        ``field_length``, ``missing_value`` and ``disable_white_strip``.

    Returns
    -------
    pandas.DataFrame
        One column per schema element, in schema order, read as ``object``.
    """
    elements = section_schema['elements']
    # Materialise the names so pandas always gets an ordered, indexable list
    section_names = list(elements)
    section_widths = []
    section_missing = {}
    for name in section_names:
        descriptors = elements[name]
        # An unset (or empty) field_length means "take the rest of the report".
        # The same width is used for the all-blank missing marker below so
        # both always agree, also when field_length is present but None.
        width = descriptors.get('field_length') or properties.MAX_FULL_REPORT_WIDTH
        section_widths.append(width)
        missing_value = descriptors.get('missing_value')
        if descriptors.get('disable_white_strip') == True:  # noqa: E712
            section_missing[name] = missing_value
        else:
            # Unless white stripping is disabled, a field made only of
            # blanks is also read as missing
            section_missing[name] = [missing_value, " " * width]
    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None,
                                   names = section_names, na_values = section_missing,
                                   delimiter = "\t", encoding = 'utf-8', dtype = 'object',
                                   skip_blank_lines = False)
    return section_elements

def extract_delimited(section_serie_bf,section_schema):
    """
    Split the strings of a delimited section into its elements.

    Parameters
    ----------
    section_serie_bf : file-like
        Text buffer with one section string per line.
    section_schema : dict
        Schema of the section: the field separator is taken from
        ``section_schema['header']['delimiter']`` and the element names and
        their ``missing_value`` from ``section_schema['elements']``.

    Returns
    -------
    pandas.DataFrame
        One column per schema element, read as ``object``.
    """
    field_separator = section_schema['header'].get('delimiter')
    element_specs = section_schema['elements']
    column_names = element_specs.keys()

    # Per-column marker to be read as missing
    na_by_column = {}
    for name in column_names:
        na_by_column[name] = element_specs[name].get('missing_value')

    return pd.read_csv(
        section_serie_bf,
        header = None,
        names = column_names,
        delimiter = field_separator,
        na_values = na_by_column,
        dtype = 'object',
        skip_blank_lines = False,
        encoding = 'utf-8',
    )

def read_data(section_df,section_schema):
    """
    Decode and convert the elements of a section to their data types.

    Parameters
    ----------
    section_df : pandas.DataFrame
        One column per element of the section. The columns are overwritten
        in place with the decoded / converted values.
    section_schema : dict
        Schema of the section. For every column,
        ``section_schema['elements'][column]`` must define ``column_type``;
        ``encoding`` is optional and triggers decoding before conversion.

    Returns
    -------
    section_df : pandas.DataFrame
        The input frame, with its columns converted.
    section_valid : pandas.DataFrame
        Same index and columns as ``section_df``. An entry is True where the
        value was already missing on input or is not missing after
        conversion, i.e. False flags values lost while decoding / converting.
    """
    elements_schema = section_schema['elements']
    section_names = section_df.columns
    section_dtypes = { name:elements_schema[name]['column_type'] for name in section_names }
    section_encoding = { name:elements_schema[name]['encoding'] for name in section_names
                         if 'encoding' in elements_schema[name] }
    section_valid = pd.DataFrame(index = section_df.index, columns = section_df.columns)

    for element, column_type in section_dtypes.items():
        # Remember what was missing before touching the column
        missing = section_df[element].isna()
        if element in section_encoding:
            decoder = decoders.get(section_encoding[element]).get(column_type)
            section_df[element] = decoder(section_df[element])

        # Converter arguments are looked up by name in the element schema
        kwargs = { converter_arg:elements_schema[element].get(converter_arg)
                   for converter_arg in properties.data_type_conversion_args.get(column_type) }
        section_df[element] = converters.get(column_type)(section_df[element], **kwargs)

        section_valid[element] = missing | section_df[element].notna()

    return section_df,section_valid
iregon's avatar
iregon committed
81 82 83 84

def read_sections(sections_df, schema):
    """
    Extract and read the elements of every section in ``sections_df``.

    Parameters
    ----------
    sections_df : pandas.DataFrame
        One column per section, each holding the full section as a single
        string per report (missing where the report lacks the section).
        The frame is not modified.
    schema : dict
        Data model schema; ``schema['sections'][section]`` provides the
        ``header`` (``disable_read``, ``field_layout``, ``delimiter``) and
        the ``elements`` of each section.

    Returns
    -------
    data_df : pandas.DataFrame
        The elements of all sections, indexed like ``sections_df``. Columns
        are ``(section, element)`` tuples, or plain element names when the
        only input column is ``properties.dummy_level``. Sections with
        ``disable_read`` set are passed through unparsed in a single column.
    valid_df : pandas.DataFrame
        Validity flags for ``data_df``, with the same columns.
    out_dtypes : dict
        Data type name for every column of ``data_df``.

    Raises
    ------
    ValueError
        If a section to be read has a ``field_layout`` other than
        ``fixed_width`` or ``delimited``.
    """
    multiindex = len(sections_df.columns) > 1 or sections_df.columns[0] != properties.dummy_level
    data_df = pd.DataFrame(index = sections_df.index)
    valid_df = pd.DataFrame(index = sections_df.index)
    out_dtypes = dict()

    for section in sections_df.columns:
        print('Reading section {}'.format(section))
        section_schema = schema['sections'].get(section)
        header = section_schema.get('header')
        section_strings = sections_df[section]

        if not header.get('disable_read'):
            field_layout = header.get('field_layout')
            if field_layout not in ('fixed_width', 'delimited'):
                # Fail here instead of silently reusing the elements of the
                # previously processed section
                raise ValueError('Section {}: unsupported field_layout {!r}'.format(section, field_layout))
            # Evals to True if set and true, to False if not set or set and false
            ignore = [ i for i in section_schema['elements'].keys() if section_schema['elements'][i].get('ignore') ]
            # Get rid of false delimiters in fixed_width. The delimiter is
            # removed literally (regex = False), and on a new series so the
            # caller's frame is left untouched.
            delimiter = header.get('delimiter')
            if delimiter and field_layout == 'fixed_width':
                section_strings = section_strings.str.replace(delimiter, '', regex = False)

            # Only pass records with data to avoid the hassle of dealing with
            # how the NaN rows are written and then read! The positional
            # index of the buffer is replaced by the real one further down.
            section_present = section_strings[section_strings.notna()]
            notna_idx = section_present.index
            section_buffer = StringIO()
            section_present.to_csv(section_buffer, header = False, encoding = 'utf-8', index = False)
            section_buffer.seek(0)

            # Get the individual elements as objects
            if field_layout == 'fixed_width':
                section_elements_obj = extract_fixed_width(section_buffer, section_schema)
            else:
                section_elements_obj = extract_delimited(section_buffer, section_schema)
            section_elements_obj.drop(ignore, axis = 1, inplace = True)

            # Read the objects to their data types and apply decoding,
            # scaling and so on...
            section_elements, section_valid = read_data(section_elements_obj, section_schema)
            # Give them their actual indexes back
            section_elements.index = notna_idx
            section_valid.index = notna_idx
        else:
            # Unread sections are passed through as a single, always valid, column
            section_elements = pd.DataFrame(section_strings, columns = [section])
            section_valid = pd.DataFrame(index = section_elements.index, data = True, columns = [section])

        if multiindex:
            section_elements.columns = [ (section, x) for x in section_elements.columns ]
        section_valid.columns = section_elements.columns
        data_df = pd.concat([data_df, section_elements], sort = False, axis = 1)
        valid_df = pd.concat([valid_df, section_valid], sort = False, axis = 1)

    # We do the actual out_dtypes here: because the full indexing occurs only
    # after concat, NaN values may arise only in data_df if a section is
    # not existing in a given report!
    for section in sections_df.columns:
        section_schema = schema['sections'].get(section)
        if section_schema.get('header').get('disable_read'):
            out_dtypes[(section, section) if multiindex else section] = 'object'
            continue

        if multiindex:
            elements = [ x[1] for x in data_df.columns if x[0] == section ]
        else:
            # Without sections there is a single (dummy) section that owns
            # every column of data_df
            elements = list(data_df.columns)

        for element in elements:
            column = (section, element) if multiindex else element
            out_dtypes[column] = properties.pandas_dtypes.get(section_schema['elements'][element].get('column_type'))
            # Columns that ended up as numpy floats keep their actual dtype
            actual_dtype = data_df[column].dtype.name
            if actual_dtype in properties.numpy_floats:
                out_dtypes[column] = actual_dtype

    return data_df, valid_df, out_dtypes