read_sections.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:17:43 2020

Extracts and reads (decodes, scales, etc...) the elements of data sections.
Each column of the input dataframe is a section with all its elements stored
as a single string.

Working on a section by section basis, this module uses the data model
information provided in the schema to split the elements, decode and scale them
where appropriate and ensure their data type consistency.

Output is a dataframe with columns as follows depending on the data model
structure:
    1) Data model with sections (1 or more): [(section0,element0),.......(sectionN,elementN)]
    2) Data model with no sections: [element0...elementN]


DEV NOTES:
1) the 'quoted' issue: in version 1.0:
 # Writing options from quoting on to prevent supp buoy data from being quoted:
 # maybe this happened because buoy data has commas, and pandas makes its own decision about
 # how to write that.....
 #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
 # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
 sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t")   

 But we were still experiencing problems when reading fully empty sections, so now
 we only write to the section buffer the reports that are not empty. Afterwards we
 recover the indexes....
    
@author: iregon
"""

import pandas as pd
from io import StringIO as StringIO

from .. import properties
from ..common.converters import converters
from ..common.decoders import decoders

def extract_fixed_width(section_serie_bf,section_schema):
    """
    Split a fixed-width section into its elements, all read as objects.

    Parameters
    ----------
    section_serie_bf : file-like
        Text buffer holding one section string per line.
    section_schema : dict
        Section schema. ``section_schema['elements']`` maps every element
        name to its descriptors; ``field_length``, ``missing_value`` and
        ``disable_white_strip`` are the ones used here.

    Returns
    -------
    pandas.DataFrame
        One column per element (``dtype='object'``), in schema order.
    """
    elements_schema = section_schema['elements']
    # A plain list keeps the column order and is accepted everywhere pandas
    # expects an ordered collection of names.
    section_names = list(elements_schema)

    def effective_width(name):
        # An absent, null or zero field_length falls back to the maximum
        # report width. The same value is used for the column width and for
        # the all-blank missing marker, so a null field_length can no longer
        # end up in a `" " * None` multiplication.
        return elements_schema[name].get('field_length') or properties.MAX_FULL_REPORT_WIDTH

    section_widths = [ effective_width(name) for name in section_names ]

    section_missing = dict()
    for name, width in zip(section_names, section_widths):
        missing_value = elements_schema[name].get('missing_value')
        if elements_schema[name].get('disable_white_strip') == True:
            # Blanks are kept as data: only the declared missing value is NaN
            section_missing[name] = missing_value
        else:
            # A field made only of blanks is also read as missing
            section_missing[name] = [missing_value, " "*width]

    # NOTE(review): delimiter="\t" presumably stops read_fwf from treating
    # blanks as filler characters, so that all-blank fields reach the
    # na_values check above - confirm against the pandas version in use.
    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None,
                                   names = section_names, na_values = section_missing,
                                   delimiter = "\t", encoding = 'utf-8', dtype = 'object',
                                   skip_blank_lines = False)
    return section_elements

def extract_delimited(section_serie_bf,section_schema):
    """
    Split a delimited section into its elements, all read as objects.

    The field separator is taken from ``section_schema['header']['delimiter']``
    and the column names from the keys of ``section_schema['elements']``.
    Each element's ``missing_value`` (if declared) is read as NaN for that
    column only.

    Parameters
    ----------
    section_serie_bf : file-like
        Text buffer holding one section string per line.
    section_schema : dict
        Section schema with ``header`` and ``elements`` entries.

    Returns
    -------
    pandas.DataFrame
        One column per element (``dtype='object'``).
    """
    elements_schema = section_schema['elements']
    column_names = elements_schema.keys()

    # Per-column missing value markers
    na_by_column = dict()
    for name in column_names:
        na_by_column[name] = elements_schema[name].get('missing_value')

    read_options = dict(header = None,
                        delimiter = section_schema['header'].get('delimiter'),
                        encoding = 'utf-8',
                        dtype = 'object',
                        skip_blank_lines = False,
                        names = column_names,
                        na_values = na_by_column)
    return pd.read_csv(section_serie_bf, **read_options)

def read_data(section_df,section_schema):
    """
    Decode and convert the elements of a section to their declared types.

    For every column of ``section_df`` the schema ``column_type`` is looked
    up. If the element declares an ``encoding``, the matching decoder
    (selected by encoding, then by column type) is applied first. The
    converter registered for the column type is then applied, receiving as
    keyword arguments the schema values of the names listed for that type in
    ``properties.data_type_conversion_args``.

    Parameters
    ----------
    section_df : pandas.DataFrame
        Section elements, one column per element. It is modified in place.
    section_schema : dict
        Section schema; ``section_schema['elements'][<name>]`` must hold at
        least ``column_type`` for every column of ``section_df``.

    Returns
    -------
    section_df : pandas.DataFrame
        The same object passed in, with its columns decoded and converted.
    section_valid : pandas.DataFrame
        Same index and columns as ``section_df``. A cell is True when the
        value was already missing before decoding/conversion, or when it is
        not NaN afterwards.
    """
    elements_schema = section_schema['elements']
    columns = section_df.columns
    column_types = { name:elements_schema[name]['column_type'] for name in columns }
    encodings = { name:elements_schema[name]['encoding'] for name in columns if 'encoding' in elements_schema[name] }
    section_valid = pd.DataFrame(index = section_df.index, columns = section_df.columns)

    for name, column_type in column_types.items():
        # Remember what was missing on input: those cells are not flagged
        # as invalid later on
        was_missing = section_df[name].isna()

        if name in encodings:
            decode = decoders.get(encodings[name]).get(column_type)
            section_df[name] = decode(section_df[name])

        converter_kwargs = dict()
        for arg_name in properties.data_type_conversion_args.get(column_type):
            converter_kwargs[arg_name] = elements_schema[name].get(arg_name)
        convert = converters.get(column_type)
        section_df[name] = convert(section_df[name], **converter_kwargs)

        section_valid[name] = was_missing | section_df[name].notna()

    return section_df,section_valid

def read_sections(sections_df, schema):
    """
    Read every section of ``sections_df`` into its individual elements.

    Parameters
    ----------
    sections_df : pandas.DataFrame
        One column per section; every cell holds all the elements of that
        section for a report as a single string (NaN if the report does not
        have the section). The frame is not modified.
    schema : dict
        Data model schema. ``schema['sections'][<section>]`` must provide a
        ``header`` dict (``disable_read``, ``field_layout`` and ``delimiter``
        are used here) and, for sections that are read, an ``elements`` dict.

    Returns
    -------
    data_df : pandas.DataFrame
        Indexed like ``sections_df``. Columns are ``(section, element)``
        tuples, or plain element names when the only section is
        ``properties.dummy_level``. Sections with ``disable_read`` set are
        passed through unparsed as one column.
    valid_df : pandas.DataFrame
        Validity flags with the same columns as ``data_df``.
    out_dtypes : dict
        Maps every column of ``data_df`` to a dtype name: ``'object'`` for
        unparsed sections; otherwise the ``properties.pandas_dtypes`` entry
        for the element ``column_type``, replaced by the actual column dtype
        when that is one of ``properties.numpy_floats``.

    Raises
    ------
    ValueError
        If a section that has to be read declares a ``field_layout`` other
        than ``'fixed_width'`` or ``'delimited'``.
    """
    multiindex = len(sections_df.columns) > 1 or sections_df.columns[0] != properties.dummy_level
    data_df = pd.DataFrame(index = sections_df.index)
    valid_df = pd.DataFrame(index = sections_df.index)
    out_dtypes = dict()

    for section in sections_df.columns:
        print('Reading section {}'.format(section))
        section_schema = schema['sections'].get(section)
        header = section_schema.get('header')

        if not header.get('disable_read'):
            field_layout = header.get('field_layout')
            # Fail early and explicitly: otherwise the elements frame would
            # be undefined, or silently reused from the previous section.
            if field_layout not in ('fixed_width', 'delimited'):
                raise ValueError("Section {}: unsupported field_layout {!r}, expected 'fixed_width' or 'delimited'".format(section, field_layout))

            elements_schema = section_schema['elements']
            # evals to True if set and true, evals to False if not set or set and false
            ignore = [ i for i in elements_schema if elements_schema[i].get('ignore') ]

            # Work on a local series so the caller's frame is left untouched
            section_strings = sections_df[section]
            # Get rid of false delimiters in fixed_width. The delimiter is a
            # literal string, never a regular expression.
            delimiter = header.get('delimiter')
            if delimiter and field_layout == 'fixed_width':
                section_strings = section_strings.str.replace(delimiter, '', regex = False)

            # Only pass records with data to avoid the hassle of dealing with
            # how the NaN rows are written and then read. Indices are lost in
            # the buffer, so they are kept here to be restored afterwards.
            has_data = section_strings.notna()
            notna_idx = section_strings[has_data].index
            # Records are written raw, one per line: going through a CSV
            # writer would quote any record holding a comma or a double
            # quote, shifting fixed-width columns and collapsing
            # comma-delimited sections into a single field.
            section_buffer = StringIO(''.join('{}\n'.format(record) for record in section_strings[has_data]))

            # Get the individual elements as objects
            if field_layout == 'fixed_width':
                section_elements_obj = extract_fixed_width(section_buffer,section_schema)
            else:
                section_elements_obj = extract_delimited(section_buffer,section_schema)

            section_elements_obj.drop(ignore, axis = 1, inplace = True)

            # Read the objects to their data types and apply decoding, scaling and so on...
            # Give them their actual indexes back
            section_elements, section_valid = read_data(section_elements_obj,section_schema)
            section_elements.index = notna_idx
            section_valid.index = notna_idx

        else:
            # Section is not parsed: pass it through as a single column
            section_elements = pd.DataFrame(sections_df[section],columns = [section])
            section_valid = pd.DataFrame(index = section_elements.index,data = True, columns = [section])

        if multiindex:
            section_elements.columns = [ (section, x) for x in section_elements.columns ]
        section_valid.columns = section_elements.columns
        data_df = pd.concat([data_df,section_elements],sort = False,axis=1)
        valid_df = pd.concat([valid_df,section_valid],sort = False,axis=1)

    # We do the actual out_dtypes here: because the full indexing occurs only
    # after concat, NaN values may arise only in data_df if a section is
    # not existing in a given report!
    for section in sections_df.columns:
        section_schema = schema['sections'].get(section)
        if section_schema.get('header').get('disable_read'):
            out_dtypes[(section, section) if multiindex else section] = 'object'
            continue

        elements_schema = section_schema['elements']
        # (element name, column label in data_df) pairs for this section
        if multiindex:
            element_columns = [ (x[1], (section, x[1])) for x in data_df.columns if x[0] == section ]
        else:
            # Single-level columns are plain element names and all of them
            # belong to the only section present
            element_columns = [ (x, x) for x in data_df.columns ]

        out_dtypes.update({ column:properties.pandas_dtypes.get(elements_schema[name].get('column_type')) for name, column in element_columns })
        # Keep the actual float dtype of the columns that ended up as floats
        out_dtypes.update({ column:data_df[column].dtype.name for name, column in element_columns if data_df[column].dtype.name in properties.numpy_floats })

    return data_df, valid_df, out_dtypes