#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Fri Jan 10 13:17:43 2020 @author: iregon """ import pandas as pd from io import StringIO as StringIO import mdf_reader.properties as properties import csv # To disable quoting from mdf_reader.common.converters import converters from mdf_reader.common.decoders import decoders def extract_fixed_width(section_serie_bf,section_schema): # Read section elements descriptors section_names = section_schema['elements'].keys() section_widths = list(map(lambda x: x if x else properties.MAX_FULL_REPORT_WIDTH, [ section_schema['elements'][i].get('field_length') for i in section_names ])) section_missing = { i:section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True else [section_schema['elements'][i].get('missing_value')," "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)] for i in section_names } section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, delimiter="\t", encoding = 'utf-8', dtype = 'object', skip_blank_lines = False ) return section_elements def extract_delimited(section_serie_bf,section_schema): delimiter = section_schema['header'].get('delimiter') section_names = section_schema['elements'].keys() section_missing = { x:section_schema['elements'][x].get('missing_value') for x in section_names } section_elements = pd.read_csv(section_serie_bf,header = None, delimiter = delimiter, encoding = 'utf-8', dtype = 'object', skip_blank_lines = False, names = section_names, na_values = section_missing) return section_elements def read_data(section_df,section_schema): section_names = section_df.columns section_dtypes = { i:section_schema['elements'][i]['column_type'] for i in section_names } encoded = [ (x) for x in section_names if 'encoding' in section_schema['elements'][x]] section_encoding = { i:section_schema['elements'][i]['encoding'] for i in encoded } for element in section_dtypes.keys(): #missing = section_elements[element].isna() if element in encoded: section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element]) kwargs = { converter_arg:section_schema['elements'][element].get(converter_arg) for converter_arg in properties.data_type_conversion_args.get(section_dtypes.get(element)) } section_df[element] = converters.get(section_dtypes.get(element))(section_df[element], **kwargs) # section_valid[element] = missing | section_elements[element].notna() return section_df def read_sections(sections_df, schema): multiindex = True if len(sections_df.columns) > 1 or sections_df.columns[0] != properties.dummy_level else False data_df = pd.DataFrame() out_dtypes = dict() for section in sections_df.columns: print('Reading section {}'.format(section)) section_schema = schema['sections'].get(section) disable_read = section_schema.get('header').get('disable_read') if not disable_read: field_layout = section_schema.get('header').get('field_layout') ignore = [ i for i in section_schema['elements'].keys() if section_schema['elements'][i].get('ignore') ] # evals to True if set and true, evals to False if not set or set and false # Get rid of false delimiters in fixed_width delimiter = section_schema['header'].get('delimiter') if delimiter and field_layout == 'fixed_width': sections_df[section] = sections_df[section].str.replace(delimiter,'') section_buffer = StringIO() # Writing options from quoting on to prevent supp buoy data to be quoted: # maybe this happenned because buoy data has commas, and pandas makes its own decission about # how to write that..... #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...) # Here indices are lost, have to give the real ones, those in section_strings: # we'll see if we do that in the caller module or here.... sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False)#,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t") ssshh = section_buffer.seek(0) # Get the individual elements as objects if field_layout == 'fixed_width': section_elements_obj = extract_fixed_width(section_buffer,section_schema) elif field_layout == 'delimited': section_elements_obj = extract_delimited(section_buffer,section_schema) section_elements_obj.drop(ignore, axis = 1, inplace = True) # Read the objects to their data types and apply decoding, scaling and so on... section_elements = read_data(section_elements_obj,section_schema) section_elements.index = sections_df[section].index else: section_elements = pd.DataFrame(sections_df[section],columns = [section]) if not disable_read: if multiindex: out_dtypes.update({ (section,i):properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in section_elements.columns } ) out_dtypes.update({ (section,i):section_elements[i].dtype.name for i in section_elements if section_elements[i].dtype.name in properties.numpy_floats}) else: out_dtypes.update({ i:properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in section_elements.columns } ) out_dtypes.update({ i:section_elements[i].dtype.name for i in section_elements if section_elements[i].dtype.name in properties.numpy_floats}) else: if multiindex: out_dtypes.update({ (section,section):'object' } ) else: out_dtypes.update({ section:'object' } ) section_elements.columns = [ (section, x) for x in section_elements.columns] if multiindex else section_elements.columns data_df = pd.concat([data_df,section_elements],sort = False,axis=1) return data_df,out_dtypes