#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Tue Apr 30 09:38:17 2019 Reads source data from a data model to a pandas DataFrame. Optionally, it reads supplemental data from the same source (from a different data model) and pastes that to the output DataFrame Uses the meta_formats generic submodules ('delimited' and 'fixed_width') to pre-format data source and read either generic type of data model. @author: iregon """ from __future__ import unicode_literals from __future__ import print_function from __future__ import absolute_import # CAREFULL HERE: # Note that in Python 3, the io.open function is an alias for the built-in open function. # The built-in open function only supports the encoding argument in Python 3, not Python 2. # https://docs.python.org/3.4/library/io.html?highlight=io from io import StringIO as StringIO import sys import pandas as pd import numpy as np import logging from . import meta_formats from .. import properties from . import import_data from . import get_sections from . import read_sections import copy if sys.version_info[0] >= 3: py3 = True else: py3 = False from io import BytesIO as BytesIO # Get pandas dtype for time_stamps pandas_timestamp_dtype = pd.to_datetime(pd.DataFrame(['20000101'])[0],format='%Y%m%d').dtypes def read_model(source,schema, sections = None, chunksize = None, skiprows = None): # 0. GET META FORMAT SUBCLASS --------------------------------------------- # For future use: some work already done in schema reading if schema['header'].get('multiple_reports_per_line'): logging.error('File format not yet supported') sys.exit(1) # 1. PARSE SCHEMA --------------------------------------------------------- parsing_order = schema['header'].get('parsing_order') # 2. DEFINE OUTPUT -------------------------------------------------------- # 2.1 Sections to read if not sections: sections = [ x.get(y) for x in parsing_order for y in x ] read_sections_list = [y for x in sections for y in x] else: read_sections_list = sections # 3. HOMOGENEIZE INPUT DATA (FILE OR TEXTREADER) TO AN ITERABLE TEXTREADER logging.info("Getting data string from source...") TextParser = import_data.import_data(source, chunksize = chunksize, skiprows = skiprows) # 4. EXTRACT SECTIONS IN A PARSER; EXTRACT SECTIONS HERE AND READ DATA IN # SAME LOOP? SHOULD DO.... logging.info("Extracting sections...") data_buffer = StringIO() # valid_buffer = ... for i,string_df in enumerate(TextParser): # Get sections separated in a dataframe: one per column, only requested # sections, ignore rest. sections_df = get_sections.get_sections(string_df, schema, read_sections_list) # Read elements from sections: along data chunks, resulting data types # may vary if gaps [data_df,out_dtypesi ] = read_sections.read_sections(sections_df, schema) if i == 0: out_dtypes = copy.deepcopy(out_dtypesi) for k in out_dtypesi: if out_dtypesi in properties.numpy_floats: out_dtypes.update({ k:out_dtypesi.get(k) }) data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False) # [output_buffer,valid_buffer,dtypes] = reader_function(TextParser, schema, read_sections = read_sections, idx_offset = idx_offset ) # # # 5. OUTPUT DATA:---------------------------------------------------------- # # WE'LL NEED TO POSPROCESS THIS WHEN READING MULTIPLE REPORTS PER LINE data_buffer.seek(0) # valid_buffer.seek(0) # logging.info("Wrapping output....") # chunksize = TextParser.orig_options['chunksize'] if isinstance(TextParser,pd.io.parsers.TextFileReader) else None # logging.info('Data') # # 'datetime' is not a valid pandas dtype: Only on output (on reading) will be then converted (via parse_dates) to datetime64[ns] type, cannot specify 'datetime' (of any kind) here: will fail # date_columns = [] # Needs to be the numeric index of the column, as seems not to be able to work with tupples.... # for i,element in enumerate(list(dtypes)): # if dtypes.get(element) == 'datetime': # date_columns.append(i) data_reader = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes)#, parse_dates = date_columns) # logging.info('Mask') # valid_reader = pd.read_csv(valid_buffer,names = out_names, chunksize = chunksize) # return data_reader, valid_reader return data_reader