import_data.py 2.64 KB
Newer Older
iregon's avatar
iregon committed
1 2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
iregon's avatar
iregon committed
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
#"""
#Created on Fri Jan 10 13:17:43 2020
#
#FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS:
#    AN ITERABLE WITH DATAFRAMES
#
#INPUT IS NOW ONLY A FILE PATH. COULD OPTIONALLY GET OTHER TYPE OBJECTS...
#
#OUTPUT IS AN ITERABLE, DEPENDING ON CHUNKSIZE BEING SET:
#    - a single dataframe in a list
#    - a pd.io.parsers.TextFileReader
#
#
#WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) IN ONE LINE
#
#The delimiter="\t" option in pandas.read_fwf prevents trailing white space
#from being stripped
#
#@author: iregon
#
#
#
#OPTIONS IN OLD DEVELOPMENT:
#    1. DLMT: delimiter = ',' default
#    names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
#    missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
#    TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8',
#                                 dtype = 'object', skip_blank_lines = False, chunksize = chunksize,
#                                 skiprows = skiprows, names = names, na_values = missing)
#
#    2. FWF:# delimiter = '\t' so that it reads blanks as blanks, otherwise reads as empty: NaN
#    this applies mainly when reading elements from sections, but we leave it also here
#    TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="\t", chunksize = chunksize)
#
#"""
iregon's avatar
iregon committed
38 39 40

import pandas as pd
import os
iregon's avatar
iregon committed
41

iregon's avatar
iregon committed
42
from .. import properties
iregon's avatar
iregon committed
43

44
def main(source,chunksize = None, skiprows = None):
    """
    Read a data file into an iterable of single-column pandas dataframes.

    Each row of a dataframe holds one line of the input file as a single
    block string (one fixed-width field of
    ``properties.MAX_FULL_REPORT_WIDTH`` characters).
    Currently only a data file path is supported as source data, but this
    could be extended to accept a different source object.

    Parameters
    ----------
    source : str
        Path to data file

    Keyword Arguments
    -----------------
    chunksize : int, opt
        Number of lines to chunk the input data into
    skiprows : int, opt
        Number of lines to skip from input file

    Returns
    -------
    iterable or None
        - a list with a single pandas dataframe if ``chunksize`` is not set
        - a pandas.io.parsers.TextFileReader if ``chunksize`` is set
        - None if ``source`` is not an existing file (a message naming the
          path is printed to stdout)

    """
    # Guard clause: report the offending path rather than a bare 'Error',
    # and keep returning None so existing callers are unaffected.
    if not os.path.isfile(source):
        print('Error: source data file not found: {}'.format(source))
        return None

    # A single fixed-width field reads each line as one string.
    # delimiter="\t" keeps blanks as blanks instead of having them stripped
    # or read as NaN.
    # NOTE(review): quotechar/escapechar set to '\0' presumably disable
    # quote and escape handling in the underlying parser - confirm.
    TextParser = pd.read_fwf(
        source,
        widths=[properties.MAX_FULL_REPORT_WIDTH],
        header=None,
        delimiter="\t",
        skiprows=skiprows,
        chunksize=chunksize,
        quotechar='\0',
        escapechar='\0',
    )

    # Without chunking read_fwf returns a plain dataframe; wrap it so the
    # caller always receives an iterable of dataframes.
    if not chunksize:
        TextParser = [TextParser]
    return TextParser