import_data.py 2.52 KB
Newer Older
iregon's avatar
iregon committed
1 2 3 4 5
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:17:43 2020

iregon's avatar
iregon committed
6 7 8 9 10 11 12
FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS:
    AN ITERABLE WITH DATAFRAMES

INPUT IS EITHER:
    - pd.io.parsers.TextFileReader
    - io.StringIO
    - file path
iregon's avatar
Cleaned  
iregon committed
13

iregon's avatar
iregon committed
14 15 16 17 18 19
OUTPUT IS AN ITERABLE, DEPENDING ON SOURCE TYPE AND CHUNKSIZE BEING SET:
    - a single dataframe in a list
    - a pd.io.parsers.TextFileReader


WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) IN ONE LINE
iregon's avatar
iregon committed
20

iregon's avatar
Cleaned  
iregon committed
21
delimiter="\t" option in pandas.read_fwf prevents trailing white spaces
from being stripped

@author: iregon

iregon's avatar
iregon committed
26 27 28
DEV NOTES:
1) What this module is able to ingest needs to align with properties.supported_sources
2) Check io.StringIO input: why there, does it actually work as it is?
iregon's avatar
Cleaned  
iregon committed
29
3) Check pd.io.parsers.textfilereader input: why there, does it actually work as it is?
iregon's avatar
iregon committed
30 31


iregon's avatar
iregon committed
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
OPTIONS IN OLD DEVELOPMENT:
    1. DLMT: delimiter = ',' default
        names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
        missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
        TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8',
                                 dtype = 'object', skip_blank_lines = False, chunksize = chunksize,
                                 skiprows = skiprows, names = names, na_values = missing)

    2. FWF:# delimiter = '\t' so that it reads blanks as blanks, otherwise reads as empty: NaN
    this applies mainly when reading elements from sections, but we leave it also here
    TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="\t", chunksize = chunksize)

"""

import pandas as pd
import os
iregon's avatar
iregon committed
48 49
import io

iregon's avatar
iregon committed
50
from .. import properties
iregon's avatar
iregon committed
51

iregon's avatar
iregon committed
52 53 54 55
def to_iterable_df(source, skiprows=None, chunksize=None):
    """Read ``source`` as one fixed-width column and return it as an iterable.

    Each line of ``source`` is read into a single column whose width is
    ``properties.MAX_FULL_REPORT_WIDTH``; no header row is assumed.

    Parameters
    ----------
    source :
        Anything ``pandas.read_fwf`` accepts (file path or text buffer).
    skiprows :
        Forwarded unchanged to ``pandas.read_fwf``.
    chunksize :
        Forwarded unchanged to ``pandas.read_fwf``.

    Returns
    -------
    The object returned by ``pandas.read_fwf`` when ``chunksize`` is truthy,
    otherwise a one-element list wrapping that object, so the caller can
    iterate over the result in both cases.
    """
    # delimiter="\t" keeps trailing blanks of each line from being stripped
    # (see the module docstring).
    parsed = pd.read_fwf(
        source,
        widths=[properties.MAX_FULL_REPORT_WIDTH],
        header=None,
        delimiter="\t",
        skiprows=skiprows,
        chunksize=chunksize,
    )
    return parsed if chunksize else [parsed]

iregon's avatar
iregon committed
58

iregon's avatar
iregon committed
59 60 61
def import_data(source, chunksize=None, skiprows=None):
    """Turn ``source`` into an iterable of dataframes.

    Parameters
    ----------
    source :
        One of:
        - a ``pd.io.parsers.TextFileReader``: returned as is, ``chunksize``
          and ``skiprows`` are ignored in this case;
        - an ``io.StringIO`` buffer;
        - the path of an existing file.
    chunksize :
        Forwarded to ``to_iterable_df``.
    skiprows :
        Forwarded to ``to_iterable_df``.

    Returns
    -------
    The input reader itself, the result of ``to_iterable_df`` for buffers and
    file paths, or ``None`` (after printing an error message) when ``source``
    is none of the supported kinds.
    """
    # Already an iterable reader: nothing to prepare.
    if isinstance(source, pd.io.parsers.TextFileReader):
        return source

    is_buffer = isinstance(source, io.StringIO)
    # Only path-like objects are handed to os.path.isfile: it raises
    # TypeError for values such as None or floats, which would bypass the
    # error branch below.
    is_file = (
        isinstance(source, (str, bytes, os.PathLike))
        and os.path.isfile(source)
    )
    if is_buffer or is_file:
        return to_iterable_df(source, skiprows=skiprows, chunksize=chunksize)

    # Best-effort contract kept: report the problem and return None.
    print(
        'Error: unsupported source {!r}; expected a TextFileReader, '
        'an io.StringIO or the path of an existing file'.format(source)
    )
    return None