import_data.py 1.84 KB
Newer Older
iregon's avatar
iregon committed
1 2 3 4 5
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:17:43 2020

iregon's avatar
iregon committed
6
FUNCTION TO PREPARE SOURCE DATA INTO WHAT GET_SECTIONS() EXPECTS:
    AN ITERABLE OF DATAFRAMES
iregon's avatar
iregon committed
8

iregon's avatar
iregon committed
9
INPUT IS CURRENTLY ONLY A FILE PATH. IT COULD OPTIONALLY ACCEPT OTHER TYPES OF OBJECTS...
iregon's avatar
Cleaned  
iregon committed
10

11
OUTPUT IS AN ITERABLE WHOSE TYPE DEPENDS ON WHETHER CHUNKSIZE IS SET:
iregon's avatar
iregon committed
12 13 14 15 16
    - a single dataframe in a list (chunksize not set)
    - a pd.io.parsers.TextFileReader (chunksize set)


EACH LINE BASICALLY HOLDS 1 RECORD (ONE OR MULTIPLE REPORTS)
iregon's avatar
iregon committed
17

iregon's avatar
Cleaned  
iregon committed
18
The delimiter="\t" option in pandas.read_fwf prevents trailing white space
from being stripped

@author: iregon

iregon's avatar
iregon committed
23 24


iregon's avatar
iregon committed
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
OPTIONS IN OLD DEVELOPMENT:
    1. DLMT: delimiter = ',' default
        names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
        missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
        TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8',
                                 dtype = 'object', skip_blank_lines = False, chunksize = chunksize,
                                 skiprows = skiprows, names = names, na_values = missing)

    2. FWF: delimiter = '\t' so that blanks are read as blanks; otherwise they are read as empty (NaN).
    This applies mainly when reading elements from sections, but we keep it here as well
    TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="\t", chunksize = chunksize)

"""

import pandas as pd
import os
iregon's avatar
iregon committed
41

iregon's avatar
iregon committed
42
from .. import properties
iregon's avatar
iregon committed
43

44
def main(source, chunksize=None, skiprows=None):
    """Read a fixed-width source file into an iterable of dataframes.

    The file is read with ``pandas.read_fwf`` as a single column whose
    width is ``properties.MAX_FULL_REPORT_WIDTH``, with no header row.

    Parameters
    ----------
    source : str or path-like
        Path to the file to read.
    chunksize : int, optional
        Passed to ``pandas.read_fwf``. When not set (``None``), the whole
        file is read at once.
    skiprows : optional
        Passed unchanged to ``pandas.read_fwf``.

    Returns
    -------
    list, pandas chunked reader, or None
        - ``[DataFrame]`` when ``chunksize`` is not set, so the caller can
          always iterate over the result;
        - the object returned by ``pandas.read_fwf`` when ``chunksize`` is
          set (a ``TextFileReader`` per the module notes);
        - ``None`` when ``source`` is not an existing file; an error
          message is printed in that case.
    """
    if not os.path.isfile(source):
        # Report and return None instead of raising, so callers that check
        # the result for None keep working; the message names the path.
        print('Error: source file not found: {}'.format(source))
        return None

    # A tab-only delimiter is used so that blanks in the record are kept
    # rather than stripped (see the module notes on read_fwf).
    TextParser = pd.read_fwf(
        source,
        widths=[properties.MAX_FULL_REPORT_WIDTH],
        header=None,
        delimiter="\t",
        skiprows=skiprows,
        chunksize=chunksize,
    )
    if not chunksize:
        # Without chunking read_fwf returns one DataFrame; wrap it so the
        # output is iterable in both modes.
        return [TextParser]
    return TextParser