converters.py 2.79 KB
Newer Older
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
1
import pandas as pd
2
import numpy as np
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
3

iregon's avatar
iregon committed
4 5 6
from .. import properties


Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
# 1. dtype must be defined in dtype_properties.data_types
#>>> if not np.dtype('int8'):
#...     print('No data type')
#...
#>>> if not np.dtype('int786'):
#...     print('No data type')
#...
#Traceback (most recent call last):
#  File "<stdin>", line 1, in <module>
#TypeError: data type "int786" not understood
#
#   Watch this, for my objects I want to catch both empty and blank strings as missing
#   empty_string = ''
#   blank_string = '     '
#   len(empty_string) == 0
#   len(blank_string) != 0
#   len(empty_string) == len(blank_string.lstrip()) == 0
#   So, we'll eval: len(value.lstrip())
#
# return data.astype(self.dtype, casting = 'safe')
# Safe casting is requested on purpose: otherwise np.nan would be converted to
# some arbitrary real number, depending on the target dtype.

29

Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
30 31 32 33 34 35 36

class df_converters():
    """Converters that turn a pandas Series of raw values into typed data.

    An instance is bound to a target ``dtype``; each ``object_to_*`` method
    takes a Series and returns the converted Series.
    """

    def __init__(self, dtype):
        """Store the target dtype and the default scale/offset for it.

        The defaults are float (``1.``/``0.``) when ``dtype`` is listed in
        ``properties.numpy_floats`` and integer (``1``/``0``) otherwise, so
        that applying them does not by itself turn integers into floats.
        """
        self.dtype = dtype
        is_float = self.dtype in properties.numpy_floats
        self.numeric_scale = 1. if is_float else 1
        self.numeric_offset = 0. if is_float else 0

    def object_to_numeric(self, data, scale=None, offset=None):
        """Convert a Series of strings to numbers and apply scale and offset.

        Parameters
        ----------
        data : pandas.Series
            Raw values; empty or whitespace-only entries are treated as
            missing.
        scale : number, optional
            Multiplicative factor. ``None`` selects ``self.numeric_scale``.
        offset : number, optional
            Additive term. ``None`` selects ``self.numeric_offset``.

        Returns
        -------
        pandas.Series
            ``offset + value * scale``, cast to ``self.dtype`` when the cast
            succeeds, otherwise left in the dtype produced by
            ``pandas.to_numeric``. Unparseable entries become NaN.
        """
        # Only None means "use the default": an explicit 0 must be honoured
        # (a truthiness test would silently discard it).
        scale = self.numeric_scale if scale is None else scale
        offset = self.numeric_offset if offset is None else offset

        # Entries that are empty or entirely white space are missing values.
        data = data.replace(r'^\s*$', np.nan, regex=True)
        # In the remaining entries every blank stands for a zero digit.
        # The .str accessor fails when every entry is NaN, hence the guard.
        if data.count() > 0:
            data = data.str.replace(' ', '0', regex=False)

        # astype fails on strings; to_numeric coerces bad entries to NaN.
        data = pd.to_numeric(data, errors='coerce')
        data = offset + data * scale
        try:
            # NOTE(review): newer pandas releases may reject the `casting`
            # keyword altogether, in which case the un-cast series is always
            # returned by the fallback below -- confirm against the pandas
            # version this project pins.
            return data.astype(self.dtype, casting='safe')
        except (TypeError, ValueError):
            # Best effort: keep the to_numeric result when the cast to the
            # target dtype is not possible.
            return data

    def object_to_object(self, data, disable_white_strip=False):
        """Strip white space from a Series of strings.

        Parameters
        ----------
        data : pandas.Series
            Series of strings.
        disable_white_strip : bool or str, optional
            Falsy (default): strip both sides. ``'l'``: leave the left side
            untouched and strip only the right. ``'r'``: leave the right side
            untouched and strip only the left. Any other truthy value:
            return ``data`` unchanged.

        Returns
        -------
        pandas.Series
            The stripped Series. An entry that is empty after stripping stays
            an empty string; it is not turned into NaN.
        """
        if not disable_white_strip:
            return data.str.strip()
        if disable_white_strip == 'l':
            return data.str.rstrip()
        if disable_white_strip == 'r':
            return data.str.lstrip()
        return data

    def object_to_datetime(self, data, datetime_format="%Y%m%d"):
        """Parse a Series into datetimes using ``datetime_format``.

        Entries that do not match the format are coerced to NaT.
        """
        return pd.to_datetime(data, format=datetime_format, errors='coerce')

# Registry mapping a data type name to the bound conversion method for it.
converters = {}
# Every numeric type gets its own instance, so the default scale/offset
# match that type.
for dtype in properties.numeric_types:
    converters[dtype] = df_converters(dtype).object_to_numeric
# Non-numeric types: datetimes are parsed, the remaining ones are stripped.
converters.update(
    {
        'datetime': df_converters('datetime').object_to_datetime,
        'str': df_converters('str').object_to_object,
        'object': df_converters('object').object_to_object,
        'key': df_converters('key').object_to_object,
    }
)