decoders.py 4.8 KB
Newer Older
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
1 2 3

import numpy as np
import string
iregon's avatar
iregon committed
4 5

from .. import properties
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112


#for importer, modname, ispkg in pkgutil.walk_packages(path=package.__path__,prefix=package.__name__+'.',onerror=lambda x: None):
#    print(modname.split(".")[-1])
# TO DECODE FROM OBJECT TO INTEGER
#
# Decodes an input pd.Series of object dtype to a specified data type
#
# On missing data, numeric results follow the pandas NA type promotion needed to accommodate np.nan:
# promotion dtype for storing NAs: integer is cast to float64
# (https://pandas.pydata.org/pandas-docs/version/0.22/gotchas.html#nan-integer-na-values-and-na-type-promotions)
#
# return base10.astype(self.dtype, casting = 'safe')
# safe casting specified, otherwise converts np.nan to some number depending on dtype.

# Lookup tables for signed overpunch decoding, built once at import time
# instead of on every per-element call.
#
# Digit encoded by each accepted character. Plain digits are included because
# in TDF-11 there is a mix of overpunched and non-overpunched values.
_OVERPUNCH_NUMBER = {digit: digit for digit in string.digits}
_OVERPUNCH_NUMBER.update(
    {letter: str(i + 1) for i, letter in enumerate(string.ascii_uppercase[:9])})    # A-I -> 1-9
_OVERPUNCH_NUMBER.update(
    {letter: str(i + 1) for i, letter in enumerate(string.ascii_uppercase[9:18])})  # J-R -> 1-9
_OVERPUNCH_NUMBER.update({'{': '0', '<': '0', '}': '0', '!': '0'})

# Sign factor carried by each accepted character: digits, A-I, '{' and '<'
# are positive; J-R, '}' and '!' are negative.
_OVERPUNCH_FACTOR = {
    char: 1 for char in string.digits + string.ascii_uppercase[:9] + '{<'}
_OVERPUNCH_FACTOR.update(
    {char: -1 for char in string.ascii_uppercase[9:18] + '}!'})


def signed_overpunch_i(x):
    """Decode a single signed overpunch element to a signed integer.

    Every character of ``x`` is translated to a decimal digit and to a sign
    factor (+1 or -1). The result is the integer formed by the digits
    multiplied by the product of all the sign factors.

    Parameters
    ----------
    x : str or float
        Element to decode. np.nan, empty strings and all-blank strings are
        treated as missing data.

    Returns
    -------
    integer or float
        The decoded value, or np.nan when the element is missing or cannot
        be decoded. Elements that cannot be decoded (unknown characters,
        non-iterable input) are additionally reported on stdout.
    """
    try:
        # np.nan is the only value expected here that differs from itself
        if x != x:
            return np.nan
        # Blanks are missing data, not decoding errors: do not report them
        if isinstance(x, str) and not x.strip():
            return np.nan
        digits = "".join([_OVERPUNCH_NUMBER[char] for char in x])
        if not digits:
            # Empty non-string sequence: nothing to decode
            return np.nan
        # np.prod keeps the numpy integer return type of the decoded value
        sign = np.prod([_OVERPUNCH_FACTOR[char] for char in x])
        return sign * int(digits)
    except Exception as e:
        # Deliberate best effort: a bad element must not abort the decoding
        # of the whole column, so it is reported and returned as missing.
        print('ERROR decoding element: {}'.format(x))
        print(e)
        return np.nan

class df_decoders():
    """Column decoders returning numpy arrays cast to a target data type.

    Parameters
    ----------
    dtype : str
        Target data type. Any value not found in ``properties.numeric_types``
        is replaced by 'object'.
    """

    def __init__(self, dtype):
        self.dtype = dtype if dtype in properties.numeric_types else 'object'

    def _safe_cast(self, values):
        """Cast ``values`` to ``self.dtype`` with safe casting, if possible.

        Safe casting is requested because an unsafe cast would convert np.nan
        to some number depending on the dtype. When the safe cast is not
        possible the array is returned with the dtype it already has.
        """
        try:
            return values.astype(self.dtype, casting='safe')
        except (TypeError, ValueError):
            return values

    def signed_overpunch(self, data):
        """Decode signed overpunch elements in ``data``.

        Each element is decoded with ``signed_overpunch_i`` into a float
        array (np.nan for missing elements), which is then safely cast to
        the target dtype when possible.
        """
        decoded_numeric = np.vectorize(signed_overpunch_i, otypes=[float])(data)
        return self._safe_cast(decoded_numeric)

    def base36(self, data):
        """Decode base 36 elements in ``data`` to base 10.

        Elements that are np.nan or falsy (e.g. empty strings) are decoded
        as np.nan. For the 'object' target dtype the decoded values are
        returned as strings, otherwise as integers; the result is then safely
        cast to the target dtype when possible.
        """
        # The `i == i` guard matters: int(str(np.nan), 36) ==> 30191
        if self.dtype == 'object':
            # Elements are converted to str one by one because astype() does
            # not convert the elements themselves, and str methods in
            # downstream converters then fail on the resulting int elements.
            # dtype=object is required here: without it numpy builds a
            # fixed-width string array and np.nan becomes the string 'nan'.
            base10 = np.array(
                [str(int(str(i), 36)) if i == i and i else np.nan for i in data],
                dtype=object)
        else:
            base10 = np.array(
                [int(str(i), 36) if i == i and i else np.nan for i in data])

        return self._safe_cast(base10)



# Decoder registry: decoders[<encoding>][<data type>] is the bound decoding
# method of a df_decoders instance created for that data type.
decoders = {'signed_overpunch': {}, 'base36': {}}

# One entry per numeric type, for both encodings
for dtype in properties.numeric_types:
    decoders['signed_overpunch'][dtype] = df_decoders(dtype).signed_overpunch
    decoders['base36'][dtype] = df_decoders(dtype).base36

# The 'key' entries are registered last, after the numeric types
decoders['signed_overpunch']['key'] = df_decoders('key').signed_overpunch
decoders['base36']['key'] = df_decoders('key').base36


## Now add the file format specific decoders
#import pkgutil
#import importlib
#from mdf_reader import fs_decoders
#package=fs_decoders
#for importer, modname, ispkg in pkgutil.walk_packages(path=package.__path__,prefix=package.__name__+'.',onerror=lambda x: None):
#    file_format = modname.split(".")[-1]
#    try:
#        file_format_decoders = importlib.import_module(modname, package=None).decoders
#        for decoder in file_format_decoders.keys():
#            decoders[".".join([file_format,decoder])] = file_format_decoders.get(decoder)
#    except Exception as e:
#        logging.error("Error loading {0} decoders: {1}".format(modname,e))
#