validate.py 7.54 KB
Newer Older
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 09:38:17 2019

Validates elements in a pandas DataFrame against its input data model. Output
is a boolean DataFrame

Validated elements are those with the following column_types:
    - any in properties.numeric_types: range validation
    - 'key': code table validation
    - 'datetime': because of the way they are converted when read into
    datetime, they should already be NaT if they do not validate as a valid
    datetime. The corresponding mask is just created for them

DEV notes:
need to add tolerance to the numeric range validation

@author: iregon
"""

import os
import pandas as pd
import numpy as np
import logging
from .. import properties
27 28
from ..data_models import code_tables
from ..data_models import schemas
Irene Perez Gonzalez's avatar
Irene Perez Gonzalez committed
29

iregon's avatar
iregon committed
30 31
def validate_numeric(elements,data,schema):
    """Validate numeric elements against the range declared in the schema.

    A value passes when it is missing (NaN) or when it lies within the closed
    interval [valid_min, valid_max] declared for its element. A bound that is
    not declared (or is declared as None) defaults to -inf / +inf, and a
    warning listing the affected elements is logged.

    Parameters
    ----------
    elements : list
        Column labels of ``data`` to validate.
    data : pandas.DataFrame
        Data holding, at least, the columns listed in ``elements``.
    schema : dict
        Maps each element to a dict of attributes, from which 'valid_min'
        and 'valid_max' are read. Elements without an entry are treated as
        having no bounds.

    Returns
    -------
    pandas.DataFrame
        Boolean mask indexed like ``data`` with one column per element.
    """
    # Nothing to validate: return an empty mask rather than comparing
    # against empty bound lists
    if len(elements) == 0:
        return pd.DataFrame(index = data.index)

    def bound(element, attribute, default):
        # Tolerate elements missing from the schema and explicit None bounds
        value = (schema.get(element) or {}).get(attribute)
        return default if value is None else value

    lower = [ bound(x, 'valid_min', -np.inf) for x in elements ]
    upper = [ bound(x, 'valid_max', np.inf) for x in elements ]

    # Find thresholds in schema. Flag if not available -> warn
    unbounded = [ str(x) for x, low, high in zip(elements, lower, upper)
                  if low == -np.inf or high == np.inf ]
    if unbounded:
        logging.warning('Data numeric elements with missing upper or lower threshold: {}'.format(",".join(unbounded)))
        logging.warning('Corresponding upper and/or lower bounds set to +/-inf for validation')

    mask = pd.DataFrame(index = data.index, data = False, columns = elements)
    values = data[elements]
    # The bound lists are matched to the selected columns by position
    mask[elements] = ((values >= lower) & (values <= upper)) | values.isna()
    return mask

iregon's avatar
iregon committed
44
def validate_codes(elements, data, code_tables_path, schema, supp = False):
    """Validate coded ('key') elements against their code tables.

    For every element, the code table named by its 'codetable' schema
    attribute is read from ``<code_tables_path>/<codetable>.json``. Rows where
    all the key columns are present are validated by joining the (type cast,
    stringified) key columns with "∿" and checking the result against the
    table keys. Rows with any missing key column are left as True.

    Any failure (code tables directory missing, no code table declared, code
    table file missing, or an exception while validating) is logged and
    leaves the element mask set to False.

    Parameters
    ----------
    elements : list
        Column labels of ``data`` to validate.
    data : pandas.DataFrame
        Data holding the elements and any additional key columns.
    code_tables_path : str
        Directory holding the code table json files.
    schema : dict
        Maps each element to a dict of attributes; 'codetable' and
        'column_type' are read from it.
    supp : bool, optional
        If True, elements are taken to be (section, name) tuples: the name
        is looked up in the code table keys and the section is prepended
        to each key column. Otherwise non-tuple key columns are prefixed
        with ``properties.dummy_level``.

    Returns
    -------
    pandas.DataFrame
        Boolean mask indexed like ``data`` with one column per element.
    """
    mask = pd.DataFrame(index = data.index, data = False, columns = elements)

    if not os.path.isdir(code_tables_path):
        logging.error('Code tables path {} not found'.format(code_tables_path))
        logging.warning('All coded elements set to False')
        return mask

    for element in elements:
        # Tolerate elements missing from the schema: same as no code table
        code_table = (schema.get(element) or {}).get('codetable')
        if not code_table:
            logging.error('Code table not defined for element {}'.format(element))
            logging.warning('Element mask set to False')
            continue

        code_table_path = os.path.join(code_tables_path, code_table + '.json')
        # Eval elements: if ._yyyy, ._xxx in name: pd.DateTimeIndex().xxxx is the element to pass
        # Additionally, on doing this, should make sure that element is a datetime type:
        if not os.path.isfile(code_table_path):
            logging.error('Error validating coded element {}:'.format(element))
            logging.error('Code table file {} not found'.format(code_table_path))
            logging.warning('Element mask set to False')
            continue

        try:
            table = code_tables.read_table(code_table_path)
            # Name under which the element is registered in the table '_keys'
            own_key = element[1] if supp else element
            if table.get('_keys'):
                key_elements = list(table['_keys'].get(own_key))
            else:
                key_elements = [ own_key ]
            if supp:
                key_elements = [ (element[0],x) for x in key_elements ]
            else:
                key_elements = [ (properties.dummy_level,x) if not isinstance(x,tuple) else x for x in key_elements ]
            dtypes =  { x:properties.pandas_dtypes.get(schema.get(x).get('column_type')) for x in key_elements }
            table_keys = code_tables.table_keys(table)
            table_keys_str = [ "∿".join(x) if isinstance(x,list) else x for x in table_keys ]
            validation_df = data[key_elements]
            # Positions of the rows where every key column is available
            complete_rows = np.where(validation_df.notna().all(axis = 1))[0]
            imask = pd.Series(index = data.index, data =True)
            if len(complete_rows) > 0:
                joined_keys = validation_df.iloc[complete_rows,:].astype(dtypes).astype('str').apply("∿".join, axis=1)
                # Assign by position (plain array), so that the result does
                # not depend on index alignment
                imask.iloc[complete_rows] = joined_keys.isin(table_keys_str).values
            mask[element] = imask
        except Exception as e:
            # Deliberate best effort: a failing element stays False in the
            # mask and the rest of the elements are still validated
            logging.error('Error validating coded element {}:'.format(element))
            logging.error('Error is {}:'.format(e))
            logging.warning('Element mask set to False')

    return mask

iregon's avatar
iregon committed
91 92

def validate(data, mask0, schema, code_tables_path):
    """Validate the elements of a DataFrame against its data model.

    Numeric elements are range validated, coded ('key') elements are
    validated against their code tables and datetime elements are flagged
    valid where they are not missing. Finally, entries flagged False in
    ``mask0`` are forced to False in the validated columns. Columns whose
    column_type is none of the above are left unset in the output.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to validate.
    mask0 : pandas.DataFrame
        Initial mask; must hold, at least, the columns to be validated.
    schema : dict
        Data model schema, as accepted by ``schemas.df_schema``.
    code_tables_path : str
        Directory holding the code table files.

    Returns
    -------
    pandas.DataFrame or None
        Mask with the index and columns of ``data``; None (after logging an
        error) if ``data`` or ``mask0`` is not a pandas DataFrame.
    """
    logging.basicConfig(format='%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s',
                    level=logging.INFO,datefmt='%Y%m%d %H:%M:%S',filename=None)

    # Check input
    if not isinstance(data,pd.DataFrame) or not isinstance(mask0,pd.DataFrame):
        logging.error('Input data and mask must be a pandas data frame object')
        return

    # Get the data elements from the input data: might be just a subset of
    # data model and flatten the schema to get a simple and sequential list
    # of elements included in the input data
    elements = list(data)
    element_atts = schemas.df_schema(elements, schema)

    def column_type(element):
        # Tolerate elements without attributes: they are just not validated
        return (element_atts.get(element) or {}).get('column_type')

    # See what elements we need to validate
    numeric_elements = [ x for x in elements if column_type(x) in properties.numeric_types ]
    datetime_elements = [ x for x in elements if column_type(x) == 'datetime' ]
    coded_elements = [ x for x in elements if column_type(x) == 'key' ]

    # Keep the validated columns in the order they have in the data
    to_validate = set(numeric_elements + coded_elements + datetime_elements)
    validated_columns = [ x for x in elements if x in to_validate ]

    mask = pd.DataFrame(index = data.index, columns = data.columns)

    # Validate elements by dtype:
    # 1. Numeric elements
    if len(numeric_elements) > 0:
        mask[numeric_elements] = validate_numeric(numeric_elements, data, element_atts)

    # 2. Table coded elements
    # In code tables with multiple keys, the non parameter elements won't
    # have a code_table attribute in element_atts, so the code table keys
    # need to be checked in addition to element_atts.
    # Additionally, a YEAR key can fail in one table but be compliant with
    # another: how would we mask this? Also, a YEAR defined as an integer
    # will undergo its own check.
    # So nested keys are checked as a whole, and only the actual
    # parameterized element is masked: the column combination is checked
    # against the full list of key combinations (tuples, triplets...).
    # Need to see how to grab the YEAR part of a datetime when YEAR comes
    # from a datetime element: pd.DatetimeIndex(df['_datetime']).year
    if len(coded_elements) > 0:
        mask[coded_elements] = validate_codes(coded_elements, data, code_tables_path, element_atts)

    # 3. Datetime elements
    # Those declared as such in element_atts
    # Because of the way they are converted when read into datetime, they
    # should already be NaT if they do not validate as a valid datetime
    if len(datetime_elements) > 0:
        mask[datetime_elements] = data[datetime_elements].notna()

    # Entries flagged False in the input mask stay False, whatever the
    # result of the validation above
    if len(validated_columns) > 0:
        if any(isinstance(x,tuple) for x in validated_columns):
            validated_columns = pd.MultiIndex.from_tuples(validated_columns)
        mask[validated_columns] = mask[validated_columns].mask(mask0[validated_columns].eq(False), False)

    return mask