From 83353712325a5836e969d2d8d7ceaf4a12342e0c Mon Sep 17 00:00:00 2001 From: perezgonzalez-irene <iregon@noc.ac.uk> Date: Wed, 12 Feb 2020 10:21:40 +0000 Subject: [PATCH] Added validate note --- validate/validate.py | 55 +++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/validate/validate.py b/validate/validate.py index d306d4a..304811b 100644 --- a/validate/validate.py +++ b/validate/validate.py @@ -13,6 +13,9 @@ Validated elements are those with the following column_types: they should already be NaT if they not validate as a valid datetime. The correspoding mask is just created for them +DEV notes: +need to add tolerance to the numeric range validation + @author: iregon """ @@ -24,24 +27,24 @@ from .. import properties from ..schemas import code_tables from ..schemas import schemas -def validate_numeric(elements,data,schema): - # Find thresholds in schema. Flag if not available -> warn +def validate_numeric(elements,data,schema): + # Find thresholds in schema. Flag if not available -> warn mask = pd.DataFrame(index = data.index, data = False, columns = elements) lower = { x:schema.get(x).get('valid_min', -np.inf) for x in elements } upper = { x:schema.get(x).get('valid_max', np.inf) for x in elements } - + set_elements = [ x for x in lower.keys() if lower.get(x) != -np.inf and upper.get(x) != np.inf ] if len([ x for x in elements if x not in set_elements ]) > 0: logging.warning('Data numeric elements with missing upper or lower threshold: {}'.format(",".join([ str(x) for x in elements if x not in set_elements ]))) logging.warning('Corresponding upper and/or lower bounds set to +/-inf for validation') - + mask[elements] = ((data[elements] >= [ lower.get(x) for x in elements ] ) & (data[elements] <= [ upper.get(x) for x in elements ])) | data[elements].isna() return mask def validate_codes(elements, data, code_tables_path, schema, supp = False): - + mask = pd.DataFrame(index = data.index, data = False, columns = elements) - + if os.path.isdir(code_tables_path): for element in elements: code_table = schema.get(element).get('codetable') @@ -63,10 +66,10 @@ def validate_codes(elements, data, code_tables_path, schema, supp = False): key_elements = [ (element[0],x) for x in key_elements ] else: key_elements = [ (properties.dummy_level,x) if not isinstance(x,tuple) else x for x in key_elements ] - dtypes = { x:properties.pandas_dtypes.get(schema.get(x).get('column_type')) for x in key_elements } + dtypes = { x:properties.pandas_dtypes.get(schema.get(x).get('column_type')) for x in key_elements } table_keys = code_tables.table_keys(table) table_keys_str = [ "∿".join(x) if isinstance(x,list) else x for x in table_keys ] - validation_df = data[key_elements] + validation_df = data[key_elements] imask = pd.Series(index = data.index, data =True) imask.iloc[np.where(validation_df.notna().all(axis = 1))[0]] = validation_df.iloc[np.where(validation_df.notna().all(axis = 1))[0],:].astype(dtypes).astype('str').apply("∿".join, axis=1).isin(table_keys_str) mask[element] = imask @@ -80,42 +83,42 @@ def validate_codes(elements, data, code_tables_path, schema, supp = False): logging.warning('Element mask set to False') continue else: - logging.error('Code tables path {} not found'.format(code_tables_path)) + logging.error('Code tables path {} not found'.format(code_tables_path)) logging.warning('All coded elements set to False') return mask - -def validate(data, mask0, schema, code_tables_path): + +def validate(data, mask0, schema, code_tables_path): logging.basicConfig(format='%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s', level=logging.INFO,datefmt='%Y%m%d %H:%M:%S',filename=None) - - # Check input + + # Check input if not isinstance(data,pd.DataFrame) or not isinstance(mask0,pd.DataFrame): logging.error('Input data and mask must be a pandas data frame object') return - + # Get the data elements from the input data: might be just a subset of # data model and flatten the schema to get a simple and sequential list # of elements included in the input data - elements = [ x for x in data ] + elements = [ x for x in data ] element_atts = schemas.df_schema(elements, schema) # See what elements we need to validate numeric_elements = [ x for x in elements if element_atts.get(x).get('column_type') in properties.numeric_types ] - datetime_elements = [ x for x in elements if element_atts.get(x).get('column_type') == 'datetime' ] + datetime_elements = [ x for x in elements if element_atts.get(x).get('column_type') == 'datetime' ] coded_elements = [ x for x in elements if element_atts.get(x).get('column_type') == 'key' ] - + if any([isinstance(x,tuple) for x in numeric_elements + datetime_elements + coded_elements ]): validated_columns = pd.MultiIndex.from_tuples(list(set(numeric_elements + coded_elements + datetime_elements))) else: validated_columns = list(set(numeric_elements + coded_elements + datetime_elements)) - + mask = pd.DataFrame(index = data.index, columns = data.columns) - # Validate elements by dtype: + # Validate elements by dtype: # 1. Numeric elements mask[numeric_elements] = validate_numeric(numeric_elements, data, element_atts) - + # 2. Table coded elements # See following: in multiple keys code tables, the non parameter element, # won't have a code_table attribute in the element_atts: @@ -127,15 +130,15 @@ def validate(data, mask0, schema, code_tables_path): # Need to see how to grab the YEAR part of a datetime when YEAR comes from a datetime element # pd.DatetimeIndex(df['_datetime']).year if len(coded_elements)> 0: - mask[coded_elements] = validate_codes(coded_elements, data, code_tables_path, element_atts) - + mask[coded_elements] = validate_codes(coded_elements, data, code_tables_path, element_atts) + # 3. Datetime elements # Those declared as such in element_atts - # Because of the way they are converted, read into datetime, + # Because of the way they are converted, read into datetime, # they should already be NaT if they not validate as a valid datetime; # let's check: hurray! they are! mask[datetime_elements] = data[datetime_elements].notna() - + mask[validated_columns] = mask[validated_columns].mask(mask0[validated_columns] == False, False) - - return mask \ No newline at end of file + + return mask -- GitLab