Commit 0a2f4c89 authored by Irene Perez Gonzalez

First commit

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 20 08:05:50 2019
@author: iregon
"""
import os
import mdf_reader
import pandas as pd
import numpy as np
from io import StringIO
import mdf_reader.common.pandas_TextParser_hdlr as pandas_TextParser_hdlr
import mdf_reader.common.plots as plots
funPath = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(funPath,'data')
schema_lib = os.path.join(os.path.dirname(funPath),'schemas','lib')
# A. TESTS TO READ DATA FROM DIFFERENT DATA MODELS WITH AND WITHOUT SUPPLEMENTAL
# -----------------------------------------------------------------------------
def read_imma1_buoys_nosupp(plot_validation=False):
    schema = 'imma1'
    data_file_path = os.path.join(data_path,'063-714_2010-07_subset.imma')
    data = mdf_reader.read(data_file_path, data_model = schema)
    if plot_validation:
        plots.plot_model_validation(data)
    return data
def read_imma1_buoys_supp(plot_validation=False):
    schema = 'imma1'
    schema_supp = 'cisdm_dbo_imma1'
    data_file_path = os.path.join(data_path,'063-714_2010-07_subset.imma')
    supp_section = 'c99'
    supp_model = schema_supp
    data = mdf_reader.read(data_file_path, data_model = schema, supp_section = supp_section, supp_model = supp_model)
    if plot_validation:
        plots.plot_model_validation(data)
    return data
# B. TESTS TO ASSESS CHUNKING
# -----------------------------------------------------------------------------
# FROM FILE: WITH AND WITHOUT SUPPLEMENTAL
def read_imma1_buoys_nosupp_chunks():
    data_model = 'imma1'
    chunksize = 10000
    data_file_path = os.path.join(data_path,'063-714_2010-07_subset.imma')
    return mdf_reader.read(data_file_path, data_model = data_model, chunksize = chunksize)
def read_imma1_buoys_supp_chunks():
    data_file_path = os.path.join(data_path,'063-714_2010-07_subset.imma')
    chunksize = 10000
    data_model = 'imma1'
    supp_section = 'c99'
    supp_model = 'cisdm_dbo_imma1'
    return mdf_reader.read(data_file_path, data_model = data_model, supp_section = supp_section, supp_model = supp_model, chunksize = chunksize)
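# Illustrative driver (added sketch, not part of the original file): shows how the
# helpers above might be exercised, assuming the sample file under ./data is present
# and that the chunked reader yields DataFrames when iterated.
if __name__ == '__main__':
    data = read_imma1_buoys_nosupp(plot_validation=False)
    print(data['core'].head())
    for chunk in read_imma1_buoys_supp_chunks():
        print(chunk.shape)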
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 20 08:05:50 2019
@author: iregon
"""
import os
import mdf_reader
import pandas as pd
import numpy as np
from io import StringIO
import mdf_reader.common.pandas_TextParser_hdlr as pandas_TextParser_hdlr
funPath = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(funPath,'data')
schema_lib = os.path.join(os.path.dirname(funPath),'schemas','lib')
# A. TESTS TO READ DATA FROM DIFFERENT INPUTS
# -----------------------------------------------------------------------------
# FROM FILE: WITH AND WITHOUT SUPPLEMENTAL
def imma1_buoys_nosupp():
    schema = 'imma1'
    data_file_path = os.path.join(data_path,'meds_2010-07_subset.imma')
    return mdf_reader.read(data_file_path, data_model = schema)
def imma1_buoys_supp():
    schema = 'imma1'
    schema_supp = 'cisdm_dbo_imma1'
    data_file_path = os.path.join(data_path,'meds_2010-07_subset.imma')
    supp_section = 'c99'
    supp_model = schema_supp
    return mdf_reader.read(data_file_path, data_model = schema, supp_section = supp_section, supp_model = supp_model)
# FROM DATA FRAME: WITH AND WITHOUT SUPPLEMENTAL
def td11_deck187_nosupp():
    schema = 'td11'
    deck = '187'
    data_file_path = os.path.join(data_path,'AZH1.ascii')
    TextParser = pd.read_fwf(data_file_path,widths=[100000],header=None,delimiter="\t")
    deck_data = TextParser.loc[TextParser[0].str[0:3] == deck]
    deck_data.index = range(0,len(deck_data))
    return mdf_reader.read(deck_data, data_model = schema)
def td11_deck187_supp():
    schema = 'td11'
    schema_supp = 'deck187_td11'
    deck = '187'
    data_file_path = os.path.join(data_path,'AZH1.ascii')
    TextParser = pd.read_fwf(data_file_path,widths=[100000],header=None,delimiter="\t")
    deck_data = TextParser.loc[TextParser[0].str[0:3] == deck]
    deck_data.index = range(0,len(deck_data))
    supp_section = 'supplemental'
    supp_model = schema_supp
    return mdf_reader.read(deck_data, data_model = schema, supp_section = supp_section, supp_model = supp_model)
# B. TESTS TO ASSESS CHUNKING
# -----------------------------------------------------------------------------
# FROM FILE: WITH AND WITHOUT SUPPLEMENTAL
def read_imma1_buoys_nosupp_chunks():
    data_model = 'imma1'
    chunksize = 10000
    data_file_path = os.path.join(data_path,'meds_2010-07_subset.imma')
    return mdf_reader.read(data_file_path, data_model = data_model, chunksize = chunksize)
def read_imma1_buoys_supp_chunks():
    data_file_path = os.path.join(data_path,'meds_2010-07_subset.imma')
    chunksize = 10000
    data_model = 'imma1'
    supp_section = 'c99'
    supp_model = 'cisdm_dbo_imma1'
    return mdf_reader.read(data_file_path, data_model = data_model, supp_section = supp_section, supp_model = supp_model, chunksize = chunksize)
def assess_read_from_file_supp_chunk_options():
    nosupp_nochunk = imma1_buoys_nosupp()
    supp_nochunk = imma1_buoys_supp()
    io_nosupp_chunk = read_imma1_buoys_nosupp_chunks()
    nosupp_chunk = pd.DataFrame()
    for df in io_nosupp_chunk:
        nosupp_chunk = pd.concat([nosupp_chunk,df])
    io_supp_chunk = read_imma1_buoys_supp_chunks()
    supp_chunk = pd.DataFrame()
    for df in io_supp_chunk:
        supp_chunk = pd.concat([supp_chunk,df])
    print('Checking differences in core data when adding supplemental data with no chunking')
    if not nosupp_nochunk.drop('c99',axis = 1,level=0).equals(supp_nochunk.drop('c99',axis = 1,level=0)):
        print('...ERROR: differences found')
    else:
        print('...OK')
    print('\nChecking differences in core data when adding supplemental data with chunking')
    if not nosupp_chunk.drop('c99',axis = 1,level=0).equals(supp_chunk.drop('c99',axis = 1,level=0)):
        print('...ERROR: differences found')
    else:
        print('...OK')
    print('\nChecking differences in data when chunking with no supplemental')
    if not nosupp_nochunk.equals(nosupp_chunk):
        print('...ERROR: differences found')
    else:
        print('...OK')
    print('\nChecking differences in full data when chunking with supplemental')
    if not supp_nochunk.equals(supp_chunk):
        print('...ERROR: differences found')
    else:
        print('...OK')
    return
# FROM PD.IO.PARSERS.TEXTFILEREADER: WITH AND WITHOUT SUPPLEMENTAL
def read_td11_deck187_nosupp_chunks():
    data_model = 'td11'
    deck = '187'
    data_file_path = os.path.join(data_path,'AZH1.ascii')
    TextParser = pd.read_fwf(data_file_path,widths=[100000],header=None,delimiter="\t")
    deck_data = TextParser.loc[TextParser[0].str[0:3] == deck]
    deck_data.index = range(0,len(deck_data))
    output_buffer = StringIO()
    deck_data.to_csv(output_buffer, header = False, index = False)
    chunksize = 10000
    output_buffer.seek(0)
    TextParser = pd.read_fwf(output_buffer, widths=[100000], chunksize = chunksize, header = None)
    return mdf_reader.read(TextParser, data_model = data_model)
def read_td11_deck187_supp_chunks():
    data_model = 'td11'
    supp_model = 'deck187_td11'
    supp_section = 'supplemental'
    deck = '187'
    data_file_path = os.path.join(data_path,'AZH1.ascii')
    TextParser = pd.read_fwf(data_file_path,widths=[100000],header=None,delimiter="\t")
    deck_data = TextParser.loc[TextParser[0].str[0:3] == deck]
    deck_data.index = range(0,len(deck_data))
    output_buffer = StringIO()
    deck_data.to_csv(output_buffer, header = False, index = False)
    chunksize = 10000
    output_buffer.seek(0)
    TextParser = pd.read_fwf(output_buffer, widths=[100000], chunksize = chunksize, header = None)
    return mdf_reader.read(TextParser, data_model = data_model, supp_section = supp_section, supp_model = supp_model)
# C. TESTS TO READ DATA MODEL SCHEMA FROM EXTERNAL SOURCE
# -----------------------------------------------------------------------------
def read_imma1_buoys_supp_external_models():
    data_file_path = os.path.join(data_path,'meds_2010-07_subset.imma')
    schema = 'imma1'
    schema_supp = 'cisdm_dbo_imma1'
    data_model_path = os.path.join(schema_lib,schema)
    supp_section = 'c99'
    supp_model_path = os.path.join(schema_lib,schema_supp)
    return mdf_reader.read(data_file_path, data_model_path = data_model_path, supp_section = supp_section, supp_model_path = supp_model_path)
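# Note on external model paths (added; the exact directory contents are an assumption,
# inferred from how schemas/lib and the code_tables subfolder are used elsewhere):
#
#     <data_model_path>/
#         *.json             # schema definition(s) for the sections/elements
#         code_tables/       # one <codetable>.json per coded element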
# D. CHECK DATA SOURCES -------------------------------------------------------
def check_data_sources():
    data_file_path = os.path.join(data_path,'meds_2010-07_subset.imma')
    data_ioStringIO = StringIO()
    data_model = 'imma1'
    with open(data_file_path,'r') as fileO:
        data_ioStringIO.writelines(fileO.readlines())
    data_ioStringIO.seek(0)
    data_pandas_df = pd.read_fwf(data_file_path,widths=[100000],header=None,delimiter="\t")
    data_pandas_tfr = pd.read_fwf(data_file_path,widths=[100000],header=None,delimiter="\t", chunksize = 1000)
    sources = {'data_file_path': data_file_path, 'data_ioStringIO': data_ioStringIO,
               'data_pandas_df': data_pandas_df, 'data_pandas_tfr': data_pandas_tfr}
    for source in sources.keys():
        print('Reading from source {} ....'.format(source))
        try:
            data = mdf_reader.read(sources.get(source), data_model = data_model, sections = ['core'])
            if source == 'data_pandas_tfr':
                data_c = data.get_chunk()
                print(data_c['core']['SST'][0])
            else:
                print(data['core']['SST'][0])
            print('.....OK')
        except Exception as e:
            print('ERROR: {}'.format(e))
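# Illustrative driver (added sketch, not part of the original file): runs the two
# assessment helpers above, assuming the sample files under ./data are available.
if __name__ == '__main__':
    assess_read_from_file_supp_chunk_options()
    check_data_sources()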
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 09:38:17 2019
Validates elements in a pandas DataFrame against its input data model. Output
is a boolean DataFrame
Validated elements are those with the following column_types:
- any in properties.numeric_types: range validation
- 'key': code table validation
- 'datetime': because of the way they are converted, read into datetime,
they should already be NaT if they not validate as a valid datetime. The
correspoding mask is just created for them
@author: iregon
"""
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import absolute_import
# CAREFUL HERE:
# Note that in Python 3, the io.open function is an alias for the built-in open function.
# The built-in open function only supports the encoding argument in Python 3, not Python 2.
# https://docs.python.org/3.4/library/io.html?highlight=io
from io import StringIO as StringIO
import sys
import os
import pandas as pd
import numpy as np
import logging
from .. import properties
from ..schemas import code_tables
if sys.version_info[0] >= 3:
    py3 = True
else:
    py3 = False
    from io import BytesIO as BytesIO
# Get pandas dtype for time_stamps
toolPath = os.path.dirname(os.path.abspath(__file__))
dirname=os.path.dirname
schema_lib = os.path.join(dirname(toolPath),'schemas','lib')
def validate_numeric(elements, df, schema):
    # Find thresholds in schema. Flag if not available -> warn
    mask = pd.DataFrame(index = df.index, data = False, columns = elements)
    lower = { x:schema.get(x).get('valid_min', -np.inf) for x in elements }
    upper = { x:schema.get(x).get('valid_max', np.inf) for x in elements }
    set_elements = [ x for x in lower.keys() if lower.get(x) != -np.inf and upper.get(x) != np.inf ]
    if len([ x for x in elements if x not in set_elements ]) > 0:
        logging.warning('Data numeric elements with missing upper or lower threshold: {}'.format(",".join([ str(x) for x in elements if x not in set_elements ])))
        logging.warning('Corresponding upper and/or lower bounds set to +/-inf for validation')
    #mask[set_elements] = ((df[set_elements] >= [ lower.get(x) for x in set_elements ] ) & (df[set_elements] <= [ upper.get(x) for x in set_elements ])) | df[set_elements].isna()
    mask[elements] = ((df[elements] >= [ lower.get(x) for x in elements ]) & (df[elements] <= [ upper.get(x) for x in elements ])) | df[elements].isna()
    return mask
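# Illustrative example (added sketch, not part of the module): expected behaviour of the
# range mask on a toy frame; the element name and bounds below are made up.
#
#     toy_schema = {'AT': {'valid_min': -80, 'valid_max': 60}}
#     toy_df = pd.DataFrame({'AT': [15.0, 99.0, np.nan]})
#     validate_numeric(['AT'], toy_df, toy_schema)
#     # -> AT: True, False, True (missing values are not flagged as invalid)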
def validate_codes(elements, df, code_tables_path, schema, supp = False):
    mask = pd.DataFrame(index = df.index, data = False, columns = elements)
    if os.path.isdir(code_tables_path):
        for element in elements:
            code_table = schema.get(element).get('codetable')
            if not code_table:
                logging.error('Code table not defined for element {}'.format(element))
                logging.warning('Element mask set to False')
            else:
                code_table_path = os.path.join(code_tables_path, code_table + '.json')
                # Eval elements: if ._yyyy, ._xxx in name: pd.DatetimeIndex().xxxx is the element to pass
                # Additionally, on doing this, should make sure that element is a datetime type:
                if os.path.isfile(code_table_path):
                    try:
                        table = code_tables.read_table(code_table_path)
                        if supp:
                            key_elements = [ element[1] ] if not table.get('_keys') else list(table['_keys'].get(element[1]))
                        else:
                            key_elements = [ element ] if not table.get('_keys') else list(table['_keys'].get(element))
                        if supp:
                            key_elements = [ (element[0],x) for x in key_elements ]
                        else:
                            key_elements = [ (properties.dummy_level,x) if not isinstance(x,tuple) else x for x in key_elements ]
                        dtypes = { x:properties.pandas_dtypes.get(schema.get(x).get('column_type')) for x in key_elements }
                        table_keys = code_tables.table_keys(table)
                        table_keys_str = [ "∿".join(x) if isinstance(x,list) else x for x in table_keys ]
                        validation_df = df[key_elements]
                        imask = pd.Series(index = df.index, data = True)
                        imask.iloc[np.where(validation_df.notna().all(axis = 1))[0]] = validation_df.iloc[np.where(validation_df.notna().all(axis = 1))[0],:].astype(dtypes).astype('str').apply("∿".join, axis=1).isin(table_keys_str)
                        mask[element] = imask
                    except Exception as e:
                        logging.error('Error validating coded element {}:'.format(element))
                        logging.error('Error is {}:'.format(e))
                        logging.warning('Element mask set to False')
                else:
                    logging.error('Error validating coded element {}:'.format(element))
                    logging.error('Code table file {} not found'.format(code_table_path))
                    logging.warning('Element mask set to False')
                    continue
    else:
        logging.error('Code tables path {} not found'.format(code_tables_path))
        logging.warning('All coded elements set to False')
    return mask
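# Illustrative example (added sketch, not part of the module): assuming a hypothetical
# single-key code table sst_mi.json under code_tables_path whose keys are the strings
# '0'-'5', and a schema entry declaring the element as a coded ('key') column:
#
#     schema = {('core', 'SST_MI'): {'column_type': 'key', 'codetable': 'sst_mi'}}
#     df = pd.DataFrame({('core', 'SST_MI'): ['0', '7', None]})
#     validate_codes([('core', 'SST_MI')], df, code_tables_path, schema)
#     # -> True (key in table), False (key not in table), True (missing values are not flagged)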
def validate(data, schema, mask0, data_model = None, data_model_path = None, supp_section = None, supp_model = None, supp_model_path = None):
    # schema is the input data schema: the collection of attributes for the DF elements, not the data model schema.
    # Data model schema info is nevertheless needed to access the code tables.
    logging.basicConfig(format='%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s',
                        level=logging.INFO, datefmt='%Y%m%d %H:%M:%S', filename=None)
    # 0. Check arguments are valid---------------------------------------------
    if not data_model and not data_model_path:
        logging.error('A valid data model or data model path must be provided')
        return
    if supp_section:
        if not supp_model and not supp_model_path:
            logging.error('A valid data model or data model path must be provided for supplemental data')
            return
    if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.io.parsers.TextFileReader):
        logging.error('Input data must be a data frame or a TextFileReader object')
        return
    # 1. Get data models' paths------------------------------------------------
    if data_model:
        model_path = os.path.join(schema_lib, data_model)
    else:
        model_path = data_model_path
    code_tables_path = os.path.join(model_path, 'code_tables')
    if supp_section:
        if supp_model:
            supp_path = os.path.join(schema_lib, supp_model)
        else:
            supp_path = supp_model_path
        supp_code_tables_path = os.path.join(supp_path, 'code_tables')
    # 2. Go--------------------------------------------------------------------
    TextParserData = [data.copy()] if isinstance(data, pd.DataFrame) else data
    TextParserMask = [mask0.copy()] if isinstance(mask0, pd.DataFrame) else mask0
    output_buffer = StringIO() if py3 else BytesIO()
    for df, mk in zip(TextParserData, TextParserMask):
        elements = [ x for x in df if x in schema ]
        # See what elements we need to validate: coded elements go to a different code table path if supplemental
        numeric_elements = [ x for x in elements if schema.get(x).get('column_type') in properties.numeric_types ]
        datetime_elements = [ x for x in elements if schema.get(x).get('column_type') == 'datetime' ]
        coded_elements = [ x for x in elements if schema.get(x).get('column_type') == 'key' ]
        supp_coded_elements = []
        if supp_section:
            supp_coded_elements = [ x for x in coded_elements if x[0] == supp_section ]
            for x in supp_coded_elements:
                coded_elements.remove(x)
        if any([ isinstance(x, tuple) for x in numeric_elements + datetime_elements + coded_elements ]):
            validated_columns = pd.MultiIndex.from_tuples(list(set(numeric_elements + coded_elements + datetime_elements)))
        else:
            validated_columns = list(set(numeric_elements + coded_elements + datetime_elements))
        imask = pd.DataFrame(index = df.index, columns = df.columns)
        # Validate elements by dtype.
        # Table coded elements can also be numeric -> initially they should not have their bounds defined in the schema, but:
        # numeric validation will be overridden by code table validation!!!
        # 1. NUMERIC ELEMENTS
        imask[numeric_elements] = validate_numeric(numeric_elements, df, schema)
        # 2. TABLE CODED ELEMENTS
        # Note the following: in multiple-key code tables, the non-parameter elements won't have a code_table attribute in the schema,
        # so we need to check the code_table keys files in addition to the schema.
        # Additionally, a YEAR key can fail in one table but be compliant with another: how would we mask this?
        # Also, a YEAR defined as an integer will undergo its own check.....
        # So I think we need to check nested keys as a whole, and mask only the actual parameterized element:
        # get the full list of key combinations (tuples, triplets...) and check the column combination against that; if it fails, mark the element!
        # Need to see how to grab the YEAR part of a datetime when YEAR comes from a datetime element:
        # pd.DatetimeIndex(df['_datetime']).year
        if len(coded_elements) > 0:
            imask[coded_elements] = validate_codes(coded_elements, df, code_tables_path, schema)
        if len(supp_coded_elements) > 0:
            imask[supp_coded_elements] = validate_codes(supp_coded_elements, df, supp_code_tables_path, schema, supp = True)
        # 3. DATETIME ELEMENTS
        # Only those declared as such in the schema, not _datetime.
        # Because of the way they are read and converted to datetime, invalid values are already NaT,
        # so the mask is simply the non-missing flag.
        imask[datetime_elements] = df[datetime_elements].notna()
        imask[validated_columns] = imask[validated_columns].mask(mk[validated_columns] == False, False)
        imask.to_csv(output_buffer, header = False, mode = 'a', encoding = 'utf-8', index = False)
    output_buffer.seek(0)
    chunksize = None if isinstance(data, pd.DataFrame) else data.orig_options['chunksize']
    mask = pd.read_csv(output_buffer, names = [ x for x in imask ], chunksize = chunksize)
    return mask
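# Illustrative usage (added sketch, not part of the module); in the reader workflow the
# caller is expected to pass the parsed DataFrame (or TextFileReader), the element
# attribute schema built while reading, and the initial mask (names below are assumptions):
#
#     valid_mask = validate(df, element_schema, mask0, data_model='imma1')
#     # valid_mask is a boolean DataFrame aligned with df: True where an element passed its
#     # range, code table or datetime check; columns that are not validated are left unset.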