Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
brivas
mdf_reader
Commits
83353712
Commit
83353712
authored
5 years ago
by
iregon
Browse files
Options
Download
Email Patches
Plain Diff
Added validate note
parent
0f4e44f6
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
29 additions
and
26 deletions
+29
-26
validate/validate.py
validate/validate.py
+29
-26
No files found.
validate/validate.py
View file @
83353712
...
...
@@ -13,6 +13,9 @@ Validated elements are those with the following column_types:
they should already be NaT if they do not validate as a valid datetime. The
corresponding mask is just created for them
DEV notes:
need to add tolerance to the numeric range validation
@author: iregon
"""
...
...
@@ -24,24 +27,24 @@ from .. import properties
from
..schemas
import
code_tables
from
..schemas
import
schemas
def validate_numeric(elements, data, schema):
    """Build a boolean validity mask for numeric elements.

    Each element's value is checked against the inclusive range
    ``[valid_min, valid_max]`` read from its entry in ``schema``. A bound
    that is absent from the schema entry defaults to -inf / +inf, and the
    affected elements are reported through ``logging.warning``.

    Parameters
    ----------
    elements : list
        Column labels of ``data`` to validate.
    data : pandas.DataFrame
        Data frame holding, at least, the columns listed in ``elements``.
    schema : dict
        Mapping from element label to a dict of attributes; the keys
        ``'valid_min'`` and ``'valid_max'`` are read from it.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``data`` with one column per element: True where
        the value lies within the bounds or is missing (NaN), False
        otherwise.
    """
    mask = pd.DataFrame(index=data.index, data=False, columns=elements)
    if len(elements) == 0:
        # Nothing to validate: hand back the (empty) mask untouched.
        return mask

    # Find thresholds in schema. Flag if not available -> warn
    lower = {x: schema.get(x).get('valid_min', -np.inf) for x in elements}
    upper = {x: schema.get(x).get('valid_max', np.inf) for x in elements}

    # An element is reported as soon as either of its bounds is still at
    # the infinite default.
    missing = [x for x in elements
               if lower[x] == -np.inf or upper[x] == np.inf]
    if missing:
        logging.warning('Data numeric elements with missing upper or lower threshold: {}'.format(",".join([str(x) for x in missing])))
        logging.warning('Corresponding upper and/or lower bounds set to +/-inf for validation')

    values = data[elements]
    # The bound lists follow the order of `elements`, so each bound lines
    # up with its own column in the comparison.
    above_min = values >= [lower[x] for x in elements]
    below_max = values <= [upper[x] for x in elements]
    # Missing values are not range failures: they are flagged as valid here.
    mask[elements] = (above_min & below_max) | values.isna()
    return mask
def
validate_codes
(
elements
,
data
,
code_tables_path
,
schema
,
supp
=
False
):
mask
=
pd
.
DataFrame
(
index
=
data
.
index
,
data
=
False
,
columns
=
elements
)
if
os
.
path
.
isdir
(
code_tables_path
):
for
element
in
elements
:
code_table
=
schema
.
get
(
element
).
get
(
'codetable'
)
...
...
@@ -63,10 +66,10 @@ def validate_codes(elements, data, code_tables_path, schema, supp = False):
key_elements
=
[
(
element
[
0
],
x
)
for
x
in
key_elements
]
else
:
key_elements
=
[
(
properties
.
dummy_level
,
x
)
if
not
isinstance
(
x
,
tuple
)
else
x
for
x
in
key_elements
]
dtypes
=
{
x
:
properties
.
pandas_dtypes
.
get
(
schema
.
get
(
x
).
get
(
'column_type'
))
for
x
in
key_elements
}
dtypes
=
{
x
:
properties
.
pandas_dtypes
.
get
(
schema
.
get
(
x
).
get
(
'column_type'
))
for
x
in
key_elements
}
table_keys
=
code_tables
.
table_keys
(
table
)
table_keys_str
=
[
"∿"
.
join
(
x
)
if
isinstance
(
x
,
list
)
else
x
for
x
in
table_keys
]
validation_df
=
data
[
key_elements
]
validation_df
=
data
[
key_elements
]
imask
=
pd
.
Series
(
index
=
data
.
index
,
data
=
True
)
imask
.
iloc
[
np
.
where
(
validation_df
.
notna
().
all
(
axis
=
1
))[
0
]]
=
validation_df
.
iloc
[
np
.
where
(
validation_df
.
notna
().
all
(
axis
=
1
))[
0
],:].
astype
(
dtypes
).
astype
(
'str'
).
apply
(
"∿"
.
join
,
axis
=
1
).
isin
(
table_keys_str
)
mask
[
element
]
=
imask
...
...
@@ -80,42 +83,42 @@ def validate_codes(elements, data, code_tables_path, schema, supp = False):
logging
.
warning
(
'Element mask set to False'
)
continue
else
:
logging
.
error
(
'Code tables path {} not found'
.
format
(
code_tables_path
))
logging
.
error
(
'Code tables path {} not found'
.
format
(
code_tables_path
))
logging
.
warning
(
'All coded elements set to False'
)
return
mask
def
validate
(
data
,
mask0
,
schema
,
code_tables_path
):
def
validate
(
data
,
mask0
,
schema
,
code_tables_path
):
logging
.
basicConfig
(
format
=
'%(levelname)s
\t
[%(asctime)s](%(filename)s)
\t
%(message)s'
,
level
=
logging
.
INFO
,
datefmt
=
'%Y%m%d %H:%M:%S'
,
filename
=
None
)
# Check input
# Check input
if
not
isinstance
(
data
,
pd
.
DataFrame
)
or
not
isinstance
(
mask0
,
pd
.
DataFrame
):
logging
.
error
(
'Input data and mask must be a pandas data frame object'
)
return
# Get the data elements from the input data: might be just a subset of
# data model and flatten the schema to get a simple and sequential list
# of elements included in the input data
elements
=
[
x
for
x
in
data
]
elements
=
[
x
for
x
in
data
]
element_atts
=
schemas
.
df_schema
(
elements
,
schema
)
# See what elements we need to validate
numeric_elements
=
[
x
for
x
in
elements
if
element_atts
.
get
(
x
).
get
(
'column_type'
)
in
properties
.
numeric_types
]
datetime_elements
=
[
x
for
x
in
elements
if
element_atts
.
get
(
x
).
get
(
'column_type'
)
==
'datetime'
]
datetime_elements
=
[
x
for
x
in
elements
if
element_atts
.
get
(
x
).
get
(
'column_type'
)
==
'datetime'
]
coded_elements
=
[
x
for
x
in
elements
if
element_atts
.
get
(
x
).
get
(
'column_type'
)
==
'key'
]
if
any
([
isinstance
(
x
,
tuple
)
for
x
in
numeric_elements
+
datetime_elements
+
coded_elements
]):
validated_columns
=
pd
.
MultiIndex
.
from_tuples
(
list
(
set
(
numeric_elements
+
coded_elements
+
datetime_elements
)))
else
:
validated_columns
=
list
(
set
(
numeric_elements
+
coded_elements
+
datetime_elements
))
mask
=
pd
.
DataFrame
(
index
=
data
.
index
,
columns
=
data
.
columns
)
# Validate elements by dtype:
# Validate elements by dtype:
# 1. Numeric elements
mask
[
numeric_elements
]
=
validate_numeric
(
numeric_elements
,
data
,
element_atts
)
# 2. Table coded elements
# See following: in multiple keys code tables, the non parameter element,
# won't have a code_table attribute in the element_atts:
...
...
@@ -127,15 +130,15 @@ def validate(data, mask0, schema, code_tables_path):
# Need to see how to grab the YEAR part of a datetime when YEAR comes from a datetime element
# pd.DatetimeIndex(df['_datetime']).year
if
len
(
coded_elements
)
>
0
:
mask
[
coded_elements
]
=
validate_codes
(
coded_elements
,
data
,
code_tables_path
,
element_atts
)
mask
[
coded_elements
]
=
validate_codes
(
coded_elements
,
data
,
code_tables_path
,
element_atts
)
# 3. Datetime elements
# Those declared as such in element_atts
# Because of the way they are converted, read into datetime,
# Because of the way they are converted, read into datetime,
# they should already be NaT if they do not validate as a valid datetime;
# let's check: hurray! they are!
mask
[
datetime_elements
]
=
data
[
datetime_elements
].
notna
()
mask
[
validated_columns
]
=
mask
[
validated_columns
].
mask
(
mask0
[
validated_columns
]
==
False
,
False
)
return
mask
\ No newline at end of file
return
mask
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment