Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
brivas
mdf_reader
Commits
df566664
Commit
df566664
authored
5 years ago
by
iregon
Browse files
Options
Download
Email Patches
Plain Diff
Adapted to renamed schemas package (data_models)
parent
2f3b546e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
139 additions
and
19 deletions
+139
-19
read.py
read.py
+139
-19
No files found.
read.py
View file @
df566664
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 09:38:17 2019
Manages the integral sequence in data file reading
from a data model:
- Access to data model
- Data file import
- Data file reading
- Data validation
- Output
Reads a data file to a pandas DataFrame using a pre-defined data model.
Contains the following functions:
The data model needs to be input to the module as a named model (included in the module) or as the path to a valid data model.
Data elements are validated against their data model after reading, producing a boolean mask.
Uses submodules:
- schemas
- reader
- validate
@author: iregon
* ERV - does the actual extraction, reading and validation of the input data
* main - the main function of the script
"""
import
os
import
sys
import
pandas
as
pd
...
...
@@ -24,20 +25,45 @@ import json
import
copy
from
io
import
StringIO
as
StringIO
from
.
import
schemas
from
.
data_models
import
schemas
from
.
import
properties
from
.common
import
pandas_TextParser_hdlr
from
.reader
import
import_data
from
.reader
import
get_sections
from
.reader
import
read_sections
from
.reader
.read_sections
import
main
as
read_sections
from
.validate
import
validate
toolPath
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
schema_lib
=
os
.
path
.
join
(
toolPath
,
'
schema
s'
,
'lib'
)
schema_lib
=
os
.
path
.
join
(
toolPath
,
'
data_model
s'
,
'lib'
)
# AUX FUNCTIONS ---------------------------------------------------------------
def
ERV
(
TextParser
,
read_sections_list
,
schema
,
code_tables_path
):
"""
Extracts, reads and validates the input data.
Parameters
----------
TextParser : list or pandas.io.parsers.TextFileReader
The data to extract and read
read_sections_list : list
List with subset of data model sections to output
schema : dict
Data model schema
code_tables_path : str
Path to data model code tables
Returns
-------
data : pandas.DataFrame, pandas.io.parsers.TextFileReader
Contains the input data extracted and read
valid : pandas.DataFrame, pandas.io.parsers.TextFileReader
Contains a boolean mask with the data validation output
"""
data_buffer
=
StringIO
()
valid_buffer
=
StringIO
()
...
...
@@ -54,7 +80,7 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# may vary if gaps, keep track of data types: add Intxx pandas classes rather than intxx to avoid this!
# Sections are parsed in the same order as sections_df.columns
[
data_df
,
valid_df
,
out_dtypesi
]
=
read_sections
.
read_sections
(
sections_df
,
schema
)
[
data_df
,
valid_df
,
out_dtypesi
]
=
read_sections
(
sections_df
,
schema
)
if
i_chunk
==
0
:
out_dtypes
=
copy
.
deepcopy
(
out_dtypesi
)
...
...
@@ -94,6 +120,22 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
return
data
,
valid
def
validate_arg
(
arg_name
,
arg_value
,
arg_type
):
"""
Validates that the input argument is of the expected type
Parameters
----------
arg_name : str
arg_value : arg_type
arg_type : python type
Returns
-------
True,False
"""
if
arg_value
and
not
isinstance
(
arg_value
,
arg_type
):
logging
.
error
(
'Argument {0} must be {1}, input type is {2}'
.
format
(
arg_name
,
arg_type
,
type
(
arg_value
)))
return
False
...
...
@@ -101,6 +143,21 @@ def validate_arg(arg_name,arg_value,arg_type):
return
True
def
validate_path
(
arg_name
,
arg_value
):
"""
Validates input argument is an existing directory
Parameters
----------
arg_name : str
arg_value : str
Returns
-------
True,False
"""
if
arg_value
and
not
os
.
path
.
isdir
(
arg_value
):
logging
.
error
(
'{0} could not find path {1}'
.
format
(
arg_name
,
arg_value
))
return
False
...
...
@@ -110,9 +167,56 @@ def validate_path(arg_name,arg_value):
# END AUX FUNCTIONS -----------------------------------------------------------
def
read
(
source
,
data_model
=
None
,
data_model_path
=
None
,
sections
=
None
,
chunksize
=
None
,
def
main
(
source
,
data_model
=
None
,
data_model_path
=
None
,
sections
=
None
,
chunksize
=
None
,
skiprows
=
None
,
out_path
=
None
):
"""
Reads a data file to a pandas DataFrame using a pre-defined data model.
Read data is validated against its data model, producing a boolean mask
on output.
The data model needs to be input to the module as a named model
(included in the module) or as the path to a valid data model.
Arguments
---------
source : str
The file path to read
Keyword Arguments
-----------------
data_model : str, optional
Name of internally available data model
data_model_path : str, optional
Path to external data model
sections : list, optional
List with subset of data model sections to output (default is
all)
chunksize : int, optional
Number of reports per chunk (default is
no chunking)
skiprows : int, optional
Number of initial rows to skip from file (default is 0)
out_path : str, optional
Path to output data, valid mask and attributes (default is
no output)
Returns
-------
output : object
Attributes data, mask and atts contain the corresponding
information from the data file.
Note
----
This module can also be run as a script, with the keyword arguments
as name_arg=arg
"""
logging
.
basicConfig
(
format
=
'%(levelname)s
\t
[%(asctime)s](%(filename)s)
\t
%(message)s'
,
level
=
logging
.
INFO
,
datefmt
=
'%Y%m%d %H:%M:%S'
,
filename
=
None
)
...
...
@@ -209,6 +313,22 @@ def read(source, data_model = None, data_model_path = None, sections = None,chun
# 5. RETURN DATA
class
output
():
""" Class to represent reader output
Attributes
----------
data : pandas.DataFrame or pandas.io.parsers.TextFileReader
the output data
atts : dict
a dictionary with the output data elements attributes
mask : pandas.DataFrame or pandas.io.parsers.TextFileReader
the output data validation mask
"""
def
__init__
(
self
):
self
.
data
=
data
self
.
atts
=
out_atts
...
...
@@ -221,4 +341,4 @@ if __name__=='__main__':
kwargs
=
dict
(
arg
.
split
(
'='
)
for
arg
in
sys
.
argv
[
2
:])
if
'sections'
in
kwargs
.
keys
():
kwargs
.
update
({
'sections'
:
[
x
.
strip
()
for
x
in
kwargs
.
get
(
'sections'
).
split
(
","
)]
})
read
(
sys
.
argv
[
1
],
**
kwargs
)
# kwargs
main
(
sys
.
argv
[
1
],
**
kwargs
)
# kwargs
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment