brivas / mdf_reader

Commit 6b4baf21, authored 5 years ago by iregon

Added docstrings

parent 64acfe65
Showing 6 changed files with 270 additions and 128 deletions
data_models/code_tables.py   +77  -22
data_models/schemas.py        +2   -2
read.py                       +1   -2
reader/get_sections.py       +61  -33
reader/import_data.py        +67  -36
reader/read_sections.py      +62  -33
data_models/code_tables.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 13 15:14:51 2018

This module has functions to manage data model
code table files and objects according to the
requirements of the data reader tool
"""
import sys
...
...
@@ -16,12 +20,6 @@ from copy import deepcopy
from pandas.io.json.normalize import nested_to_record
import ast

if sys.version_info[0] >= 3:
    py3 = True
else:
    py3 = False
#https://stackoverflow.com/questions/10756427/loop-through-all-nested-dictionary-values
#def print_nested(d):
# if isinstance(d, dict):
...
...
@@ -35,15 +33,87 @@ else:
#
# else:
# print(d)
toolPath = os.path.dirname(os.path.abspath(__file__))
table_lib = os.path.join(toolPath, 'lib')
templates_path = os.path.join(table_lib, 'templates', 'code_tables')
def read_table(table_path):
    """
    Reads a data model code table file to a dictionary.

    It completes the code table to the full complexity
    the data reader expects, by appending information
    on secondary keys and expanding range keys.

    Arguments
    ---------
    table_path : str
        The file path of the code table.

    Returns
    -------
    dict
        Code table

    """
    with open(table_path) as fileObj:
        table = json.load(fileObj)

    # Add keys for nested code tables
    keys_path = ".".join([".".join(table_path.split('.')[:-1]), 'keys'])
    if os.path.isfile(keys_path):
        with open(keys_path) as fileObj:
            table_keys = json.load(fileObj)
        table['_keys'] = {}
        for x, y in table_keys.items():
            key = eval_dict_items(x)
            values = [eval_dict_items(k) for k in y]
            table['_keys'][key] = values
    # Expand range keys
    expand_integer_range_key(table)

    return table
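A minimal usage sketch of read_table (the file names below are hypothetical; the function expects a JSON code table, with an optional sibling '.keys' file describing nested tables):

# Hypothetical layout:
#   lib/wind_speed.json  - the code table itself
#   lib/wind_speed.keys  - optional mapping of outer keys to nested tables
table = read_table('lib/wind_speed.json')
# If the .keys file existed, read_table appended the secondary keys here:
print(table.get('_keys'))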
def templates():
    """
    Lists the names of the available code table templates.

    Returns
    -------
    list
        Code table template aliases

    """
    tables = glob.glob(os.path.join(templates_path, '*.json'))
    return [os.path.basename(x).split(".")[0] for x in tables]
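For instance (the alias shown is hypothetical; the actual names depend on the *.json files shipped under lib/templates/code_tables):

print(templates())
# e.g. ['icoads']  - one alias per template file, extension stripped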
def copy_template(table, out_dir=None, out_path=None):
    """
    Copies a code table template to an output
    file or path.

    Parameters
    ----------
    table : str
        Code table template name to copy

    Keyword Arguments
    -----------------
    out_dir : str, opt
        Directory to copy the code table file template to
    out_path : str, opt
        Full filename to copy the code table file template to

    Either out_dir or out_path must be provided.

    """
    tables = templates()
    if table in tables:
        table_path = os.path.join(templates_path, table + '.json')
...
...
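A hedged usage sketch; the template alias is hypothetical and must be one of the names returned by templates():

# Copy a template so it can be edited into a custom code table;
# give either out_dir or out_path, as the docstring requires.
copy_template('icoads', out_dir='./my_code_tables')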
@@ -110,21 +180,6 @@ def eval_dict_items(item):
    except:
        return item
def read_table(table_path):
    with open(table_path) as fileObj:
        table = json.load(fileObj)
    keys_path = ".".join([".".join(table_path.split('.')[:-1]), 'keys'])
    if os.path.isfile(keys_path):
        with open(keys_path) as fileObj:
            table_keys = json.load(fileObj)
        table['_keys'] = {}
        for x, y in table_keys.items():
            key = eval_dict_items(x)
            values = [eval_dict_items(k) for k in y]
            table['_keys'][key] = values
    expand_integer_range_key(table)
    return table
def table_keys(table):
    separator = '∿'  # something hopefully not in keys...
    if table.get('_keys'):
...
...
data_models/schemas.py
...
...
@@ -209,8 +209,8 @@ def templates():
def copy_template(schema, out_dir=None, out_path=None):
    """
-   Creates a simple attribute dictionary for the elements
-   in a dataframe from its data model schema
+   Copies a schema file template to an output
+   file or path

    Parameters
    ----------
...
...
read.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Manages the integral sequence in data file reading
from a data model:
    - Access to data model
...
...
@@ -11,7 +11,6 @@ from a data model:
    - Output

Contains the following functions:
    * ERV - does the actual extraction, reading and validation of the input data
    * main - the main function of the script
...
...
reader/get_sections.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 09:38:17 2019

Splits string reports into sections using a data model layout.

Input and output are simple pandas dataframes, with the output dataframe
column names being the section names.

To work with a pandas TextParser, loop through this module.

Internally works assuming the highest complexity in the input data model:
multiple non-sequential sections.

DEV NOTES:

1) Make sure we use Series when working with Series, DataFrames otherwise,
like now:
    threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
instead of:
    threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])
On data import in import_data.py we use pd.read_fwf because it is more
general; also, supporting chunking would make converting to a Series a bit
dirty...

2) Can we extend (do we need to?) this to reading sequential sections with
no sentinels? Apparently (see td11) we are already able to do that,
provided the section is in a sequential parsing_order group.

@author: iregon

Have to document the threads approach!!!!
"""
import pandas as pd
from copy import deepcopy
...
...
@@ -200,6 +200,34 @@ def extract_sections(string_df):
# MAIN
# ---------------------------------------------------------------------------
def main(string_df, schema, read_sections):
    """
    Returns a pandas dataframe with a report per row
    and the report sections split along the columns.

    Each section is a block string and only the sections
    listed in the read_sections parameter are output.

    Parameters
    ----------
    string_df : pandas.DataFrame
        Pandas dataframe with a unique column with
        the reports as a block string
    schema : dict
        Data source data model schema
    read_sections : list
        Sections to output from the complete report

    Returns
    -------
    pandas.DataFrame
        Dataframe with the report sections split
        along the columns.

    """
    global sentinals, section_lens, sentinals_lens
    global parsing_order
    # Proceed to split sections if more than one
...
...
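A hedged sketch of driving this entry point (the report content and section names are hypothetical, and 'schema' is assumed to be the data model schema dict loaded upstream from data_models/schemas.py):

import pandas as pd

# One report per row, a single block-string column (content hypothetical)
string_df = pd.DataFrame(['core-part supp-part'])
sections_df = main(string_df, schema, read_sections=['core', 'supplemental'])
# sections_df now has one column per requested section, as block strings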
reader/import_data.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:17:43 2020

FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS:
AN ITERABLE WITH DATAFRAMES

INPUT IS NOW ONLY A FILE PATH. COULD OPTIONALLY GET OTHER TYPE OBJECTS...

OUTPUT IS AN ITERABLE, DEPENDING ON CHUNKSIZE BEING SET:
    - a single dataframe in a list
    - a pd.io.parsers.TextFileReader

WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) IN ONE LINE

The delimiter="\t" option in pandas.read_fwf prevents white space at the
tails from being stripped.

@author: iregon

OPTIONS IN OLD DEVELOPMENT:

1. DLMT: delimiter = ',' default
    names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
    missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
    TextParser = pd.read_csv(source, header=None, delimiter=delimiter, encoding='utf-8',
                             dtype='object', skip_blank_lines=False, chunksize=chunksize,
                             skiprows=skiprows, names=names, na_values=missing)

2. FWF: delimiter = '\t' so that it reads blanks as blanks, otherwise reads as empty: NaN.
    This applies mainly when reading elements from sections, but we leave it also here.
    TextParser = pd.read_fwf(source, widths=[FULL_WIDTH], header=None, skiprows=skiprows,
                             delimiter="\t", chunksize=chunksize)
"""
import pandas as pd
import os
...
...
@@ -42,7 +42,38 @@ import os
from .. import properties
def main(source, chunksize=None, skiprows=None):
    """
    Returns an iterable object with a pandas dataframe from
    an input data source. The pandas dataframe has a report
    per row and a single column with the full report as a
    block string.

    Currently only supports a data file path as source data,
    but could be easily extended to accept a different
    source object.

    Parameters
    ----------
    source : str
        Path to data file

    Keyword Arguments
    -----------------
    chunksize : int, opt
        Number of lines to chunk the input data into
    skiprows : int, opt
        Number of lines to skip from input file

    Returns
    -------
    iterable
        List with a single pandas dataframe
        or pandas.io.parsers.TextFileReader

    """
    if os.path.isfile(source):
        TextParser = pd.read_fwf(source, widths=[properties.MAX_FULL_REPORT_WIDTH],
                                 header=None, delimiter="\t", skiprows=skiprows,
                                 chunksize=chunksize)
        if not chunksize:
...
...
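A hedged usage sketch (the file name is hypothetical):

# With chunksize set: a TextFileReader to iterate over
for string_df in main('source.imma', chunksize=10000, skiprows=0):
    print(len(string_df))      # reports in this chunk, one per row

# Without chunksize: a list holding a single dataframe
string_df = main('source.imma')[0]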
reader/read_sections.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:17:43 2020

Extracts and reads (decodes, scales, etc...) the elements of data sections.
Each column of the input dataframe is a section with all its elements stored
as a single string.

Working on a section by section basis, this module uses the data model
information provided in the schema to split the elements, decode and scale
them where appropriate and ensure their data type consistency.

Output is a dataframe with columns as follows, depending on the data model
structure:
    1) Data model with sections (1 or more): [(section0,element0), ..., (sectionN,elementM)]
    2) Data model with no sections: [element0, ..., elementM]

DEV NOTES:

1) The 'quoted' issue, in version 1.0:
    # Writing options from quoting on to prevent supp buoy data to be quoted:
    # maybe this happened because buoy data has commas, and pandas makes its own decision about
    # how to write that.....
    # https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
    # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
    sections_df[section].to_csv(section_buffer, header=False, encoding='utf-8', index=False,
                                quoting=csv.QUOTE_NONE, escapechar="\\", sep="\t")
But we were still experiencing problems when reading fully empty sections; now
we only write to the section buffer reports that are not empty. We afterwards
recover the indexes....

@author: iregon
"""
import pandas as pd
from io import StringIO as StringIO
...
...
@@ -80,7 +80,36 @@ def read_data(section_df,section_schema):
    return section_df, section_valid
def main(sections_df, schema):
    """
    Returns the elements of the report sections split along
    the columns, decoded, scaled and validated according to
    the data model schema.

    Parameters
    ----------
    sections_df : pandas.DataFrame
        Pandas dataframe with a column per report section.
        The sections are in the columns as block strings.
    schema : dict
        Data source data model schema

    Returns
    -------
    data : pandas.DataFrame
        Dataframe with the report section elements split
        along the columns. MultiIndexed (section, element)
        if the data model has sections, regular index
        otherwise.
    mask : pandas.DataFrame
        Dataframe with the validation mask of the report
        section elements, with the same column layout as
        data.
    dtypes : dict
        Dictionary with pandas data types for each of the
        output elements

    """
    multiindex = True if len(sections_df.columns) > 1 or sections_df.columns[0] != properties.dummy_level else False
    data_df = pd.DataFrame(index=sections_df.index)
    valid_df = pd.DataFrame(index=sections_df.index)
...
...
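Putting the three reader stages together, a hedged end-to-end sketch (the source path is hypothetical, 'schema' is assumed loaded from the data model, and the section list is illustrative):

from reader import import_data, get_sections, read_sections

for string_df in import_data.main('source.imma', chunksize=10000):
    sections_df = get_sections.main(string_df, schema, ['core', 'supplemental'])
    data, mask, dtypes = read_sections.main(sections_df, schema)
    # data: decoded elements; mask: validation flags; dtypes: column dtypes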