Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
brivas
mdf_reader
Commits
6b4baf21
Commit
6b4baf21
authored
5 years ago
by
iregon
Browse files
Options
Download
Email Patches
Plain Diff
Added docstrings
parent
64acfe65
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
270 additions
and
128 deletions
+270
-128
data_models/code_tables.py
data_models/code_tables.py
+77
-22
data_models/schemas.py
data_models/schemas.py
+2
-2
read.py
read.py
+1
-2
reader/get_sections.py
reader/get_sections.py
+61
-33
reader/import_data.py
reader/import_data.py
+67
-36
reader/read_sections.py
reader/read_sections.py
+62
-33
No files found.
data_models/code_tables.py
View file @
6b4baf21
#!/usr/bin/env python3
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
"""
"""
Created on Thu Sep 13 15:14:51 2018
This module has functions to manage data model
code table files and objects according to the
requirements of the data reader tool
"""
"""
import
sys
import
sys
...
@@ -16,12 +20,6 @@ from copy import deepcopy
...
@@ -16,12 +20,6 @@ from copy import deepcopy
from
pandas.io.json.normalize
import
nested_to_record
from
pandas.io.json.normalize
import
nested_to_record
import
ast
import
ast
if
sys
.
version_info
[
0
]
>=
3
:
py3
=
True
else
:
py3
=
False
#https://stackoverflow.com/questions/10756427/loop-through-all-nested-dictionary-values
#https://stackoverflow.com/questions/10756427/loop-through-all-nested-dictionary-values
#def print_nested(d):
#def print_nested(d):
# if isinstance(d, dict):
# if isinstance(d, dict):
...
@@ -35,15 +33,87 @@ else:
...
@@ -35,15 +33,87 @@ else:
#
#
# else:
# else:
# print(d)
# print(d)
toolPath
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
toolPath
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
table_lib
=
os
.
path
.
join
(
toolPath
,
'lib'
)
table_lib
=
os
.
path
.
join
(
toolPath
,
'lib'
)
templates_path
=
os
.
path
.
join
(
table_lib
,
'templates'
,
'code_tables'
)
templates_path
=
os
.
path
.
join
(
table_lib
,
'templates'
,
'code_tables'
)
def
read_table
(
table_path
):
"""
Reads a data model code table file to a dictionary.
It completes the code table to the full complexity
the data reader expects, by appending information
on secondary keys and expanding range keys.
Arguments
---------
table_path : str
The file path of the code table.
Returns
-------
dict
Code table
"""
with
open
(
table_path
)
as
fileObj
:
table
=
json
.
load
(
fileObj
)
# Add keys for nested code tables
keys_path
=
"."
.
join
([
"."
.
join
(
table_path
.
split
(
'.'
)[:
-
1
]),
'keys'
])
if
os
.
path
.
isfile
(
keys_path
):
with
open
(
keys_path
)
as
fileObj
:
table_keys
=
json
.
load
(
fileObj
)
table
[
'_keys'
]
=
{}
for
x
,
y
in
table_keys
.
items
():
key
=
eval_dict_items
(
x
)
values
=
[
eval_dict_items
(
k
)
for
k
in
y
]
table
[
'_keys'
][
key
]
=
values
# Expand range keys
expand_integer_range_key
(
table
)
return
table
def
templates
():
def
templates
():
"""
Lists the name of the available code table templates
Returns
-------
list
Code table template aliases
"""
tables
=
glob
.
glob
(
os
.
path
.
join
(
templates_path
,
'*.json'
))
tables
=
glob
.
glob
(
os
.
path
.
join
(
templates_path
,
'*.json'
))
return
[
os
.
path
.
basename
(
x
).
split
(
"."
)[
0
]
for
x
in
tables
]
return
[
os
.
path
.
basename
(
x
).
split
(
"."
)[
0
]
for
x
in
tables
]
def
copy_template
(
table
,
out_dir
=
None
,
out_path
=
None
):
def
copy_template
(
table
,
out_dir
=
None
,
out_path
=
None
):
"""
Copies a code table template to an output
file or path
Parameters
----------
table : str
Code table template name to copy
Keyword Arguments
-----------------
out_dir : str, opt
Directory to copy the code table file template to
out_path : str, opt
Full filename to copy the code table file template to
Either out_dir or out_path must be provided
"""
tables
=
templates
()
tables
=
templates
()
if
table
in
tables
:
if
table
in
tables
:
table_path
=
os
.
path
.
join
(
templates_path
,
table
+
'.json'
)
table_path
=
os
.
path
.
join
(
templates_path
,
table
+
'.json'
)
...
@@ -110,21 +180,6 @@ def eval_dict_items(item):
...
@@ -110,21 +180,6 @@ def eval_dict_items(item):
except
:
except
:
return
item
return
item
def
read_table
(
table_path
):
with
open
(
table_path
)
as
fileObj
:
table
=
json
.
load
(
fileObj
)
keys_path
=
"."
.
join
([
"."
.
join
(
table_path
.
split
(
'.'
)[:
-
1
]),
'keys'
])
if
os
.
path
.
isfile
(
keys_path
):
with
open
(
keys_path
)
as
fileObj
:
table_keys
=
json
.
load
(
fileObj
)
table
[
'_keys'
]
=
{}
for
x
,
y
in
table_keys
.
items
():
key
=
eval_dict_items
(
x
)
values
=
[
eval_dict_items
(
k
)
for
k
in
y
]
table
[
'_keys'
][
key
]
=
values
expand_integer_range_key
(
table
)
return
table
def
table_keys
(
table
):
def
table_keys
(
table
):
separator
=
'∿'
# something hopefully not in keys...
separator
=
'∿'
# something hopefully not in keys...
if
table
.
get
(
'_keys'
):
if
table
.
get
(
'_keys'
):
...
...
This diff is collapsed.
Click to expand it.
data_models/schemas.py
View file @
6b4baf21
...
@@ -209,8 +209,8 @@ def templates():
...
@@ -209,8 +209,8 @@ def templates():
def
copy_template
(
schema
,
out_dir
=
None
,
out_path
=
None
):
def
copy_template
(
schema
,
out_dir
=
None
,
out_path
=
None
):
"""
"""
Creates a simple attribute dictionary for the elements
Copies a schema file template to an output
in a dataframe from its data model schema
file or path
Parameters
Parameters
----------
----------
...
...
This diff is collapsed.
Click to expand it.
read.py
View file @
6b4baf21
#!/usr/bin/env python3
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
"""
"""
Manages the integral sequence in data file reading
Manages the integral sequence in data file reading
from a data model:
from a data model:
- Access to data model
- Access to data model
...
@@ -11,7 +11,6 @@ from a data model:
...
@@ -11,7 +11,6 @@ from a data model:
- Output
- Output
Contains the following functions:
Contains the following functions:
* ERV - does the actual extraction, reading and validation of the input data
* ERV - does the actual extraction, reading and validation of the input data
* main - the main function of the script
* main - the main function of the script
...
...
This diff is collapsed.
Click to expand it.
reader/get_sections.py
View file @
6b4baf21
#!/usr/bin/env python3
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
"""
#
"""
Created on Tue Apr 30 09:38:17 2019
#
Created on Tue Apr 30 09:38:17 2019
#
Splits string reports in sections using a data model layout.
#
Splits string reports in sections using a data model layout.
#
Input and output are simple pandas dataframes, with the output dataframe
#
Input and output are simple pandas dataframes, with the output dataframe
column names being the section names
#
column names being the section names
#
To work with a pandas TextParser, loop through this module.
#
To work with a pandas TextParser, loop through this module.
#
Internally works assuming highest complexity in the input data model:
#
Internally works assuming highest complexity in the input data model:
multiple non sequential sections
#
multiple non sequential sections
#
DEV NOTES:
#
DEV NOTES:
#
1) make sure we use Series when working with Series, DataFrames otherwise...
#
1) make sure we use Series when working with Series, DataFrames otherwise...
like now:
#
like now:
threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
#
threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
instead of:
#
instead of:
threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])
#
threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])
#
on data import in import_data.py, we use pd.read_fwf because is more general
#
on data import in import_data.py, we use pd.read_fwf because is more general
use, also support to chunking would make converting to series a bit dirty...
#
use, also support to chunking would make converting to series a bit dirty...
#
2) Can we extend (do we need to?) this to reading sequential sections with
#
2) Can we extend (do we need to?) this to reading sequential sections with
no sentinals? apparently (see td11) we are already able to do that:
#
no sentinals? apparently (see td11) we are already able to do that:
provided the section is in a sequential parsing_order group
#
provided the section is in a sequential parsing_order group
#
@author: iregon
#
@author: iregon
#
Have to document the threads approach!!!!
#
Have to document the threads approach!!!!
#
"""
#
"""
import
pandas
as
pd
import
pandas
as
pd
from
copy
import
deepcopy
from
copy
import
deepcopy
...
@@ -200,6 +200,34 @@ def extract_sections(string_df):
...
@@ -200,6 +200,34 @@ def extract_sections(string_df):
# MAIN
# MAIN
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
def
main
(
string_df
,
schema
,
read_sections
):
def
main
(
string_df
,
schema
,
read_sections
):
"""
Returns a pandas dataframe with a report per row
and the report sections split along the columns.
Each section is a block string and only the sections
listed in read_sections parameter are output.
Parameters
----------
string_df : pandas.DataFrame
Pandas dataframe with a unique column with
the reports as a block string
schema : dict
Data source data model schema
read_sections : list
Sections to output from the complete report
Returns
-------
pandas.DataFrame
Dataframe with the report sections split
along the columns.
"""
global
sentinals
,
section_lens
,
sentinals_lens
global
sentinals
,
section_lens
,
sentinals_lens
global
parsing_order
global
parsing_order
# Proceed to split sections if more than one
# Proceed to split sections if more than one
...
...
This diff is collapsed.
Click to expand it.
reader/import_data.py
View file @
6b4baf21
#!/usr/bin/env python3
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
"""
#
"""
Created on Fri Jan 10 13:17:43 2020
#
Created on Fri Jan 10 13:17:43 2020
#
FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS:
#
FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS:
AN ITERABLE WITH DATAFRAMES
#
AN ITERABLE WITH DATAFRAMES
#
INPUT IS NOW ONLY A FILE PATH. COULD OPTIONALLY GET OTHER TYPE OBJECTS...
#
INPUT IS NOW ONLY A FILE PATH. COULD OPTIONALLY GET OTHER TYPE OBJECTS...
#
OUTPUT IS AN ITERABLE, DEPENDING ON CHUNKSIZE BEING SET:
#
OUTPUT IS AN ITERABLE, DEPENDING ON CHUNKSIZE BEING SET:
- a single dataframe in a list
#
- a single dataframe in a list
- a pd.io.parsers.textfilereader
#
- a pd.io.parsers.textfilereader
#
#
WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) IN ONE LINE
#
WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) IN ONE LINE
#
delimiter="
\t
" option in pandas.read_fwf avoids white spaces at tails
#
delimiter="\t" option in pandas.read_fwf avoids white spaces at tails
to be stripped
#
to be stripped
#
@author: iregon
#
@author: iregon
#
#
#
OPTIONS IN OLD DEVELOPMENT:
#
OPTIONS IN OLD DEVELOPMENT:
1. DLMT: delimiter = ',' default
#
1. DLMT: delimiter = ',' default
names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
#
names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
#
missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8',
#
TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8',
dtype = 'object', skip_blank_lines = False, chunksize = chunksize,
#
dtype = 'object', skip_blank_lines = False, chunksize = chunksize,
skiprows = skiprows, names = names, na_values = missing)
#
skiprows = skiprows, names = names, na_values = missing)
#
2. FWF:# delimiter = '
\t
' so that it reads blanks as blanks, otherwise reads as empty: NaN
#
2. FWF:# delimiter = '\t' so that it reads blanks as blanks, otherwise reads as empty: NaN
this applies mainly when reading elements from sections, but we leave it also here
#
this applies mainly when reading elements from sections, but we leave it also here
TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="
\t
", chunksize = chunksize)
#
TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="\t", chunksize = chunksize)
#
"""
#
"""
import
pandas
as
pd
import
pandas
as
pd
import
os
import
os
...
@@ -42,7 +42,38 @@ import os
...
@@ -42,7 +42,38 @@ import os
from
..
import
properties
from
..
import
properties
def
main
(
source
,
chunksize
=
None
,
skiprows
=
None
):
def
main
(
source
,
chunksize
=
None
,
skiprows
=
None
):
"""
Returns an iterable object with a pandas dataframe from
an input data source. The pandas dataframe has a report
per row and a single column with the full report as a
block string.
Currently only supports a data file path as source data,
but could be easily extended to accept a different
source object.
Parameters
----------
source : str
Path to data file
Keyword Arguments
-----------------
chunksize : int, opt
Number of lines to chunk the input data into
skiprows : int, opt
Number of lines to skip from input file
Returns
-------
iterable
List with a single pandas dataframe
or a pandas.io.parsers.TextFileReader
"""
if
os
.
path
.
isfile
(
source
):
if
os
.
path
.
isfile
(
source
):
TextParser
=
pd
.
read_fwf
(
source
,
widths
=
[
properties
.
MAX_FULL_REPORT_WIDTH
],
header
=
None
,
delimiter
=
"
\t
"
,
skiprows
=
skiprows
,
chunksize
=
chunksize
)
TextParser
=
pd
.
read_fwf
(
source
,
widths
=
[
properties
.
MAX_FULL_REPORT_WIDTH
],
header
=
None
,
delimiter
=
"
\t
"
,
skiprows
=
skiprows
,
chunksize
=
chunksize
)
if
not
chunksize
:
if
not
chunksize
:
...
...
This diff is collapsed.
Click to expand it.
reader/read_sections.py
View file @
6b4baf21
#!/usr/bin/env python3
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
"""
#
"""
Created on Fri Jan 10 13:17:43 2020
#
Created on Fri Jan 10 13:17:43 2020
#
Extracts and reads (decodes, scales, etc...) the elements of data sections.
#
Extracts and reads (decodes, scales, etc...) the elements of data sections.
Each column of the input dataframe is a section with all its elements stored
#
Each column of the input dataframe is a section with all its elements stored
as a single string.
#
as a single string.
#
Working on a section by section basis, this module uses the data model
#
Working on a section by section basis, this module uses the data model
information provided in the schema to split the elements, decode and scale them
#
information provided in the schema to split the elements, decode and scale them
where appropriate and ensure its data type consistency.
#
where appropriate and ensure its data type consistency.
#
Output is a dataframe with columns as follows depending on the data model
#
Output is a dataframe with columns as follows depending on the data model
structure:
#
structure:
1) Data model with sections (1 or more): [(section0,element0),.......(sectionN,elementM)]
#
1) Data model with sections (1 or more): [(section0,element0),.......(sectionN,elementM)]
2) Data model with no sections[element0...element1]
#
2) Data model with no sections[element0...element1]
#
#
DEV NOTES:
#
DEV NOTES:
1) the 'quoted' issue: in version 1.0:
#
1) the 'quoted' issue: in version 1.0:
# Writing options from quoting on to prevent supp buoy data to be quoted:
#
# Writing options from quoting on to prevent supp buoy data to be quoted:
# maybe this happened because buoy data has commas, and pandas makes its own decision about
#
# maybe this happened because buoy data has commas, and pandas makes its own decision about
# how to write that.....
#
# how to write that.....
#https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
#
#https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
# quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
#
# quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="
\\
",sep="
\t
")
#
sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t")
#
But we were still experiencing problems when reading fully empty sections, now
#
But we were still experiencing problems when reading fully empty sections, now
we only write to the section buffer reports that are not empty. We afterwards
#
we only write to the section buffer reports that are not empty. We afterwards
recover the indexes....
#
recover the indexes....
#
@author: iregon
#
@author: iregon
"""
#
"""
import
pandas
as
pd
import
pandas
as
pd
from
io
import
StringIO
as
StringIO
from
io
import
StringIO
as
StringIO
...
@@ -80,7 +80,36 @@ def read_data(section_df,section_schema):
...
@@ -80,7 +80,36 @@ def read_data(section_df,section_schema):
return
section_df
,
section_valid
return
section_df
,
section_valid
def
main
(
sections_df
,
schema
):
def
main
(
sections_df
,
schema
):
"""
Extracts and reads (decodes, scales, etc.) the elements
of the report sections, using the data model schema.
Each column of the input dataframe is a section stored as
a block string; the elements are split along the output columns.
Parameters
----------
sections_df : pandas.DataFrame
Pandas dataframe with a column per report section.
The sections are stored in the columns as block strings.
schema : dict
Data source data model schema
Returns
-------
data : pandas.DataFrame
Dataframe with the report section elements split
along the columns. Column multiindex (section, element)
if the data model has sections, regular column index
if it has no sections
mask : pandas.DataFrame
Dataframe with the validation mask of the report
section elements, presumably with the same column
layout as data (multiindex or regular index)
dtypes : dict
Dictionary with pandas data types for each of the
output elements
"""
multiindex
=
True
if
len
(
sections_df
.
columns
)
>
1
or
sections_df
.
columns
[
0
]
!=
properties
.
dummy_level
else
False
multiindex
=
True
if
len
(
sections_df
.
columns
)
>
1
or
sections_df
.
columns
[
0
]
!=
properties
.
dummy_level
else
False
data_df
=
pd
.
DataFrame
(
index
=
sections_df
.
index
)
data_df
=
pd
.
DataFrame
(
index
=
sections_df
.
index
)
valid_df
=
pd
.
DataFrame
(
index
=
sections_df
.
index
)
valid_df
=
pd
.
DataFrame
(
index
=
sections_df
.
index
)
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment