Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
brivas
mdf_reader
Commits
c0251df1
Commit
c0251df1
authored
5 years ago
by
iregon
Browse files
Options
Download
Email Patches
Plain Diff
Main functions as main in reader modules
parent
d510abd4
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
34 additions
and
33 deletions
+34
-33
read.py
read.py
+6
-6
reader/get_sections.py
reader/get_sections.py
+1
-1
reader/import_data.py
reader/import_data.py
+1
-1
reader/read_sections.py
reader/read_sections.py
+26
-25
No files found.
read.py
View file @
c0251df1
...
...
@@ -31,9 +31,9 @@ from io import StringIO as StringIO
from
.data_models
import
schemas
from
.
import
properties
from
.common
import
pandas_TextParser_hdlr
from
.reader
import
import_data
from
.reader
import
get_sections
from
.
reader.read
_sections
import
main
as
read_sections
#
from .reader import import_data
#
from .reader import get_sections
from
mdf_
reader.read
er
import
import_data
,
get_sections
,
read_sections
from
.validator
import
validate
toolPath
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
...
...
@@ -77,13 +77,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# - requested NA sections as NaN columns
# - columns(sections) order as in read_sections_list
sections_df
=
get_sections
.
get_sections
(
string_df
,
schema
,
read_sections_list
)
sections_df
=
get_sections
.
main
(
string_df
,
schema
,
read_sections_list
)
# 2. Read elements from sections: along data chunks, resulting data types
# may vary if gaps, keep track of data types: add Intxx pandas classes rather than intxx to avoid this!
# Sections are parsed in the same order as sections_df.columns
[
data_df
,
valid_df
,
out_dtypesi
]
=
read_sections
(
sections_df
,
schema
)
[
data_df
,
valid_df
,
out_dtypesi
]
=
read_sections
.
main
(
sections_df
,
schema
)
if
i_chunk
==
0
:
out_dtypes
=
copy
.
deepcopy
(
out_dtypesi
)
...
...
@@ -271,7 +271,7 @@ def main(source, data_model = None, data_model_path = None, sections = None,chun
# 2.2 Homogeneize input data to an iterable with dataframes:
# a list with a single dataframe or a pd.io.parsers.TextFileReader
logging
.
info
(
"Getting data string from source..."
)
TextParser
=
import_data
.
import_data
(
source
,
chunksize
=
chunksize
,
skiprows
=
skiprows
)
TextParser
=
import_data
.
main
(
source
,
chunksize
=
chunksize
,
skiprows
=
skiprows
)
# 2.3. Extract, read and validate data in same loop
logging
.
info
(
"Extracting and reading sections"
)
...
...
This diff is collapsed.
Click to expand it.
reader/get_sections.py
View file @
c0251df1
...
...
@@ -199,7 +199,7 @@ def extract_sections(string_df):
# ---------------------------------------------------------------------------
# MAIN
# ---------------------------------------------------------------------------
def
get_sections
(
string_df
,
schema
,
read_sections
):
def
main
(
string_df
,
schema
,
read_sections
):
global
sentinals
,
section_lens
,
sentinals_lens
global
parsing_order
# Proceed to split sections if more than one
...
...
This diff is collapsed.
Click to expand it.
reader/import_data.py
View file @
c0251df1
...
...
@@ -41,7 +41,7 @@ import os
from
..
import
properties
def
import_data
(
source
,
chunksize
=
None
,
skiprows
=
None
):
def
main
(
source
,
chunksize
=
None
,
skiprows
=
None
):
if
os
.
path
.
isfile
(
source
):
TextParser
=
pd
.
read_fwf
(
source
,
widths
=
[
properties
.
MAX_FULL_REPORT_WIDTH
],
header
=
None
,
delimiter
=
"
\t
"
,
skiprows
=
skiprows
,
chunksize
=
chunksize
)
...
...
This diff is collapsed.
Click to expand it.
reader/read_sections.py
View file @
c0251df1
...
...
@@ -7,7 +7,7 @@ Extracts and reads (decodes, scales, etc...) the elements of data sections.
Each column of the input dataframe is a section with all its elements stored
as a single string.
Working on a section by section basis, this module uses the data model
Working on a section by section basis, this module uses the data model
information provided in the schema to split the elements, decode and scale them
where appropriate and ensure its data type consistency.
...
...
@@ -24,12 +24,12 @@ DEV NOTES:
# how to write that.....
#https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
# quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="
\\
",sep="
\t
")
sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="
\\
",sep="
\t
")
But we were still experiencing problems when reading fully empty sections, now
we only write to the section buffer reports that are not empty. We afterwards
recover the indexes....
@author: iregon
"""
...
...
@@ -50,24 +50,25 @@ def extract_fixed_width(section_serie_bf,section_schema):
section_elements
=
pd
.
read_fwf
(
section_serie_bf
,
widths
=
section_widths
,
header
=
None
,
names
=
section_names
,
na_values
=
section_missing
,
delimiter
=
"
\t
"
,
encoding
=
'utf-8'
,
dtype
=
'object'
,
skip_blank_lines
=
False
)
return
section_elements
def
extract_delimited
(
section_serie_bf
,
section_schema
):
def
extract_delimited
(
section_serie_bf
,
section_schema
):
delimiter
=
section_schema
[
'header'
].
get
(
'delimiter'
)
section_names
=
section_schema
[
'elements'
].
keys
()
section_missing
=
{
x
:
section_schema
[
'elements'
][
x
].
get
(
'missing_value'
)
for
x
in
section_names
}
section_elements
=
pd
.
read_csv
(
section_serie_bf
,
header
=
None
,
delimiter
=
delimiter
,
encoding
=
'utf-8'
,
dtype
=
'object'
,
skip_blank_lines
=
False
,
names
=
section_names
,
na_values
=
section_missing
)
return
section_elements
def
read_data
(
section_df
,
section_schema
):
def
read_data
(
section_df
,
section_schema
):
section_names
=
section_df
.
columns
section_dtypes
=
{
i
:
section_schema
[
'elements'
][
i
][
'column_type'
]
for
i
in
section_names
}
encoded
=
[
(
x
)
for
x
in
section_names
if
'encoding'
in
section_schema
[
'elements'
][
x
]]
section_encoding
=
{
i
:
section_schema
[
'elements'
][
i
][
'encoding'
]
for
i
in
encoded
}
section_valid
=
pd
.
DataFrame
(
index
=
section_df
.
index
,
columns
=
section_df
.
columns
)
for
element
in
section_dtypes
.
keys
():
print
(
element
)
missing
=
section_df
[
element
].
isna
()
if
element
in
encoded
:
section_df
[
element
]
=
decoders
.
get
(
section_encoding
.
get
(
element
)).
get
(
section_dtypes
.
get
(
element
))(
section_df
[
element
])
...
...
@@ -76,29 +77,29 @@ def read_data(section_df,section_schema):
section_df
[
element
]
=
converters
.
get
(
section_dtypes
.
get
(
element
))(
section_df
[
element
],
**
kwargs
)
section_valid
[
element
]
=
missing
|
section_df
[
element
].
notna
()
return
section_df
,
section_valid
def
read_sections
(
sections_df
,
schema
):
def
main
(
sections_df
,
schema
):
multiindex
=
True
if
len
(
sections_df
.
columns
)
>
1
or
sections_df
.
columns
[
0
]
!=
properties
.
dummy_level
else
False
data_df
=
pd
.
DataFrame
(
index
=
sections_df
.
index
)
valid_df
=
pd
.
DataFrame
(
index
=
sections_df
.
index
)
out_dtypes
=
dict
()
for
section
in
sections_df
.
columns
:
for
section
in
sections_df
.
columns
:
print
(
'Reading section {}'
.
format
(
section
))
section_schema
=
schema
[
'sections'
].
get
(
section
)
disable_read
=
section_schema
.
get
(
'header'
).
get
(
'disable_read'
)
if
not
disable_read
:
if
not
disable_read
:
field_layout
=
section_schema
.
get
(
'header'
).
get
(
'field_layout'
)
ignore
=
[
i
for
i
in
section_schema
[
'elements'
].
keys
()
if
section_schema
[
'elements'
][
i
].
get
(
'ignore'
)
]
# evals to True if set and true, evals to False if not set or set and false
# Get rid of false delimiters in fixed_width
delimiter
=
section_schema
[
'header'
].
get
(
'delimiter'
)
if
delimiter
and
field_layout
==
'fixed_width'
:
sections_df
[
section
]
=
sections_df
[
section
].
str
.
replace
(
delimiter
,
''
)
section_buffer
=
StringIO
()
# Here indices are lost, have to give the real ones, those in section_strings:
# we'll see if we do that in the caller module or here....
...
...
@@ -112,9 +113,9 @@ def read_sections(sections_df, schema):
section_elements_obj
=
extract_fixed_width
(
section_buffer
,
section_schema
)
elif
field_layout
==
'delimited'
:
section_elements_obj
=
extract_delimited
(
section_buffer
,
section_schema
)
section_elements_obj
.
drop
(
ignore
,
axis
=
1
,
inplace
=
True
)
# Read the objects to their data types and apply decoding, scaling and so on...
# Give them their actual indexes back
section_elements
,
section_valid
=
read_data
(
section_elements_obj
,
section_schema
)
...
...
@@ -124,30 +125,30 @@ def read_sections(sections_df, schema):
else
:
section_elements
=
pd
.
DataFrame
(
sections_df
[
section
],
columns
=
[
section
])
section_valid
=
pd
.
DataFrame
(
index
=
section_elements
.
index
,
data
=
True
,
columns
=
[
section
])
section_elements
.
columns
=
[
(
section
,
x
)
for
x
in
section_elements
.
columns
]
if
multiindex
else
section_elements
.
columns
section_valid
.
columns
=
section_elements
.
columns
data_df
=
pd
.
concat
([
data_df
,
section_elements
],
sort
=
False
,
axis
=
1
)
valid_df
=
pd
.
concat
([
valid_df
,
section_valid
],
sort
=
False
,
axis
=
1
)
# We do the actual out_dtypes here: because the full indexing occurs only
# after concat, NaN values may arise only in data_df if a section is
# not existing in a given report!
for
section
in
sections_df
.
columns
:
section_schema
=
schema
[
'sections'
].
get
(
section
)
if
not
section_schema
.
get
(
'header'
).
get
(
'disable_read'
):
if
not
section_schema
.
get
(
'header'
).
get
(
'disable_read'
):
elements
=
[
x
[
1
]
for
x
in
data_df
.
columns
if
x
[
0
]
==
section
]
if
multiindex
:
out_dtypes
.
update
({
(
section
,
i
):
properties
.
pandas_dtypes
.
get
(
section_schema
[
'elements'
][
i
].
get
(
'column_type'
))
for
i
in
elements
}
)
out_dtypes
.
update
({
(
section
,
i
):
data_df
[(
section
,
i
)].
dtype
.
name
for
i
in
elements
if
data_df
[(
section
,
i
)].
dtype
.
name
in
properties
.
numpy_floats
})
else
:
out_dtypes
.
update
({
i
:
properties
.
pandas_dtypes
.
get
(
section_schema
[
'elements'
][
i
].
get
(
'column_type'
))
for
i
in
elements
}
)
out_dtypes
.
update
({
i
:
properties
.
pandas_dtypes
.
get
(
section_schema
[
'elements'
][
i
].
get
(
'column_type'
))
for
i
in
elements
}
)
out_dtypes
.
update
({
i
:
data_df
[
i
].
dtype
.
name
for
i
in
section_elements
if
data_df
[
i
].
dtype
.
name
in
properties
.
numpy_floats
})
else
:
if
multiindex
:
out_dtypes
.
update
({
(
section
,
section
):
'object'
}
)
else
:
out_dtypes
.
update
({
section
:
'object'
}
)
return
data_df
,
valid_df
,
out_dtypes
out_dtypes
.
update
({
section
:
'object'
}
)
return
data_df
,
valid_df
,
out_dtypes
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment