brivas / mdf_reader · Commits · 8cf8e0d8

Commit 8cf8e0d8, authored 5 years ago by iregon

    Quoting on special characters fixed

Parent: 28ac12af
Showing 3 changed files, with 16 additions and 9 deletions (+16 −9):

  properties.py            +6 −0
  read.py                  +6 −6
  reader/read_sections.py  +4 −3
properties.py

@@ -51,3 +51,9 @@ tol = 1E-10
 dummy_level = '_SECTION_'
 # Length of reports in initial read
 MAX_FULL_REPORT_WIDTH = 100000
+# This is a delimiter used internally when writing to buffers.
+# It is the Unicode character 'END OF TEXT'.
+# It is supposed to be safe because we don't expect it in a string,
+# and its UTF-8 encoding length is not > 1, so it is supported by the pandas 'c'
+# engine, which is faster than the python engine.
+internal_delimiter = u"\u0003"
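The choice of U+0003 can be sanity-checked outside the reader. The following is a minimal standalone sketch (not repository code; the sample fields are invented) showing that the END OF TEXT character encodes to a single UTF-8 byte and is accepted as a separator by the pandas 'c' engine:

    import io
    import pandas as pd

    internal_delimiter = u"\u0003"  # Unicode character 'END OF TEXT'

    # A one-byte UTF-8 encoding keeps the separator compatible with the fast 'c' engine
    assert len(internal_delimiter.encode('utf-8')) == 1

    buffer = io.StringIO()
    buffer.write(internal_delimiter.join(['a,b', 'c,d', 'plain']) + '\n')
    buffer.seek(0)

    # The 'c' engine accepts the single-character separator; commas inside the
    # fields are ordinary data and are not split
    df = pd.read_csv(buffer, sep=internal_delimiter, header=None, engine='c')
    print(df.iloc[0].tolist())  # ['a,b', 'c,d', 'plain']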
read.py

@@ -26,6 +26,7 @@ import logging
 import json
 import copy
 from io import StringIO as StringIO
+import csv
 from .data_models import schemas
 from . import properties

@@ -77,7 +78,6 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
 #    - columns(sections) order as in read_sections_list
 sections_df = get_sections.main(string_df, schema, read_sections_list)
 # 2. Read elements from sections
 # Along data chunks, resulting data types
 # may vary if gaps, keep track of data dtypes: v1.0

@@ -85,13 +85,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
 # Sections are parsed in the same order as sections_df.columns
 [data_df, valid_df, out_dtypes] = read_sections.main(sections_df, schema)
 # 3. Validate data elements
 valid_df = validate.validate(data_df, valid_df, schema, code_tables_path)
 # 4. Save to buffer
-data_df.to_csv(data_buffer, header=False, mode='a', encoding='utf-8', index=False)
+# Writing options from 'quoting' onwards prevent fields with special characters, like commas, from being quoted
+# https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
+data_df.to_csv(data_buffer, header=False, mode='a', encoding='utf-8', index=False, quoting=csv.QUOTE_NONE, escapechar='\\', sep=properties.internal_delimiter)
 valid_df.to_csv(valid_buffer, header=False, mode='a', encoding='utf-8', index=False)
 # Create the output

@@ -112,7 +112,7 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
 date_columns.append(i)
 out_dtypes.update({element: 'object'})
-data = pd.read_csv(data_buffer, names=data_df.columns, chunksize=chunksize, dtype=out_dtypes, parse_dates=date_columns)
+data = pd.read_csv(data_buffer, names=data_df.columns, chunksize=chunksize, dtype=out_dtypes, parse_dates=date_columns, delimiter=properties.internal_delimiter)
 valid = pd.read_csv(valid_buffer, names=data_df.columns, chunksize=chunksize)
 return data, valid
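To see what the new writing options buy, here is a minimal, self-contained sketch of the buffer round trip (the DataFrame content is invented; only the to_csv/read_csv options mirror the diff). With quoting=csv.QUOTE_NONE, an escape character, and the internal END OF TEXT separator, fields containing commas are written unquoted and read back verbatim with the same delimiter:

    import csv
    import io
    import pandas as pd

    internal_delimiter = u"\u0003"  # mirrors properties.internal_delimiter

    # Invented chunk: the 'remark' values contain commas that default to_csv would quote
    data_df = pd.DataFrame({'id': ['A1', 'B2'],
                            'remark': ['wind 10, gusting 20', 'pressure 1013, rising']})

    data_buffer = io.StringIO()
    # Write unquoted, using the internal delimiter instead of ','
    data_df.to_csv(data_buffer, header=False, mode='a', encoding='utf-8', index=False,
                   quoting=csv.QUOTE_NONE, escapechar='\\', sep=internal_delimiter)
    data_buffer.seek(0)

    # Read back with the same delimiter; the commas survive untouched
    data = pd.read_csv(data_buffer, names=data_df.columns, delimiter=internal_delimiter)
    print(data['remark'].tolist())  # ['wind 10, gusting 20', 'pressure 1013, rising']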
reader/read_sections.py

@@ -35,6 +35,7 @@
 import pandas as pd
 from io import StringIO as StringIO
+import csv
 from .. import properties
 from ..common.converters import converters

@@ -47,7 +48,7 @@ def extract_fixed_width(section_serie_bf,section_schema):
 section_missing = {i: section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True
                    else [section_schema['elements'][i].get('missing_value'), " "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)]
                    for i in section_names}
-section_elements = pd.read_fwf(section_serie_bf, widths=section_widths, header=None, names=section_names, na_values=section_missing, delimiter="\t", encoding='utf-8', dtype='object', skip_blank_lines=False)
+section_elements = pd.read_fwf(section_serie_bf, widths=section_widths, header=None, names=section_names, na_values=section_missing, encoding='utf-8', dtype='object', skip_blank_lines=False)
 return section_elements

 def extract_delimited(section_serie_bf, section_schema):

@@ -71,10 +72,10 @@ def read_data(section_df,section_schema):
 missing = section_df[element].isna()
 if element in encoded:
     section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element])
 kwargs = {converter_arg: section_schema['elements'][element].get(converter_arg) for converter_arg in properties.data_type_conversion_args.get(section_dtypes.get(element))}
 section_df[element] = converters.get(section_dtypes.get(element))(section_df[element], **kwargs)
 section_valid[element] = missing | section_df[element].notna()
 return section_df, section_valid

@@ -133,7 +134,7 @@ def main(sections_df, schema):
 # Only pass records with data to avoid the hassle of dealing with
 # how the NaN rows are written and then read!
 notna_idx = sections_df[sections_df[section].notna()].index
-sections_df[section].loc[notna_idx].to_csv(section_buffer, header=False, encoding='utf-8', index=False)
+sections_df[section].loc[notna_idx].to_csv(section_buffer, header=False, encoding='utf-8', index=False, quoting=csv.QUOTE_NONE, escapechar='\\', sep=properties.internal_delimiter)
 ssshh = section_buffer.seek(0)
 # Get the individual elements as objects
 if field_layout == 'fixed_width':
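For context, a rough standalone sketch of what the extract_fixed_width call does with these options (the section schema, element names, and widths are invented): each element gets its declared missing_value, plus an all-blank field, as per-column na_values, and the section buffer is parsed with pd.read_fwf into object dtype so later converters decide the final types:

    import io
    import pandas as pd

    # Invented two-element section: a 2-character year code and a 3-character deck code
    section_widths = [2, 3]
    section_names = ['YR', 'DCK']
    # Per-element missing values: the declared missing_value plus an all-blank field
    section_missing = {'YR': ['99', '  '], 'DCK': ['999', '   ']}

    section_buffer = io.StringIO('99714\n20   \n')

    # Elements are read as objects; missing values become NaN
    section_elements = pd.read_fwf(section_buffer, widths=section_widths, header=None,
                                   names=section_names, na_values=section_missing,
                                   encoding='utf-8', dtype='object', skip_blank_lines=False)
    print(section_elements['DCK'].tolist())  # ['714', nan]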