Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
brivas
mdf_reader
Commits
8cf8e0d8
Commit
8cf8e0d8
authored
5 years ago
by
iregon
Browse files
Options
Download
Email Patches
Plain Diff
Quoting on special characters fixed
parent
28ac12af
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
16 additions
and
9 deletions
+16
-9
properties.py
properties.py
+6
-0
read.py
read.py
+6
-6
reader/read_sections.py
reader/read_sections.py
+4
-3
No files found.
properties.py
View file @
8cf8e0d8
...
...
@@ -51,3 +51,9 @@ tol = 1E-10
dummy_level
=
'_SECTION_'
# Length of reports in initial read
MAX_FULL_REPORT_WIDTH
=
100000
# This is a delimiter internally used when writing to buffers
# It is the Unicode Character 'END OF TEXT'
# It is supposed to be safe because we don't expect it in a string
# Its UTF-8 encoding length is not > 1, so it is supported by pandas 'c'
# engine, which is faster than the python engine.
internal_delimiter
=
u
"
\u0003
"
This diff is collapsed.
Click to expand it.
read.py
View file @
8cf8e0d8
...
...
@@ -26,6 +26,7 @@ import logging
import
json
import
copy
from
io
import
StringIO
as
StringIO
import
csv
from
.data_models
import
schemas
from
.
import
properties
...
...
@@ -77,7 +78,6 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# - columns(sections) order as in read_sections_list
sections_df
=
get_sections
.
main
(
string_df
,
schema
,
read_sections_list
)
# 2. Read elements from sections
# Along data chunks, resulting data types
# may vary if gaps, keep track of data dtypes: v1.0
...
...
@@ -85,13 +85,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# Sections are parsed in the same order as sections_df.columns
[
data_df
,
valid_df
,
out_dtypes
]
=
read_sections
.
main
(
sections_df
,
schema
)
# 3. Validate data elements
valid_df
=
validate
.
validate
(
data_df
,
valid_df
,
schema
,
code_tables_path
)
# 4. Save to buffer
data_df
.
to_csv
(
data_buffer
,
header
=
False
,
mode
=
'a'
,
encoding
=
'utf-8'
,
index
=
False
)
# 4. Save to buffer
# The writing options from `quoting` onwards prevent data containing special characters (commas, etc.) from being quoted
#https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
data_df
.
to_csv
(
data_buffer
,
header
=
False
,
mode
=
'a'
,
encoding
=
'utf-8'
,
index
=
False
,
quoting
=
csv
.
QUOTE_NONE
,
escapechar
=
'
\\
'
,
sep
=
properties
.
internal_delimiter
)
valid_df
.
to_csv
(
valid_buffer
,
header
=
False
,
mode
=
'a'
,
encoding
=
'utf-8'
,
index
=
False
)
# Create the output
...
...
@@ -112,7 +112,7 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
date_columns
.
append
(
i
)
out_dtypes
.
update
({
element
:
'object'
})
data
=
pd
.
read_csv
(
data_buffer
,
names
=
data_df
.
columns
,
chunksize
=
chunksize
,
dtype
=
out_dtypes
,
parse_dates
=
date_columns
)
data
=
pd
.
read_csv
(
data_buffer
,
names
=
data_df
.
columns
,
chunksize
=
chunksize
,
dtype
=
out_dtypes
,
parse_dates
=
date_columns
,
delimiter
=
properties
.
internal_delimiter
)
valid
=
pd
.
read_csv
(
valid_buffer
,
names
=
data_df
.
columns
,
chunksize
=
chunksize
)
return
data
,
valid
...
...
This diff is collapsed.
Click to expand it.
reader/read_sections.py
View file @
8cf8e0d8
...
...
@@ -35,6 +35,7 @@
import
pandas
as
pd
from
io
import
StringIO
as
StringIO
import
csv
from
..
import
properties
from
..common.converters
import
converters
...
...
@@ -47,7 +48,7 @@ def extract_fixed_width(section_serie_bf,section_schema):
section_missing
=
{
i
:
section_schema
[
'elements'
][
i
].
get
(
'missing_value'
)
if
section_schema
[
'elements'
][
i
].
get
(
'disable_white_strip'
)
==
True
else
[
section_schema
[
'elements'
][
i
].
get
(
'missing_value'
),
" "
*
section_schema
[
'elements'
][
i
].
get
(
'field_length'
,
properties
.
MAX_FULL_REPORT_WIDTH
)]
for
i
in
section_names
}
section_elements
=
pd
.
read_fwf
(
section_serie_bf
,
widths
=
section_widths
,
header
=
None
,
names
=
section_names
,
na_values
=
section_missing
,
delimiter
=
"
\t
"
,
encoding
=
'utf-8'
,
dtype
=
'object'
,
skip_blank_lines
=
False
)
section_elements
=
pd
.
read_fwf
(
section_serie_bf
,
widths
=
section_widths
,
header
=
None
,
names
=
section_names
,
na_values
=
section_missing
,
encoding
=
'utf-8'
,
dtype
=
'object'
,
skip_blank_lines
=
False
)
return
section_elements
def
extract_delimited
(
section_serie_bf
,
section_schema
):
...
...
@@ -71,10 +72,10 @@ def read_data(section_df,section_schema):
missing
=
section_df
[
element
].
isna
()
if
element
in
encoded
:
section_df
[
element
]
=
decoders
.
get
(
section_encoding
.
get
(
element
)).
get
(
section_dtypes
.
get
(
element
))(
section_df
[
element
])
kwargs
=
{
converter_arg
:
section_schema
[
'elements'
][
element
].
get
(
converter_arg
)
for
converter_arg
in
properties
.
data_type_conversion_args
.
get
(
section_dtypes
.
get
(
element
))
}
section_df
[
element
]
=
converters
.
get
(
section_dtypes
.
get
(
element
))(
section_df
[
element
],
**
kwargs
)
section_valid
[
element
]
=
missing
|
section_df
[
element
].
notna
()
return
section_df
,
section_valid
...
...
@@ -133,7 +134,7 @@ def main(sections_df, schema):
# Only pass records with data to avoid the hassle of dealing with
# how the NaN rows are written and then read!
notna_idx
=
sections_df
[
sections_df
[
section
].
notna
()].
index
sections_df
[
section
].
loc
[
notna_idx
].
to_csv
(
section_buffer
,
header
=
False
,
encoding
=
'utf-8'
,
index
=
False
)
sections_df
[
section
].
loc
[
notna_idx
].
to_csv
(
section_buffer
,
header
=
False
,
encoding
=
'utf-8'
,
index
=
False
,
quoting
=
csv
.
QUOTE_NONE
,
escapechar
=
'
\\
'
,
sep
=
properties
.
internal_delimiter
)
ssshh
=
section_buffer
.
seek
(
0
)
# Get the individual elements as objects
if
field_layout
==
'fixed_width'
:
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment