Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
brivas
mdf_reader
Commits
5e1c7d4a
Commit
5e1c7d4a
authored
4 years ago
by
iregon
Browse files
Options
Download
Email Patches
Plain Diff
Fixes for quotes and escape chars
parent
ebdc6bf1
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
22 additions
and
10 deletions
+22
-10
read.py
read.py
+11
-4
reader/import_data.py
reader/import_data.py
+1
-1
reader/read_sections.py
reader/read_sections.py
+10
-5
No files found.
read.py
View file @
5e1c7d4a
...
...
@@ -91,7 +91,9 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# 4. Save to buffer
# The writing options from `quoting` onwards prevent data containing special characters (commas, etc.) from being quoted
#https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
data_df
.
to_csv
(
data_buffer
,
header
=
False
,
mode
=
'a'
,
encoding
=
'utf-8'
,
index
=
False
,
quoting
=
csv
.
QUOTE_NONE
,
escapechar
=
'
\\
'
,
sep
=
properties
.
internal_delimiter
)
data_df
.
to_csv
(
data_buffer
,
header
=
False
,
mode
=
'a'
,
encoding
=
'utf-8'
,
index
=
False
,
quoting
=
csv
.
QUOTE_NONE
,
sep
=
properties
.
internal_delimiter
,
quotechar
=
'
\0
'
,
escapechar
=
'
\0
'
)
valid_df
.
to_csv
(
valid_buffer
,
header
=
False
,
mode
=
'a'
,
encoding
=
'utf-8'
,
index
=
False
)
# Create the output
...
...
@@ -112,7 +114,11 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
date_columns
.
append
(
i
)
out_dtypes
.
update
({
element
:
'object'
})
data
=
pd
.
read_csv
(
data_buffer
,
names
=
data_df
.
columns
,
chunksize
=
chunksize
,
dtype
=
out_dtypes
,
parse_dates
=
date_columns
,
delimiter
=
properties
.
internal_delimiter
)
data
=
pd
.
read_csv
(
data_buffer
,
names
=
data_df
.
columns
,
chunksize
=
chunksize
,
dtype
=
out_dtypes
,
parse_dates
=
date_columns
,
delimiter
=
properties
.
internal_delimiter
,
quotechar
=
'
\0
'
,
escapechar
=
'
\0
'
)
valid
=
pd
.
read_csv
(
valid_buffer
,
names
=
data_df
.
columns
,
chunksize
=
chunksize
)
return
data
,
valid
...
...
@@ -298,8 +304,9 @@ def main(source, data_model = None, data_model_path = None, sections = None,chun
else
:
header
=
cols
out_atts_json
=
out_atts
data_df
.
to_csv
(
os
.
path
.
join
(
out_path
,
'data.csv'
),
header
=
header
,
mode
=
mode
,
encoding
=
'utf-8'
,
index
=
True
,
index_label
=
'index'
)
valid_df
.
to_csv
(
os
.
path
.
join
(
out_path
,
'mask.csv'
),
header
=
header
,
mode
=
mode
,
encoding
=
'utf-8'
,
index
=
True
,
index_label
=
'index'
)
kwargs
=
{
'header'
:
header
,
'mode'
:
mode
,
'encoding'
:
'utf-8'
,
'index'
:
True
,
'index_label'
:
'index'
,
'quotechar'
:
'
\0
'
,
'escapechar'
:
'
\0
'
}
data_df
.
to_csv
(
os
.
path
.
join
(
out_path
,
'data.csv'
),
**
kwargs
)
valid_df
.
to_csv
(
os
.
path
.
join
(
out_path
,
'mask.csv'
),
**
kwargs
)
if
enlisted
:
data
=
data
[
0
]
valid
=
valid
[
0
]
...
...
This diff is collapsed.
Click to expand it.
reader/import_data.py
View file @
5e1c7d4a
...
...
@@ -75,7 +75,7 @@ def main(source,chunksize = None, skiprows = None):
"""
if
os
.
path
.
isfile
(
source
):
TextParser
=
pd
.
read_fwf
(
source
,
widths
=
[
properties
.
MAX_FULL_REPORT_WIDTH
],
header
=
None
,
delimiter
=
"
\t
"
,
skiprows
=
skiprows
,
chunksize
=
chunksize
)
TextParser
=
pd
.
read_fwf
(
source
,
widths
=
[
properties
.
MAX_FULL_REPORT_WIDTH
],
header
=
None
,
delimiter
=
"
\t
"
,
skiprows
=
skiprows
,
chunksize
=
chunksize
,
quotechar
=
'
\0
'
,
escapechar
=
'
\0
'
)
if
not
chunksize
:
TextParser
=
[
TextParser
]
return
TextParser
...
...
This diff is collapsed.
Click to expand it.
reader/read_sections.py
View file @
5e1c7d4a
...
...
@@ -48,16 +48,21 @@ def extract_fixed_width(section_serie_bf,section_schema):
section_missing
=
{
i
:
section_schema
[
'elements'
][
i
].
get
(
'missing_value'
)
if
section_schema
[
'elements'
][
i
].
get
(
'disable_white_strip'
)
==
True
else
[
section_schema
[
'elements'
][
i
].
get
(
'missing_value'
),
" "
*
section_schema
[
'elements'
][
i
].
get
(
'field_length'
,
properties
.
MAX_FULL_REPORT_WIDTH
)]
for
i
in
section_names
}
section_elements
=
pd
.
read_fwf
(
section_serie_bf
,
widths
=
section_widths
,
header
=
None
,
names
=
section_names
,
na_values
=
section_missing
,
encoding
=
'utf-8'
,
dtype
=
'object'
,
skip_blank_lines
=
False
)
section_elements
=
pd
.
read_fwf
(
section_serie_bf
,
widths
=
section_widths
,
header
=
None
,
names
=
section_names
,
na_values
=
section_missing
,
encoding
=
'utf-8'
,
dtype
=
'object'
,
skip_blank_lines
=
False
,
quotechar
=
'
\0
'
,
escapechar
=
'
\0
'
)
return
section_elements
def
extract_delimited
(
section_serie_bf
,
section_schema
):
delimiter
=
section_schema
[
'header'
].
get
(
'delimiter'
)
section_names
=
section_schema
[
'elements'
].
keys
()
section_missing
=
{
x
:
section_schema
[
'elements'
][
x
].
get
(
'missing_value'
)
for
x
in
section_names
}
section_elements
=
pd
.
read_csv
(
section_serie_bf
,
header
=
None
,
delimiter
=
delimiter
,
encoding
=
'utf-8'
,
dtype
=
'object'
,
skip_blank_lines
=
False
,
names
=
section_names
,
na_values
=
section_missing
)
section_elements
=
pd
.
read_csv
(
section_serie_bf
,
header
=
None
,
delimiter
=
delimiter
,
encoding
=
'utf-8'
,
dtype
=
'object'
,
skip_blank_lines
=
False
,
names
=
section_names
,
na_values
=
section_missing
,
quotechar
=
'
\0
'
,
escapechar
=
'
\0
'
)
return
section_elements
...
...
@@ -134,7 +139,7 @@ def main(sections_df, schema):
# Only pass records with data to avoid the hassle of dealing with
# how the NaN rows are written and then read!
notna_idx
=
sections_df
[
sections_df
[
section
].
notna
()].
index
sections_df
[
section
].
loc
[
notna_idx
].
to_csv
(
section_buffer
,
header
=
False
,
encoding
=
'utf-8'
,
index
=
False
,
quoting
=
csv
.
QUOTE_NONE
,
escapechar
=
'
\\
'
,
sep
=
properties
.
internal_delimiter
)
sections_df
[
section
].
loc
[
notna_idx
].
to_csv
(
section_buffer
,
header
=
False
,
encoding
=
'utf-8'
,
index
=
False
,
quoting
=
csv
.
QUOTE_NONE
,
quotechar
=
'
\0
'
,
escapechar
=
'
\0
'
,
sep
=
properties
.
internal_delimiter
)
ssshh
=
section_buffer
.
seek
(
0
)
# Get the individual elements as objects
if
field_layout
==
'fixed_width'
:
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment