Commit 8cf8e0d8 authored by iregon's avatar iregon
Browse files

Quoting on special characters fixed

parent 28ac12af
...@@ -51,3 +51,9 @@ tol = 1E-10 ...@@ -51,3 +51,9 @@ tol = 1E-10
dummy_level = '_SECTION_' dummy_level = '_SECTION_'
# Length of reports in initial read # Length of reports in initial read
MAX_FULL_REPORT_WIDTH = 100000 MAX_FULL_REPORT_WIDTH = 100000
# This is a delimiter internally used when writing to buffers
# It is the Unicode Character 'END OF TEXT'
# It is supposed to be safe because we don't expect it in a string
# Its UTF-8 encoding length is not > 1, so it is supported by pandas 'c'
# engine, which is faster than the python engine.
internal_delimiter = u"\u0003"
...@@ -26,6 +26,7 @@ import logging ...@@ -26,6 +26,7 @@ import logging
import json import json
import copy import copy
from io import StringIO as StringIO from io import StringIO as StringIO
import csv
from .data_models import schemas from .data_models import schemas
from . import properties from . import properties
...@@ -77,7 +78,6 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path): ...@@ -77,7 +78,6 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# - columns(sections) order as in read_sections_list # - columns(sections) order as in read_sections_list
sections_df = get_sections.main(string_df, schema, read_sections_list) sections_df = get_sections.main(string_df, schema, read_sections_list)
# 2. Read elements from sections # 2. Read elements from sections
# Along data chunks, resulting data types # Along data chunks, resulting data types
# may vary if gaps, keep track of data dtypes: v1.0 # may vary if gaps, keep track of data dtypes: v1.0
...@@ -85,13 +85,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path): ...@@ -85,13 +85,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# Sections are parsed in the same order as sections_df.columns # Sections are parsed in the same order as sections_df.columns
[data_df, valid_df, out_dtypes ] = read_sections.main(sections_df, schema) [data_df, valid_df, out_dtypes ] = read_sections.main(sections_df, schema)
# 3. Validate data elements # 3. Validate data elements
valid_df = validate.validate(data_df, valid_df, schema, code_tables_path) valid_df = validate.validate(data_df, valid_df, schema, code_tables_path)
# 4. Save to buffer
# 4. Save to buffer # Writing options from quoting on to prevent data with special characters, like commas, etc, to be quoted
data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False) #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False) valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
# Create the output # Create the output
...@@ -112,7 +112,7 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path): ...@@ -112,7 +112,7 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
date_columns.append(i) date_columns.append(i)
out_dtypes.update({element:'object'}) out_dtypes.update({element:'object'})
data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns) data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns,delimiter=properties.internal_delimiter)
valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize) valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)
return data, valid return data, valid
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
import pandas as pd import pandas as pd
from io import StringIO as StringIO from io import StringIO as StringIO
import csv
from .. import properties from .. import properties
from ..common.converters import converters from ..common.converters import converters
...@@ -47,7 +48,7 @@ def extract_fixed_width(section_serie_bf,section_schema): ...@@ -47,7 +48,7 @@ def extract_fixed_width(section_serie_bf,section_schema):
section_missing = { i:section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True section_missing = { i:section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True
else [section_schema['elements'][i].get('missing_value')," "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)] else [section_schema['elements'][i].get('missing_value')," "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)]
for i in section_names } for i in section_names }
section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, delimiter="\t", encoding = 'utf-8', dtype = 'object', skip_blank_lines = False ) section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
return section_elements return section_elements
def extract_delimited(section_serie_bf,section_schema): def extract_delimited(section_serie_bf,section_schema):
...@@ -71,10 +72,10 @@ def read_data(section_df,section_schema): ...@@ -71,10 +72,10 @@ def read_data(section_df,section_schema):
missing = section_df[element].isna() missing = section_df[element].isna()
if element in encoded: if element in encoded:
section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element]) section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element])
kwargs = { converter_arg:section_schema['elements'][element].get(converter_arg) for converter_arg in properties.data_type_conversion_args.get(section_dtypes.get(element)) } kwargs = { converter_arg:section_schema['elements'][element].get(converter_arg) for converter_arg in properties.data_type_conversion_args.get(section_dtypes.get(element)) }
section_df[element] = converters.get(section_dtypes.get(element))(section_df[element], **kwargs) section_df[element] = converters.get(section_dtypes.get(element))(section_df[element], **kwargs)
section_valid[element] = missing | section_df[element].notna() section_valid[element] = missing | section_df[element].notna()
return section_df,section_valid return section_df,section_valid
...@@ -133,7 +134,7 @@ def main(sections_df, schema): ...@@ -133,7 +134,7 @@ def main(sections_df, schema):
# Only pass records with data to avoid the hassle of dealing with # Only pass records with data to avoid the hassle of dealing with
# how the NaN rows are written and then read! # how the NaN rows are written and then read!
notna_idx = sections_df[sections_df[section].notna()].index notna_idx = sections_df[sections_df[section].notna()].index
sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False) sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
ssshh = section_buffer.seek(0) ssshh = section_buffer.seek(0)
# Get the individual elements as objects # Get the individual elements as objects
if field_layout == 'fixed_width': if field_layout == 'fixed_width':
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment