Commit 8cf8e0d8 authored by iregon's avatar iregon
Browse files

Quoting on special characters fixed

parent 28ac12af
......@@ -51,3 +51,9 @@ tol = 1E-10
dummy_level = '_SECTION_'
# Length of reports in initial read
MAX_FULL_REPORT_WIDTH = 100000
# This is a delimiter internally used when writing to buffers
# It is the Unicode Character 'END OF TEXT'
# It is supposed to be safe because we don't expect it in a string
# Its UTF-8 encoding length is not > 1, so it is supported by pandas 'c'
# engine, which is faster than the python engine.
internal_delimiter = u"\u0003"
......@@ -26,6 +26,7 @@ import logging
import json
import copy
from io import StringIO as StringIO
import csv
from .data_models import schemas
from . import properties
......@@ -77,7 +78,6 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# - columns(sections) order as in read_sections_list
sections_df = get_sections.main(string_df, schema, read_sections_list)
# 2. Read elements from sections
# Along data chunks, resulting data types
# may vary if gaps, keep track of data dtypes: v1.0
......@@ -85,13 +85,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# Sections are parsed in the same order as sections_df.columns
[data_df, valid_df, out_dtypes ] = read_sections.main(sections_df, schema)
# 3. Validate data elements
valid_df = validate.validate(data_df, valid_df, schema, code_tables_path)
# 4. Save to buffer
data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
# 4. Save to buffer
# Set quoting options (QUOTE_NONE) to prevent data containing special characters, like commas, from being quoted
#https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
# Create the output
......@@ -112,7 +112,7 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
date_columns.append(i)
out_dtypes.update({element:'object'})
data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns)
data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns,delimiter=properties.internal_delimiter)
valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)
return data, valid
......
......@@ -35,6 +35,7 @@
import pandas as pd
from io import StringIO as StringIO
import csv
from .. import properties
from ..common.converters import converters
......@@ -47,7 +48,7 @@ def extract_fixed_width(section_serie_bf,section_schema):
section_missing = { i:section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True
else [section_schema['elements'][i].get('missing_value')," "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)]
for i in section_names }
section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, delimiter="\t", encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
return section_elements
def extract_delimited(section_serie_bf,section_schema):
......@@ -71,10 +72,10 @@ def read_data(section_df,section_schema):
missing = section_df[element].isna()
if element in encoded:
section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element])
kwargs = { converter_arg:section_schema['elements'][element].get(converter_arg) for converter_arg in properties.data_type_conversion_args.get(section_dtypes.get(element)) }
section_df[element] = converters.get(section_dtypes.get(element))(section_df[element], **kwargs)
section_valid[element] = missing | section_df[element].notna()
return section_df,section_valid
......@@ -133,7 +134,7 @@ def main(sections_df, schema):
# Only pass records with data to avoid the hassle of dealing with
# how the NaN rows are written and then read!
notna_idx = sections_df[sections_df[section].notna()].index
sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False)
sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
ssshh = section_buffer.seek(0)
# Get the individual elements as objects
if field_layout == 'fixed_width':
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment