Commit 5e1c7d4a authored by iregon

Fixes for quotes and escape chars

parent ebdc6bf1
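The change in a nutshell: the intermediate buffers carry raw report strings that may themselves contain commas, double quotes or backslashes. The previous write options (csv.QUOTE_NONE with escapechar='\\') would insert a backslash before embedded quote characters and double any backslashes already in the data, so the buffers no longer matched the original reports. The fix keeps csv.QUOTE_NONE for writing but switches both the write and the read calls to a quote and escape character, '\0', that can never occur in the data. Below is a minimal sketch of the idea; it is not part of the commit, the '|' delimiter only stands in for properties.internal_delimiter, and exact behaviour may vary between pandas versions.

# Sketch only: contrasts the previous write options with the new ones on data
# that contains quotes and backslashes.
import csv
import io

import pandas as pd

df = pd.DataFrame({'report': ['plain text', 'has "quotes"', 'has \\backslash']})

# Previous options: QUOTE_NONE with escapechar='\\' inserts a backslash before
# every quote character and doubles backslashes already present in the data.
old_buf = io.StringIO()
df.to_csv(old_buf, header=False, index=False, sep='|',
          quoting=csv.QUOTE_NONE, escapechar='\\')
print(old_buf.getvalue())

# New options: QUOTE_NONE with a quotechar/escapechar that can never occur in
# the data ('\0') writes every field exactly as stored.
new_buf = io.StringIO()
df.to_csv(new_buf, header=False, index=False, sep='|',
          quoting=csv.QUOTE_NONE, quotechar='\0', escapechar='\0')
print(new_buf.getvalue())

# Reading back with the same null quotechar/escapechar returns the fields
# untouched.
new_buf.seek(0)
back = pd.read_csv(new_buf, names=['report'], sep='|',
                   quotechar='\0', escapechar='\0')
print(back['report'].tolist())

The hunks below apply this same pattern to the data/validation buffers, the data.csv and mask.csv outputs, the full-report read_fwf, and the fixed-width and delimited section parsers.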
@@ -91,7 +91,9 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
 # 4. Save to buffer
 # Writing options from quoting on to prevent data with special characters, like commas, etc, to be quoted
 #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
-data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
+data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',
+               index = False,quoting=csv.QUOTE_NONE, sep=properties.internal_delimiter,
+               quotechar='\0',escapechar='\0')
 valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
 # Create the output
@@ -112,7 +114,11 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
     date_columns.append(i)
     out_dtypes.update({element:'object'})
-data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns,delimiter=properties.internal_delimiter)
+data = pd.read_csv(data_buffer,names = data_df.columns,
+                   chunksize = chunksize, dtype = out_dtypes,
+                   parse_dates = date_columns,
+                   delimiter=properties.internal_delimiter,
+                   quotechar='\0',escapechar='\0')
 valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)
 return data, valid
@@ -298,8 +304,9 @@ def main(source, data_model = None, data_model_path = None, sections = None,chun
 else:
     header = cols
     out_atts_json = out_atts
-data_df.to_csv(os.path.join(out_path,'data.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index')
-valid_df.to_csv(os.path.join(out_path,'mask.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index')
+kwargs = {'header' : header, 'mode' : mode, 'encoding' : 'utf-8','index' : True, 'index_label' : 'index','quotechar':'\0','escapechar':'\0'}
+data_df.to_csv(os.path.join(out_path,'data.csv'), **kwargs)
+valid_df.to_csv(os.path.join(out_path,'mask.csv'), **kwargs)
 if enlisted:
     data = data[0]
     valid = valid[0]
@@ -75,7 +75,7 @@ def main(source,chunksize = None, skiprows = None):
 """
 if os.path.isfile(source):
-    TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize)
+    TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize, quotechar='\0',escapechar='\0')
 if not chunksize:
     TextParser = [TextParser]
 return TextParser
@@ -48,16 +48,21 @@ def extract_fixed_width(section_serie_bf,section_schema):
 section_missing = { i:section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True
                     else [section_schema['elements'][i].get('missing_value')," "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)]
                     for i in section_names }
-section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
+section_elements = pd.read_fwf(section_serie_bf, widths = section_widths,
+                               header = None, names = section_names ,
+                               na_values = section_missing, encoding = 'utf-8',
+                               dtype = 'object', skip_blank_lines = False,
+                               quotechar='\0',escapechar='\0')
 return section_elements

 def extract_delimited(section_serie_bf,section_schema):
     delimiter = section_schema['header'].get('delimiter')
     section_names = section_schema['elements'].keys()
     section_missing = { x:section_schema['elements'][x].get('missing_value') for x in section_names }
-    section_elements = pd.read_csv(section_serie_bf,header = None, delimiter = delimiter, encoding = 'utf-8',
-                                   dtype = 'object', skip_blank_lines = False,
-                                   names = section_names, na_values = section_missing)
+    section_elements = pd.read_csv(section_serie_bf,header = None, delimiter = delimiter,
+                                   encoding = 'utf-8', dtype = 'object',
+                                   skip_blank_lines = False, names = section_names,
+                                   na_values = section_missing,quotechar='\0',escapechar='\0')
     return section_elements
@@ -134,7 +139,7 @@ def main(sections_df, schema):
 # Only pass records with data to avoid the hassle of dealing with
 # how the NaN rows are written and then read!
 notna_idx = sections_df[sections_df[section].notna()].index
-sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
+sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,quotechar='\0',escapechar='\0',sep=properties.internal_delimiter)
 ssshh = section_buffer.seek(0)
 # Get the individual elements as objects
 if field_layout == 'fixed_width':
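The comment in that last hunk points at a related detail: rows that are all NaN would be written to the buffer as empty lines and have to be special-cased when read back, so only rows that actually hold data are serialised. A small illustrative sketch, again not from the repository; the column name 'core' is made up.

# Sketch only: why the NaN rows are filtered out before writing the section
# to the buffer.
import io

import pandas as pd

section = pd.Series(['AAAA', None, 'BBBB'], name='core')

# Writing the full series turns the NaN into an empty line in the buffer.
buf_all = io.StringIO()
section.to_csv(buf_all, header=False, index=False)
print(repr(buf_all.getvalue()))

# Writing only the rows with data (the notna_idx pattern above) keeps the
# buffer limited to real reports.
notna_idx = section[section.notna()].index
buf = io.StringIO()
section.loc[notna_idx].to_csv(buf, header=False, index=False)
print(repr(buf.getvalue()))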