diff --git a/read.py b/read.py
index 645b9886cd080ac6fc337f789f5440b2c683b4a6..6bcff0c2c99fadeb041454b32a47ba3b793aa6e3 100644
--- a/read.py
+++ b/read.py
@@ -91,7 +91,9 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
         # 4. Save to buffer
         # Writing options from quoting on to prevent data with special characters, like commas, etc, to be quoted
         #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
-        data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
+        data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',
+                       index = False,quoting=csv.QUOTE_NONE, sep=properties.internal_delimiter,
+                       quotechar='\0',escapechar='\0')
         valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
 
     # Create the output
@@ -112,7 +114,11 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
             date_columns.append(i)
             out_dtypes.update({element:'object'})
 
-    data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns,delimiter=properties.internal_delimiter)
+    data = pd.read_csv(data_buffer,names = data_df.columns,
+                       chunksize = chunksize, dtype = out_dtypes,
+                       parse_dates = date_columns,
+                       delimiter=properties.internal_delimiter,
+                       quotechar='\0',escapechar='\0')
     valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)
 
     return data, valid
@@ -298,8 +304,9 @@ def main(source, data_model = None, data_model_path = None, sections = None,chun
         else:
             header = cols
             out_atts_json = out_atts
-        data_df.to_csv(os.path.join(out_path,'data.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index')
-        valid_df.to_csv(os.path.join(out_path,'mask.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index')
+        kwargs = {'header' : header, 'mode' : mode, 'encoding' : 'utf-8','index' : True, 'index_label' : 'index','quotechar':'\0','escapechar':'\0'}
+        data_df.to_csv(os.path.join(out_path,'data.csv'), **kwargs)
+        valid_df.to_csv(os.path.join(out_path,'mask.csv'), **kwargs)
     if enlisted:
         data = data[0]
         valid = valid[0]
diff --git a/reader/import_data.py b/reader/import_data.py
index f2531925eb97ae16e9682955108132ddeb2aad4a..56b4e370ab36c193719d675b7b700ae3cf5879b5 100644
--- a/reader/import_data.py
+++ b/reader/import_data.py
@@ -75,7 +75,7 @@ def main(source,chunksize = None, skiprows = None):
     """
 
     if os.path.isfile(source):
-        TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize)
+        TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize, quotechar='\0',escapechar='\0')
         if not chunksize:
             TextParser = [TextParser]
         return TextParser
diff --git a/reader/read_sections.py b/reader/read_sections.py
index 526ca2c5dae53c67f436de77c0b2c8ed2c450995..63a9fc71a73d4820a631396fc17f6791d3334353 100644
--- a/reader/read_sections.py
+++ b/reader/read_sections.py
@@ -48,16 +48,21 @@ def extract_fixed_width(section_serie_bf,section_schema):
     section_missing = { i:section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True else [section_schema['elements'][i].get('missing_value')," "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)] for i in section_names }
-    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
+    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths,
+                                   header = None, names = section_names ,
+                                   na_values = section_missing, encoding = 'utf-8',
+                                   dtype = 'object', skip_blank_lines = False,
+                                   quotechar='\0',escapechar='\0')
     return section_elements
 
 
 def extract_delimited(section_serie_bf,section_schema):
     delimiter = section_schema['header'].get('delimiter')
     section_names = section_schema['elements'].keys()
     section_missing = { x:section_schema['elements'][x].get('missing_value') for x in section_names }
-    section_elements = pd.read_csv(section_serie_bf,header = None, delimiter = delimiter, encoding = 'utf-8',
-                                   dtype = 'object', skip_blank_lines = False,
-                                   names = section_names, na_values = section_missing)
+    section_elements = pd.read_csv(section_serie_bf,header = None, delimiter = delimiter,
+                                   encoding = 'utf-8', dtype = 'object',
+                                   skip_blank_lines = False, names = section_names,
+                                   na_values = section_missing,quotechar='\0',escapechar='\0')
     return section_elements
 
 
@@ -134,7 +139,7 @@ def main(sections_df, schema):
         # Only pass records with data to avoid the hassle of dealing with
         # how the NaN rows are written and then read!
         notna_idx = sections_df[sections_df[section].notna()].index
-        sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
+        sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,quotechar='\0',escapechar='\0',sep=properties.internal_delimiter)
         ssshh = section_buffer.seek(0)
         # Get the individual elements as objects
         if field_layout == 'fixed_width':