From 8cf8e0d8fb6eac460b435a3321bc15d60e6616e2 Mon Sep 17 00:00:00 2001
From: perezgonzalez-irene <iregon@noc.ac.uk>
Date: Thu, 5 Mar 2020 09:06:11 +0000
Subject: [PATCH] Quoting on special characters fixed

---
 properties.py           |  6 ++++++
 read.py                 | 12 ++++++------
 reader/read_sections.py |  7 ++++---
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/properties.py b/properties.py
index d9037c2..5911e07 100644
--- a/properties.py
+++ b/properties.py
@@ -51,3 +51,9 @@ tol = 1E-10
 dummy_level = '_SECTION_'
 # Length of reports in initial read
 MAX_FULL_REPORT_WIDTH = 100000
+# This is a delimiter used internally when writing to buffers.
+# It is the Unicode character 'END OF TEXT'.
+# It is supposed to be safe because we don't expect it in a string.
+# Its UTF-8 encoding is a single byte, so it is supported by the pandas 'c'
+# engine, which is faster than the python engine.
+internal_delimiter = u"\u0003"
diff --git a/read.py b/read.py
index 55de02d..645b988 100644
--- a/read.py
+++ b/read.py
@@ -26,6 +26,7 @@ import logging
 import json
 import copy
 from io import StringIO as StringIO
+import csv
 
 from .data_models import schemas
 from . import properties
@@ -77,7 +78,6 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
         # - columns(sections) order as in read_sections_list
         sections_df = get_sections.main(string_df, schema, read_sections_list)
 
-
         # 2. Read elements from sections
         # Along data chunks, resulting data types
         # may vary if gaps, keep track of data dtypes: v1.0
@@ -85,13 +85,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
         # Sections are parsed in the same order as sections_df.columns
         [data_df, valid_df, out_dtypes ] = read_sections.main(sections_df, schema)
 
-
         # 3. Validate data elements
         valid_df = validate.validate(data_df, valid_df, schema, code_tables_path)
 
-
-        # 4. Save to buffer
-        data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
+        # 4. Save to buffer
+        # The writing options from 'quoting' onwards prevent data with special characters, like commas, from being quoted
+        # https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
+        data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
         valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
 
     # Create the output
@@ -112,7 +112,7 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
             date_columns.append(i)
            out_dtypes.update({element:'object'})
 
-    data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns)
+    data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns,delimiter=properties.internal_delimiter)
     valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)
 
     return data, valid
diff --git a/reader/read_sections.py b/reader/read_sections.py
index 6baed2b..526ca2c 100644
--- a/reader/read_sections.py
+++ b/reader/read_sections.py
@@ -35,6 +35,7 @@
 import pandas as pd
 
 from io import StringIO as StringIO
+import csv
 
 from .. import properties
 from ..common.converters import converters
@@ -47,7 +48,7 @@ def extract_fixed_width(section_serie_bf,section_schema):
     section_missing = { i:section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True
                         else [section_schema['elements'][i].get('missing_value')," "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)]
                         for i in section_names }
-    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, delimiter="\t", encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
+    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
     return section_elements
 
 def extract_delimited(section_serie_bf,section_schema):
@@ -71,10 +72,10 @@ def read_data(section_df,section_schema):
         missing = section_df[element].isna()
         if element in encoded:
             section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element])
-
         kwargs = { converter_arg:section_schema['elements'][element].get(converter_arg)
                    for converter_arg in properties.data_type_conversion_args.get(section_dtypes.get(element)) }
         section_df[element] = converters.get(section_dtypes.get(element))(section_df[element], **kwargs)
+
         section_valid[element] = missing | section_df[element].notna()
 
     return section_df,section_valid
@@ -133,7 +134,7 @@ def main(sections_df, schema):
             # Only pass records with data to avoid the hassle of dealing with
             # how the NaN rows are written and then read!
             notna_idx = sections_df[sections_df[section].notna()].index
-            sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False)
+            sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
             ssshh = section_buffer.seek(0)
             # Get the individual elements as objects
             if field_layout == 'fixed_width':
--
GitLab
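
Editor's note (not part of the patch): the sketch below illustrates the buffering scheme the patch switches to, assuming only pandas and the standard csv module. The DataFrame contents and variable values are made up for illustration; the patch itself keeps the delimiter in properties.internal_delimiter. Fields are written unquoted (csv.QUOTE_NONE) with a backslash escape character and the single-byte END OF TEXT separator, and read back with the same delimiter, so embedded commas survive the round trip and pandas can keep using the faster 'c' parser engine.

import csv
from io import StringIO

import pandas as pd

# Same value the patch adds to properties.py as internal_delimiter
internal_delimiter = u"\u0003"

# Made-up data containing commas, which a default to_csv call would quote
data_df = pd.DataFrame({'report': ['SHIP, ABC', 'BUOY, XYZ'], 'value': [1.5, 2.0]})

data_buffer = StringIO()
# Write unquoted: the separator is the END OF TEXT character and any special
# character would be escaped instead of triggering quotes around the field
data_df.to_csv(data_buffer, header=False, index=False,
               quoting=csv.QUOTE_NONE, escapechar='\\', sep=internal_delimiter)

data_buffer.seek(0)
# Read back with the same single-character delimiter; pandas keeps the 'c'
# engine because the separator is one character long
data = pd.read_csv(data_buffer, names=data_df.columns, delimiter=internal_delimiter)
print(data)  # the embedded commas are preserved, with no quoting artefacts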