From 8cf8e0d8fb6eac460b435a3321bc15d60e6616e2 Mon Sep 17 00:00:00 2001
From: perezgonzalez-irene <iregon@noc.ac.uk>
Date: Thu, 5 Mar 2020 09:06:11 +0000
Subject: [PATCH] Fix quoting of special characters when writing to buffers

---
 properties.py           |  6 ++++++
 read.py                 | 12 ++++++------
 reader/read_sections.py |  7 ++++---
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/properties.py b/properties.py
index d9037c2..5911e07 100644
--- a/properties.py
+++ b/properties.py
@@ -51,3 +51,9 @@ tol = 1E-10
 dummy_level = '_SECTION_'
 # Length of reports in initial read
 MAX_FULL_REPORT_WIDTH = 100000
+# This is a delimiter used internally when writing to buffers.
+# It is the Unicode character 'END OF TEXT' (U+0003).
+# It is assumed to be safe because we don't expect it to occur in the data.
+# Its UTF-8 encoded length is 1 byte, so it is supported by the pandas 'c'
+# engine, which is faster than the python engine.
+internal_delimiter = u"\u0003"
diff --git a/read.py b/read.py
index 55de02d..645b988 100644
--- a/read.py
+++ b/read.py
@@ -26,6 +26,7 @@ import logging
 import json
 import copy
 from io import StringIO as StringIO
+import csv
 
 from .data_models import schemas
 from . import properties
@@ -77,7 +78,6 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
         # - columns(sections) order as in read_sections_list
         
         sections_df = get_sections.main(string_df, schema, read_sections_list)
-
         # 2. Read elements from sections
         # Along data chunks, resulting data types
         # may vary if gaps, keep track of data dtypes: v1.0
@@ -85,13 +85,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
         # Sections are parsed in the same order as sections_df.columns
         
         [data_df, valid_df, out_dtypes ] = read_sections.main(sections_df, schema)
-
         # 3. Validate data elements
         
         valid_df = validate.validate(data_df, valid_df, schema, code_tables_path)
-        
-        # 4. Save to buffer
-        data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
+        # 4. Save to buffer
+        # Write with quoting disabled so that data containing special characters (e.g. commas) is not quoted
+        # https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
+        data_df.to_csv(data_buffer, header = False, mode = 'a', encoding = 'utf-8', index = False, quoting = csv.QUOTE_NONE, escapechar = '\\', sep = properties.internal_delimiter)
         valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
         
     # Create the output
@@ -112,7 +112,7 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
             date_columns.append(i)
             out_dtypes.update({element:'object'})
 
-    data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns)
+    data = pd.read_csv(data_buffer, names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns, delimiter = properties.internal_delimiter)
     valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)
 
     return data, valid
diff --git a/reader/read_sections.py b/reader/read_sections.py
index 6baed2b..526ca2c 100644
--- a/reader/read_sections.py
+++ b/reader/read_sections.py
@@ -35,6 +35,7 @@
 
 import pandas as pd
 from io import StringIO as StringIO
+import csv
 
 from .. import properties
 from ..common.converters import converters
@@ -47,7 +48,7 @@ def extract_fixed_width(section_serie_bf,section_schema):
     section_missing = { i:section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True
                                else [section_schema['elements'][i].get('missing_value')," "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)]
                                for i in section_names }
-    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, delimiter="\t", encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
+    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
     return section_elements
 
 def extract_delimited(section_serie_bf,section_schema):
@@ -71,10 +72,10 @@ def read_data(section_df,section_schema):
         missing = section_df[element].isna()
         if element in encoded:
             section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element])
-
         kwargs = { converter_arg:section_schema['elements'][element].get(converter_arg) for converter_arg in properties.data_type_conversion_args.get(section_dtypes.get(element))  }
         section_df[element] = converters.get(section_dtypes.get(element))(section_df[element], **kwargs)
         
+
         section_valid[element] = missing | section_df[element].notna()
     return section_df,section_valid
 
@@ -133,7 +134,7 @@ def main(sections_df, schema):
             # Only pass records with data to avoid the hassle of dealing with
             # how the NaN rows are written and then read!
             notna_idx = sections_df[sections_df[section].notna()].index
-            sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False)
+            sections_df[section].loc[notna_idx].to_csv(section_buffer, header = False, encoding = 'utf-8', index = False, quoting = csv.QUOTE_NONE, escapechar = '\\', sep = properties.internal_delimiter)
             ssshh = section_buffer.seek(0)
             # Get the individual elements as objects
             if field_layout == 'fixed_width':
-- 
GitLab
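
For reference, below is a minimal standalone sketch (not part of the patch itself) of the buffer round-trip this patch sets up: writing with csv.QUOTE_NONE and the 'END OF TEXT' separator, then reading back with the default pandas 'c' engine. The DataFrame contents and variable names here are illustrative only.

    import csv
    from io import StringIO

    import pandas as pd

    internal_delimiter = u"\u0003"  # Unicode 'END OF TEXT'; encodes to a single byte in UTF-8

    # Illustrative frame: one field contains a comma, which would normally trigger quoting
    df = pd.DataFrame({'id': ['A1', 'B2'], 'text': ['plain', 'has,comma']})

    data_buffer = StringIO()
    # QUOTE_NONE keeps fields with commas unquoted; using a control character
    # as separator avoids collisions with characters that occur in the data
    df.to_csv(data_buffer, header=False, index=False,
              quoting=csv.QUOTE_NONE, escapechar='\\', sep=internal_delimiter)
    data_buffer.seek(0)

    # A single-character separator keeps the faster pandas 'c' engine usable
    out = pd.read_csv(data_buffer, names=df.columns, dtype='object',
                      delimiter=internal_delimiter)
    print(out)

Note that with quoting disabled pandas still requires an escapechar, since it is the only remaining way to represent a field that happens to contain the separator itself.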