Commit 8cf8e0d8 authored by iregon's avatar iregon
Browse files

Quoting on special characters fixed

parent 28ac12af
......@@ -51,3 +51,9 @@ tol = 1E-10
dummy_level = '_SECTION_'
# Length of reports in initial read
MAX_FULL_REPORT_WIDTH = 100000
# This is a delimiter internally used when writing to buffers
# It is the Unicode Character 'END OF TEXT'
# It is supposed to be safe because we don't expect it in a string
# Its UTF-8 encoding length is not > 1, so it is supported by pandas 'c'
# engine, which is faster than the python engine.
internal_delimiter = u"\u0003"
......@@ -26,6 +26,7 @@ import logging
import json
import copy
from io import StringIO as StringIO
import csv
from .data_models import schemas
from . import properties
......@@ -77,7 +78,6 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# - columns(sections) order as in read_sections_list
sections_df = get_sections.main(string_df, schema, read_sections_list)
# 2. Read elements from sections
# Along data chunks, resulting data types
# may vary if gaps, keep track of data dtypes: v1.0
......@@ -85,13 +85,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
# Sections are parsed in the same order as sections_df.columns
[data_df, valid_df, out_dtypes ] = read_sections.main(sections_df, schema)
# 3. Validate data elements
valid_df = validate.validate(data_df, valid_df, schema, code_tables_path)
# 4. Save to buffer
data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
# 4. Save to buffer
# Set quoting options (QUOTE_NONE) to prevent data containing special characters, like commas, from being quoted
#https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
# Create the output
......@@ -112,7 +112,7 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
date_columns.append(i)
out_dtypes.update({element:'object'})
data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns)
data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns,delimiter=properties.internal_delimiter)
valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)
return data, valid
......
......@@ -35,6 +35,7 @@
import pandas as pd
from io import StringIO as StringIO
import csv
from .. import properties
from ..common.converters import converters
......@@ -47,7 +48,7 @@ def extract_fixed_width(section_serie_bf,section_schema):
section_missing = { i:section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True
else [section_schema['elements'][i].get('missing_value')," "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)]
for i in section_names }
section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, delimiter="\t", encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
return section_elements
def extract_delimited(section_serie_bf,section_schema):
......@@ -71,10 +72,10 @@ def read_data(section_df,section_schema):
missing = section_df[element].isna()
if element in encoded:
section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element])
kwargs = { converter_arg:section_schema['elements'][element].get(converter_arg) for converter_arg in properties.data_type_conversion_args.get(section_dtypes.get(element)) }
section_df[element] = converters.get(section_dtypes.get(element))(section_df[element], **kwargs)
section_valid[element] = missing | section_df[element].notna()
return section_df,section_valid
......@@ -133,7 +134,7 @@ def main(sections_df, schema):
# Only pass records with data to avoid the hassle of dealing with
# how the NaN rows are written and then read!
notna_idx = sections_df[sections_df[section].notna()].index
sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False)
sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
ssshh = section_buffer.seek(0)
# Get the individual elements as objects
if field_layout == 'fixed_width':
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment