Fixes for quotes and escape chars

5e1c7d4a · iregon · ebdc6bf1 · 5e1c7d4a · 5e1c7d4a · 5e1c7d4a
Commit 5e1c7d4a authored 4 years ago by iregon
Hide whitespace changes
Inline Side-by-side

Showing with 22 additions and 10 deletions

read.py read.py +11 -4

reader/import_data.py reader/import_data.py +1 -1

reader/read_sections.py reader/read_sections.py +10 -5

No files found.
--- a/read.py
+++ b/read.py
@@ -91,7 +91,9 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
        # 4. Save to buffer        
        # Writing options from quoting on to prevent data with special characters, like commas, etc, to be quoted
        #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
-        data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
+        data_df.to_csv(data_buffer,header = False, mode = 'a', encoding = 'utf-8',
+                       index = False,quoting=csv.QUOTE_NONE, sep=properties.internal_delimiter,
+                       quotechar='\0',escapechar='\0')
        valid_df.to_csv(valid_buffer,header = False, mode = 'a', encoding = 'utf-8',index = False)
        
    # Create the output
@@ -112,7 +114,11 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
            date_columns.append(i)
            out_dtypes.update({element:'object'})

-    data = pd.read_csv(data_buffer,names = data_df.columns, chunksize = chunksize, dtype = out_dtypes, parse_dates = date_columns,delimiter=properties.internal_delimiter)
+    data = pd.read_csv(data_buffer,names = data_df.columns, 
+                       chunksize = chunksize, dtype = out_dtypes, 
+                       parse_dates = date_columns,
+                       delimiter=properties.internal_delimiter,
+                       quotechar='\0',escapechar='\0')
    valid = pd.read_csv(valid_buffer,names = data_df.columns, chunksize = chunksize)

    return data, valid
@@ -298,8 +304,9 @@ def main(source, data_model = None, data_model_path = None, sections = None,chun
                else:
                    header = cols
                    out_atts_json = out_atts
-            data_df.to_csv(os.path.join(out_path,'data.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index')
-            valid_df.to_csv(os.path.join(out_path,'mask.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index')
+            kwargs = {'header' : header, 'mode' : mode, 'encoding' : 'utf-8','index' : True, 'index_label' : 'index','quotechar':'\0','escapechar':'\0'}
+            data_df.to_csv(os.path.join(out_path,'data.csv'), **kwargs)
+            valid_df.to_csv(os.path.join(out_path,'mask.csv'), **kwargs)
        if enlisted:
            data = data[0]
            valid = valid[0]

--- a/reader/import_data.py
+++ b/reader/import_data.py
@@ -75,7 +75,7 @@ def main(source,chunksize = None, skiprows = None):
    """
    
    if os.path.isfile(source):
-        TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize)
+        TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize, quotechar='\0',escapechar='\0')
        if not chunksize:
            TextParser = [TextParser]
        return TextParser

--- a/reader/read_sections.py
+++ b/reader/read_sections.py
@@ -48,16 +48,21 @@ def extract_fixed_width(section_serie_bf,section_schema):
    section_missing = { i:section_schema['elements'][i].get('missing_value') if section_schema['elements'][i].get('disable_white_strip') == True
                               else [section_schema['elements'][i].get('missing_value')," "*section_schema['elements'][i].get('field_length', properties.MAX_FULL_REPORT_WIDTH)]
                               for i in section_names }
-    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
+    section_elements = pd.read_fwf(section_serie_bf, widths = section_widths,
+                                   header = None, names = section_names , 
+                                   na_values = section_missing, encoding = 'utf-8', 
+                                   dtype = 'object', skip_blank_lines = False,
+                                   quotechar='\0',escapechar='\0')
    return section_elements

 def extract_delimited(section_serie_bf,section_schema):
    delimiter = section_schema['header'].get('delimiter')
    section_names = section_schema['elements'].keys()
    section_missing = { x:section_schema['elements'][x].get('missing_value') for x in section_names }
-    section_elements = pd.read_csv(section_serie_bf,header = None, delimiter = delimiter, encoding = 'utf-8',
-                                 dtype = 'object', skip_blank_lines = False,
-                                 names = section_names, na_values = section_missing)
+    section_elements = pd.read_csv(section_serie_bf,header = None, delimiter = delimiter, 
+                                   encoding = 'utf-8', dtype = 'object', 
+                                   skip_blank_lines = False, names = section_names, 
+                                   na_values = section_missing,quotechar='\0',escapechar='\0')

    return section_elements

@@ -134,7 +139,7 @@ def main(sections_df, schema):
            # Only pass records with data to avoid the hassle of dealing with
            # how the NaN rows are written and then read!
            notna_idx = sections_df[sections_df[section].notna()].index
-            sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar='\\',sep=properties.internal_delimiter)
+            sections_df[section].loc[notna_idx].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,quotechar='\0',escapechar='\0',sep=properties.internal_delimiter)
            ssshh = section_buffer.seek(0)
            # Get the individual elements as objects
            if field_layout == 'fixed_width':