int promotion to float on missing values now solved using pandas nullable integers

a5073748 · iregon · e2ecea16 · a5073748 · a5073748
Commit a5073748 authored 5 years ago by iregon
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 12 deletions

read.py read.py +5 -9

reader/read_sections.py reader/read_sections.py +0 -3

No files found.
--- a/read.py
+++ b/read.py
@@ -79,17 +79,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
        
        sections_df = get_sections.main(string_df, schema, read_sections_list)

-        # 2. Read elements from sections: along data chunks, resulting data types
-        # may vary if gaps, keep track of data types: add Intxx pandas classes rather than intxx to avoid this!
+        # 2. Read elements from sections
+        # In v1.0, dtypes had to be tracked across data chunks, because integer
+        # elements with gaps (missing values) were promoted to float in some chunks.
+        # This is now solved by using pandas nullable integer dtypes (Intxx).
        # Sections are parsed in the same order as sections_df.columns
        
-        [data_df, valid_df, out_dtypesi ] = read_sections.main(sections_df, schema)
-        if i_chunk == 0:
-            out_dtypes = copy.deepcopy(out_dtypesi)
-
-        for k in out_dtypesi:
-            if out_dtypesi in properties.numpy_floats:
-                out_dtypes.update({ k:out_dtypesi.get(k) })
+        [data_df, valid_df, out_dtypes ] = read_sections.main(sections_df, schema)
        
        # 3. Validate data elements
        

--- a/reader/read_sections.py
+++ b/reader/read_sections.py
@@ -68,7 +68,6 @@ def read_data(section_df,section_schema):
    section_valid = pd.DataFrame(index = section_df.index, columns = section_df.columns)

    for element in section_dtypes.keys():
-        print(element)
        missing = section_df[element].isna()
        if element in encoded:
            section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element])
@@ -141,10 +140,8 @@ def main(sections_df, schema):
            elements = [ x[1] for x in data_df.columns if x[0] == section ]
            if multiindex:
                out_dtypes.update({ (section,i):properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in elements } )
-                out_dtypes.update({ (section,i):data_df[(section,i)].dtype.name for i in elements if data_df[(section,i)].dtype.name in properties.numpy_floats})
            else:
                out_dtypes.update({ i:properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in elements } )
-                out_dtypes.update({ i:data_df[i].dtype.name for i in section_elements if data_df[i].dtype.name in properties.numpy_floats})
        else:
            if multiindex:
                    out_dtypes.update({ (section,section):'object' } )