From a507374821a85c82c23b141a77764f359d90f5ce Mon Sep 17 00:00:00 2001 From: perezgonzalez-irene <iregon@noc.ac.uk> Date: Thu, 27 Feb 2020 13:40:11 +0000 Subject: [PATCH] int promotion to float on missing values now solved using pandas nullable integers --- read.py | 14 +++++--------- reader/read_sections.py | 3 --- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/read.py b/read.py index 0c0e57e..a9167e3 100644 --- a/read.py +++ b/read.py @@ -79,17 +79,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path): sections_df = get_sections.main(string_df, schema, read_sections_list) - # 2. Read elements from sections: along data chunks, resulting data types - # may vary if gaps, keep track of data types: add Intxx pandas classes rather than intxx to avoid this! + # 2. Read elements from sections + # Along data chunks, resulting data types + # may vary if gaps, keep track of data dtypes: v1.0 + # This has now been solved by working with Intxx pandas dtypes (nullable integers) # Sections are parsed in the same order as sections_df.columns - [data_df, valid_df, out_dtypesi ] = read_sections.main(sections_df, schema) - if i_chunk == 0: - out_dtypes = copy.deepcopy(out_dtypesi) - - for k in out_dtypesi: - if out_dtypesi in properties.numpy_floats: - out_dtypes.update({ k:out_dtypesi.get(k) }) + [data_df, valid_df, out_dtypes ] = read_sections.main(sections_df, schema) # 3. Validate data elements diff --git a/reader/read_sections.py b/reader/read_sections.py index 3bbda03..2cdfee9 100644 --- a/reader/read_sections.py +++ b/reader/read_sections.py @@ -68,7 +68,6 @@ def read_data(section_df,section_schema): section_valid = pd.DataFrame(index = section_df.index, columns = section_df.columns) for element in section_dtypes.keys(): - print(element) missing = section_df[element].isna() if element in encoded: section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element]) @@ -141,10 +140,8 @@ def main(sections_df, schema): elements = [ x[1] for x in data_df.columns if x[0] == section ] if multiindex: out_dtypes.update({ (section,i):properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in elements } ) - out_dtypes.update({ (section,i):data_df[(section,i)].dtype.name for i in elements if data_df[(section,i)].dtype.name in properties.numpy_floats}) else: out_dtypes.update({ i:properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in elements } ) - out_dtypes.update({ i:data_df[i].dtype.name for i in section_elements if data_df[i].dtype.name in properties.numpy_floats}) else: if multiindex: out_dtypes.update({ (section,section):'object' } ) -- GitLab