diff --git a/read.py b/read.py index 0c0e57ed93c6eec19ffeb020a9f047905f438a79..a9167e3815047b2655f45b72bb7e5e4ae4a47960 100644 --- a/read.py +++ b/read.py @@ -79,17 +79,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path): sections_df = get_sections.main(string_df, schema, read_sections_list) - # 2. Read elements from sections: along data chunks, resulting data types - # may vary if gaps, keep track of data types: add Intxx pandas classes rather than intxx to avoid this! + # 2. Read elements from sections + # Along data chunks, resulting data types + # may vary if gaps, keep track of data dtypes: v1.0 + # This has now been solved by working with Intxx pandas dtypes (nullable integers) # Sections are parsed in the same order as sections_df.columns - [data_df, valid_df, out_dtypesi ] = read_sections.main(sections_df, schema) - if i_chunk == 0: - out_dtypes = copy.deepcopy(out_dtypesi) - - for k in out_dtypesi: - if out_dtypesi in properties.numpy_floats: - out_dtypes.update({ k:out_dtypesi.get(k) }) + [data_df, valid_df, out_dtypes ] = read_sections.main(sections_df, schema) # 3. Validate data elements diff --git a/reader/read_sections.py b/reader/read_sections.py index 3bbda03ba6e0bc440760f53fb6729619edabf830..2cdfee9e5ca6a7c1f8f47bf6548cf97900d52748 100644 --- a/reader/read_sections.py +++ b/reader/read_sections.py @@ -68,7 +68,6 @@ def read_data(section_df,section_schema): section_valid = pd.DataFrame(index = section_df.index, columns = section_df.columns) for element in section_dtypes.keys(): - print(element) missing = section_df[element].isna() if element in encoded: section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element]) @@ -141,10 +140,8 @@ def main(sections_df, schema): elements = [ x[1] for x in data_df.columns if x[0] == section ] if multiindex: out_dtypes.update({ (section,i):properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in elements } ) - out_dtypes.update({ (section,i):data_df[(section,i)].dtype.name for i in elements if data_df[(section,i)].dtype.name in properties.numpy_floats}) else: out_dtypes.update({ i:properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in elements } ) - out_dtypes.update({ i:data_df[i].dtype.name for i in section_elements if data_df[i].dtype.name in properties.numpy_floats}) else: if multiindex: out_dtypes.update({ (section,section):'object' } )