From d4f355e43228c7e45075f7108b1e1553e763b40b Mon Sep 17 00:00:00 2001 From: perezgonzalez-irene <iregon@noc.ac.uk> Date: Wed, 22 Jan 2020 09:39:40 +0000 Subject: [PATCH] Fixed chunking --- read.py | 33 +++++++++++++++++++++++++-------- reader/import_data.py | 4 ++-- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/read.py b/read.py index 0af38af..01bc9c4 100644 --- a/read.py +++ b/read.py @@ -85,16 +85,33 @@ def read(source, data_model = None, data_model_path = None, sections = None,chun # 6. Output to files if requested if out_path: + enlisted = False + if not isinstance(data,pd.io.parsers.TextFileReader): + data = [data] + valid = [valid] + enlisted = True logging.info('WRITING DATA TO FILES IN: {}'.format(out_path)) - cols = [ x for x in data ] - if isinstance(cols[0],tuple): - header = [":".join(x) for x in cols] - out_atts_json = { ":".join(x):out_atts.get(x) for x in out_atts.keys() } + + for i, (data_df,valid_df) in enumerate(zip(data,valid)): + header = False + mode = 'a' + if i == 0: + mode = 'w' + cols = [ x for x in data_df ] + if isinstance(cols[0],tuple): + header = [":".join(x) for x in cols] + out_atts_json = { ":".join(x):out_atts.get(x) for x in out_atts.keys() } + else: + header = cols + out_atts_json = out_atts + data_df.to_csv(os.path.join(out_path,'data.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index') + valid_df.to_csv(os.path.join(out_path,'valid_mask.csv'), header = header, mode = mode, encoding = 'utf-8',index = True, index_label='index') + if enlisted: + data = data[0] + valid = valid[0] else: - header = cols - out_atts_json = out_atts - data.to_csv(os.path.join(out_path,'data.csv'), header = header, encoding = 'utf-8',index = True, index_label='index') - valid.to_csv(os.path.join(out_path,'valid_mask.csv'), header = header, encoding = 'utf-8',index = True, index_label='index') + data = pandas_TextParser_hdlr.restore(data.f,data.orig_options) + valid = pandas_TextParser_hdlr.restore(valid.f,valid.orig_options) with open(os.path.join(out_path,'atts.json'),'w') as fileObj: json.dump(out_atts_json,fileObj,indent=4) diff --git a/reader/import_data.py b/reader/import_data.py index 26b0a98..a0a3e3a 100644 --- a/reader/import_data.py +++ b/reader/import_data.py @@ -61,10 +61,10 @@ def import_data(source,chunksize = None, skiprows = None): if isinstance(source,pd.io.parsers.TextFileReader): return source elif isinstance(source, io.StringIO): - TextParser = to_iterable_df(source,skiprows = None, chunksize = None) + TextParser = to_iterable_df(source,skiprows = None, chunksize = chunksize) return TextParser elif os.path.isfile(source): - TextParser = to_iterable_df(source,skiprows = None, chunksize = None) + TextParser = to_iterable_df(source,skiprows = None, chunksize = chunksize) return TextParser else: print('Error') -- GitLab