From 6b4baf21898aa18c8fda777b6e7e94fa3122d5e7 Mon Sep 17 00:00:00 2001
From: perezgonzalez-irene <iregon@noc.ac.uk>
Date: Fri, 28 Feb 2020 09:56:30 +0000
Subject: [PATCH] Added docstrings

---
 data_models/code_tables.py |  99 +++++++++++++++++++++++++++--------
 data_models/schemas.py     |   4 +-
 read.py                    |   3 +-
 reader/get_sections.py     |  94 +++++++++++++++++++++------------
 reader/import_data.py      | 103 ++++++++++++++++++++++-------------
 reader/read_sections.py    |  95 ++++++++++++++++++++++------------
 6 files changed, 270 insertions(+), 128 deletions(-)

diff --git a/data_models/code_tables.py b/data_models/code_tables.py
index d3df366..9e9f41c 100644
--- a/data_models/code_tables.py
+++ b/data_models/code_tables.py
@@ -1,7 +1,11 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Created on Thu Sep 13 15:14:51 2018
+
+This module contains functions to manage data model
+code table files and objects according to the
+requirements of the data reader tool.
+
 """
 
 import sys
@@ -16,12 +20,6 @@ from copy import deepcopy
 from pandas.io.json.normalize import nested_to_record
 import ast
 
-if sys.version_info[0] >= 3:
-    py3 = True
-else:
-    py3 = False
-
-
 #https://stackoverflow.com/questions/10756427/loop-through-all-nested-dictionary-values
 #def print_nested(d):
 #    if isinstance(d, dict):
@@ -35,15 +33,87 @@ else:
 #
 #    else:
 #        print(d)
+
 toolPath = os.path.dirname(os.path.abspath(__file__))
 table_lib = os.path.join(toolPath,'lib')
 templates_path = os.path.join(table_lib,'templates','code_tables')
+
+def read_table(table_path):
+    """
+
+    Reads a data model code table file into a dictionary.
+    It completes the code table to the full complexity the
+    data reader expects, appending information on secondary
+    keys and expanding range keys.
+
+    Parameters
+    ----------
+    table_path : str
+        The file path of the code table.
+
+    Returns
+    -------
+    dict
+        Code table
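+
+    Examples
+    --------
+    A minimal usage sketch; the file name below is
+    illustrative, not a table shipped with the tool:
+
+    >>> table = read_table('lib/my_table.json')  # hypothetical path
+    >>> '_keys' in table  # True only if a companion '.keys' file exists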
+
+    """
+
+    with open(table_path) as fileObj:
+        table = json.load(fileObj)
+    # Add keys for nested code tables
+    keys_path = ".".join([".".join(table_path.split('.')[:-1]),'keys'])
+    if os.path.isfile(keys_path):
+        with open(keys_path) as fileObj:
+            table_keys = json.load(fileObj)
+        table['_keys'] = {}
+        for x,y in table_keys.items():
+            key = eval_dict_items(x)
+            values = [ eval_dict_items(k) for k in y ]
+            table['_keys'][key] = values
+    # Expand range keys
+    expand_integer_range_key(table)
+
+    return table
+
 def templates():
+    """
+
+    Lists the names of the available code table templates.
+
+    Returns
+    -------
+    list
+        Code table template aliases
+
+    """
+
     tables = glob.glob(os.path.join(templates_path,'*.json'))
     return [ os.path.basename(x).split(".")[0] for x in tables ]
 
 def copy_template(table, out_dir = None,out_path = None):
+    """
+
+    Copies a code table template to an output
+    file or path.
+
+    Parameters
+    ----------
+    table : str
+        Code table template name to copy
+
+    Keyword Arguments
+    -----------------
+    out_dir : str, opt
+        Directory to copy the code table file template to
+    out_path : str, opt
+        Full filename to copy the code table file template to
+
+    Either out_dir or out_path must be provided.
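+
+    Examples
+    --------
+    A usage sketch; the template name and output directory
+    are illustrative (see templates() for the real aliases):
+
+    >>> copy_template('my_template', out_dir = '/tmp')  # hypothetical name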
+
+    """
+
     tables = templates()
     if table in tables:
         table_path = os.path.join(templates_path,table + '.json')
@@ -110,21 +180,6 @@ def eval_dict_items(item):
     except:
         return item
 
-def read_table(table_path):
-    with open(table_path) as fileObj:
-        table = json.load(fileObj)
-    keys_path = ".".join([".".join(table_path.split('.')[:-1]),'keys'])
-    if os.path.isfile(keys_path):
-        with open(keys_path) as fileObj:
-            table_keys = json.load(fileObj)
-        table['_keys'] = {}
-        for x,y in table_keys.items():
-            key = eval_dict_items(x)
-            values = [ eval_dict_items(k) for k in y ]
-            table['_keys'][key] = values
-    expand_integer_range_key(table)
-    return table
-
 def table_keys(table):
     separator = '∿' # something hopefully not in keys...
     if table.get('_keys'):
diff --git a/data_models/schemas.py b/data_models/schemas.py
index cd5fe77..aab0fcd 100644
--- a/data_models/schemas.py
+++ b/data_models/schemas.py
@@ -209,8 +209,8 @@ def templates():
 def copy_template(schema, out_dir = None,out_path = None):
     """
 
-    Creates a simple attribute dictionary for the elements
-    in a dataframe from its data model schema
+    Copies a schema file template to an output
+    file or path.
 
     Parameters
     ----------
diff --git a/read.py b/read.py
index a9167e3..67a15ff 100644
--- a/read.py
+++ b/read.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-
+Manages the integral sequence in data file reading
 from a data model:
     - Access to data model
@@ -11,7 +11,6 @@ from a data model:
     - Output
 
 Contains the following functions:
-
     * ERV - does the actual extraction, read and validation of data input data
     * main - the main function of the script
diff --git a/reader/get_sections.py b/reader/get_sections.py
index e623475..4baa8be 100644
--- a/reader/get_sections.py
+++ b/reader/get_sections.py
@@ -1,38 +1,38 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
-Created on Tue Apr 30 09:38:17 2019
-
-Splits string reports in sections using a data model layout.
-
-Input and output are simple pandas dataframes, with the output dataframe
-column names being the section names
-
-To work with a pandas TextParser, loop through this module.
-
-Internally works assuming highest complexity in the input data model:
-multiple non sequential sections
-
-DEV NOTES:
-
-1) make sure we use Series when working with Series, DataFrames otherwise...
-like now:
-    threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
-instead of:
-    threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])
-
-on data import in import_data.py, we use pd.read_fwf because is more general
-use, also support to chunking would make converting to series a bit dirty...
-
-2) Can we extend (do we need to?) this to reading sequential sections with
-   no sentinals? apparently (see td11) we are already able to do that:
-   provided the section is in a sequential parsing_order group
-
-@author: iregon
-
-Have to documents the threads approach!!!!
-
-"""
+#"""
+#Created on Tue Apr 30 09:38:17 2019
+#
+#Splits string reports into sections using a data model layout.
+#
+#Input and output are simple pandas dataframes, with the output dataframe
+#column names being the section names.
+#
+#To work with a pandas TextParser, loop through this module.
+#
+#Internally works assuming the highest complexity in the input data model:
+#multiple non-sequential sections.
+#
+#DEV NOTES:
+#
+#1) make sure we use Series when working with Series, DataFrames otherwise...
+#like now:
+#    threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
+#instead of:
+#    threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])
+#
+#on data import in import_data.py, we use pd.read_fwf because it is more
+#general purpose; also, support for chunking would make converting to Series a bit dirty...
+#
+#2) Can we extend (do we need to?) this to reading sequential sections with
+#   no sentinals? apparently (see td11) we are already able to do that,
+#   provided the section is in a sequential parsing_order group
+#
+#@author: iregon
+#
+#Still have to document the threads approach!
+#
+#"""
 import pandas as pd
 from copy import deepcopy
@@ -200,6 +200,34 @@ def extract_sections(string_df):
 # MAIN
 # ---------------------------------------------------------------------------
 def main(string_df, schema, read_sections):
+    """
+
+    Returns a pandas dataframe with a report per row
+    and the report sections split along the columns.
+    Each section is a block string, and only the sections
+    listed in the read_sections parameter are output.
+
+    Parameters
+    ----------
+    string_df : pandas.DataFrame
+        Pandas dataframe with a single column holding
+        the reports as block strings
+    schema : dict
+        Data source data model schema
+    read_sections : list
+        Sections to output from the complete report
+
+    Returns
+    -------
+    pandas.DataFrame
+        Dataframe with the report sections split
+        along the columns.
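+
+    Examples
+    --------
+    A usage sketch; inputs are illustrative and assume
+    string_df comes from import_data and schema from the
+    data model schemas module, with 'core' a hypothetical
+    section name:
+
+    >>> sections_df = main(string_df, schema, ['core'])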
+
+    """
     global sentinals, section_lens, sentinals_lens
     global parsing_order
     # Proceed to split sections if more than one
diff --git a/reader/import_data.py b/reader/import_data.py
index 0d25197..f253192 100644
--- a/reader/import_data.py
+++ b/reader/import_data.py
@@ -1,40 +1,40 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
-Created on Fri Jan 10 13:17:43 2020
-
-FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS:
-    AN ITERABLE WITH DATAFRAMES
-
-INPUT IS NOW ONLY A FILE PATH. COULD OPTIONALLY GET OTHER TYPE OBJECTS...
-
-OUTPUT IS AN ITERABLE, DEPENDING ON CHUNKSIZE BEING SET:
-    - a single dataframe in a list
-    - a pd.io.parsers.textfilereader
-
-
-WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) IN ONE LINE
-
-delimiter="\t" option in pandas.read_fwf avoids white spaces at tails
-to be stripped
-
-@author: iregon
-
-
-
-OPTIONS IN OLD DEVELOPMENT:
-    1. DLMT: delimiter = ',' default
-       names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
-       missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
-       TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8',
-                                dtype = 'object', skip_blank_lines = False, chunksize = chunksize,
-                                skiprows = skiprows, names = names, na_values = missing)
-
-    2. FWF:# delimiter = '\t' so that it reads blanks as blanks, otherwise reads as empty: NaN
-       this applies mainly when reading elements from sections, but we leave it also here
-       TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="\t", chunksize = chunksize)
-
-"""
+#"""
+#Created on Fri Jan 10 13:17:43 2020
+#
+#FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS:
+#    AN ITERABLE WITH DATAFRAMES
+#
+#INPUT IS CURRENTLY ONLY A FILE PATH. COULD OPTIONALLY GET OTHER TYPES OF OBJECTS...
+#
+#OUTPUT IS AN ITERABLE, DEPENDING ON CHUNKSIZE BEING SET:
+#    - a single dataframe in a list
+#    - a pd.io.parsers.TextFileReader
+#
+#WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) PER LINE
+#
+#the delimiter="\t" option in pandas.read_fwf prevents white space at the
+#tails from being stripped
+#
+#@author: iregon
+#
+#OPTIONS IN OLD DEVELOPMENT:
+#    1. DLMT: delimiter = ',' default
+#       names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
+#       missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
+#       TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8',
+#                                dtype = 'object', skip_blank_lines = False, chunksize = chunksize,
+#                                skiprows = skiprows, names = names, na_values = missing)
+#
+#    2. FWF: delimiter = '\t' so that it reads blanks as blanks, otherwise it reads them as empty: NaN
+#       this applies mainly when reading elements from sections, but we leave it also here
+#       TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="\t", chunksize = chunksize)
+#
+#"""
 import pandas as pd
 import os
 
@@ -42,7 +42,38 @@ import os
 from .. import properties
 
 def main(source,chunksize = None, skiprows = None):
-
+    """
+
+    Returns an iterable object with a pandas dataframe from
+    an input data source. The pandas dataframe has a report
+    per row and a single column with the full report as a
+    block string.
+    Currently only supports a data file path as the source,
+    but could be easily extended to accept other types of
+    source object.
+
+    Parameters
+    ----------
+    source : str
+        Path to the data file
+
+    Keyword Arguments
+    -----------------
+    chunksize : int, opt
+        Number of lines to chunk the input data into
+    skiprows : int, opt
+        Number of lines to skip from the input file
+
+    Returns
+    -------
+    iterable
+        List with a single pandas dataframe, or a
+        pandas.io.parsers.TextFileReader if chunksize
+        is set
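+
+    Examples
+    --------
+    A usage sketch; the file name is illustrative:
+
+    >>> data = main('reports.txt', chunksize = 10000)  # hypothetical file
+    >>> for df in data:  # iterable regardless of chunking
+    ...     print(len(df))  # one column of full report strings per df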
+
+    """
+
     if os.path.isfile(source):
         TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize)
         if not chunksize:
diff --git a/reader/read_sections.py b/reader/read_sections.py
index 2cdfee9..0fd299c 100644
--- a/reader/read_sections.py
+++ b/reader/read_sections.py
@@ -1,37 +1,37 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
-Created on Fri Jan 10 13:17:43 2020
-
-Extracts and reads (decodes, scales, etc...) the elements of data sections.
-Each column of the input dataframe is a section with all its elements stored
-as a single string.
-
-Working on a section by section basis, this module uses the data model
-information provided in the schema to split the elements, decode and scale them
-where appropriate and ensure its data type consistency.
-
-Output is a dataframe with columns as follows depending on the data model
-structure:
-    1) Data model with sections (1 or more): [(section0,element0),.......(sectionN,elementM)]
-    2) Data model with no sections[element0...element1]
-
-
-DEV NOTES:
-1) the 'quoted' issue: in version 1.0:
-    # Writing options from quoting on to prevent supp buoy data to be quoted:
-    # maybe this happenned because buoy data has commas, and pandas makes its own decission about
-    # how to write that.....
-    #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
-    # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
-    sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t")
-
-    But we were still experiencing problems when reading fully empty sections, now
-    we only write to the section buffer reports that are not empty. We afterwards
-    recover the indexes....
-
-@author: iregon
-"""
+#"""
+#Created on Fri Jan 10 13:17:43 2020
+#
+#Extracts and reads (decodes, scales, etc.) the elements of data sections.
+#Each column of the input dataframe is a section with all its elements stored
+#as a single string.
+#
+#Working on a section-by-section basis, this module uses the data model
+#information provided in the schema to split the elements, decode and scale
+#them where appropriate, and ensure their data type consistency.
+#
+#Output is a dataframe with columns as follows, depending on the data model
+#structure:
+#    1) Data model with sections (1 or more): [(section0,element0),...,(sectionN,elementM)]
+#    2) Data model with no sections: [element0,...,elementM]
+#
+#DEV NOTES:
+#1) the 'quoted' issue: in version 1.0:
+#    # Writing options with quoting on to prevent supp buoy data from being quoted:
+#    # maybe this happened because buoy data has commas, and pandas makes its own decision about
+#    # how to write that.....
+#    #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
+#    # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
+#    sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t")
+#
+#    But we were still experiencing problems when reading fully empty sections;
+#    now we only write to the section buffer reports that are not empty, and
+#    afterwards we recover the indexes....
+#
+#@author: iregon
+#"""
 import pandas as pd
 from io import StringIO as StringIO
@@ -80,7 +80,36 @@ def read_data(section_df,section_schema):
     return section_df,section_valid
 
 def main(sections_df, schema):
-
+    """
+
+    Returns the report sections split into their elements
+    along the columns, together with a validation mask and
+    the data types of the elements. Elements are extracted
+    from each section's block string and decoded, scaled
+    and cast to a consistent data type as defined in the
+    schema.
+
+    Parameters
+    ----------
+    sections_df : pandas.DataFrame
+        Pandas dataframe with a column per report section.
+        The sections are stored in the columns as block strings.
+    schema : dict
+        Data source data model schema
+
+    Returns
+    -------
+    data : pandas.DataFrame
+        Dataframe with the report section elements split
+        along the columns. Columns are a multiindex
+        (section, element) if the data model has sections,
+        and a regular element index otherwise.
+    mask : pandas.DataFrame
+        Boolean dataframe with the validation mask of the
+        elements, with the same column layout as data.
+    dtypes : dict
+        Dictionary with pandas data types for each of the
+        output elements
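+
+    Examples
+    --------
+    A usage sketch; inputs are illustrative and assume
+    sections_df comes from get_sections and schema from
+    the data model schemas module:
+
+    >>> data, mask, dtypes = main(sections_df, schema)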
+
+    """
     multiindex = True if len(sections_df.columns) > 1 or sections_df.columns[0] != properties.dummy_level else False
     data_df = pd.DataFrame(index = sections_df.index)
     valid_df = pd.DataFrame(index = sections_df.index)
-- 
GitLab