brivas / mdf_reader

Commit 6b4baf21, authored 5 years ago by iregon

Added docstrings

parent 64acfe65
Showing 6 changed files with 270 additions and 128 deletions
data_models/code_tables.py   +77  -22
data_models/schemas.py        +2   -2
read.py                       +1   -2
reader/get_sections.py       +61  -33
reader/import_data.py        +67  -36
reader/read_sections.py      +62  -33
data_models/code_tables.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 13 15:14:51 2018

This module has functions to manage data model
code table files and objects according to the
requirements of the data reader tool
"""
import sys
...
...
@@ -16,12 +20,6 @@ from copy import deepcopy
from pandas.io.json.normalize import nested_to_record
import ast

if sys.version_info[0] >= 3:
    py3 = True
else:
    py3 = False
#https://stackoverflow.com/questions/10756427/loop-through-all-nested-dictionary-values
#def print_nested(d):
# if isinstance(d, dict):
...
...
@@ -35,15 +33,87 @@ else:
#
# else:
# print(d)
toolPath = os.path.dirname(os.path.abspath(__file__))
table_lib = os.path.join(toolPath, 'lib')
templates_path = os.path.join(table_lib, 'templates', 'code_tables')
def read_table(table_path):
    """
    Reads a data model code table file to a dictionary.

    It completes the code table to the full complexity
    the data reader expects, by appending information
    on secondary keys and expanding range keys.

    Arguments
    ---------
    table_path : str
        The file path of the code table.

    Returns
    -------
    dict
        Code table

    """
    with open(table_path) as fileObj:
        table = json.load(fileObj)

    # Add keys for nested code tables
    keys_path = ".".join([".".join(table_path.split('.')[:-1]), 'keys'])
    if os.path.isfile(keys_path):
        with open(keys_path) as fileObj:
            table_keys = json.load(fileObj)
        table['_keys'] = {}
        for x, y in table_keys.items():
            key = eval_dict_items(x)
            values = [eval_dict_items(k) for k in y]
            table['_keys'][key] = values
    # Expand range keys
    expand_integer_range_key(table)

    return table
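A minimal usage sketch of read_table (the file names below are hypothetical; the function expects a JSON code table, with an optional sibling '.keys' file describing nested tables):

# Hypothetical layout:
#   lib/wind_speed.json  - the code table itself
#   lib/wind_speed.keys  - optional mapping of outer keys to nested tables
table = read_table('lib/wind_speed.json')
# If the .keys file existed, read_table appended the secondary keys here:
print(table.get('_keys'))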
def templates():
    """
    Lists the names of the available code table templates.

    Returns
    -------
    list
        Code table template aliases

    """
    tables = glob.glob(os.path.join(templates_path, '*.json'))
    return [os.path.basename(x).split(".")[0] for x in tables]
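For instance (the alias shown is hypothetical; the actual names depend on the *.json files shipped under lib/templates/code_tables):

print(templates())
# e.g. ['icoads']  - one alias per template file, extension stripped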
def copy_template(table, out_dir=None, out_path=None):
    """
    Copies a code table template to an output
    file or path.

    Parameters
    ----------
    table : str
        Code table template name to copy

    Keyword Arguments
    -----------------
    out_dir : str, opt
        Directory to copy the code table file template to
    out_path : str, opt
        Full filename to copy the code table file template to

    Either out_dir or out_path must be provided.

    """
    tables = templates()
    if table in tables:
        table_path = os.path.join(templates_path, table + '.json')
...
...
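A hedged usage sketch; the template alias is hypothetical and must be one of the names returned by templates():

# Copy a template so it can be edited into a custom code table;
# give either out_dir or out_path, as the docstring requires.
copy_template('icoads', out_dir='./my_code_tables')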
@@ -110,21 +180,6 @@ def eval_dict_items(item):
    except:
        return item
def read_table(table_path):
    with open(table_path) as fileObj:
        table = json.load(fileObj)
    keys_path = ".".join([".".join(table_path.split('.')[:-1]), 'keys'])
    if os.path.isfile(keys_path):
        with open(keys_path) as fileObj:
            table_keys = json.load(fileObj)
        table['_keys'] = {}
        for x, y in table_keys.items():
            key = eval_dict_items(x)
            values = [eval_dict_items(k) for k in y]
            table['_keys'][key] = values
    expand_integer_range_key(table)
    return table
def table_keys(table):
    separator = '∿'  # something hopefully not in keys...
    if table.get('_keys'):
...
...
data_models/schemas.py
...
...
@@ -209,8 +209,8 @@ def templates():
def copy_template(schema, out_dir=None, out_path=None):
    """
-   Creates a simple attribute dictionary for the elements
-   in a dataframe from its data model schema
+   Copies a schema file template to an output
+   file or path

    Parameters
    ----------
...
...
read.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Manages the integral sequence in data file reading
from a data model:
    - Access to data model
...
...
@@ -11,7 +11,6 @@ from a data model:
    - Output

Contains the following functions:
    * ERV - does the actual extraction, reading and validation of the input data
    * main - the main function of the script
...
...
reader/get_sections.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 09:38:17 2019

Splits string reports into sections using a data model layout.

Input and output are simple pandas dataframes, with the output dataframe
column names being the section names.

To work with a pandas TextParser, loop through this module.

Internally works assuming the highest complexity in the input data model:
multiple non-sequential sections.

DEV NOTES:

1) Make sure we use Series when working with Series, DataFrames otherwise,
like now:
    threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
instead of:
    threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])
On data import in import_data.py we use pd.read_fwf because it is more
general; also, supporting chunking would make converting to a Series a bit
dirty...

2) Can we extend (do we need to?) this to reading sequential sections with
no sentinels? Apparently (see td11) we are already able to do that,
provided the section is in a sequential parsing_order group.

@author: iregon

Have to document the threads approach!!!!
"""
import pandas as pd
from copy import deepcopy
...
...
@@ -200,6 +200,34 @@ def extract_sections(string_df):
# MAIN
# ---------------------------------------------------------------------------
def main(string_df, schema, read_sections):
    """
    Returns a pandas dataframe with a report per row
    and the report sections split along the columns.

    Each section is a block string and only the sections
    listed in the read_sections parameter are output.

    Parameters
    ----------
    string_df : pandas.DataFrame
        Pandas dataframe with a unique column with
        the reports as a block string
    schema : dict
        Data source data model schema
    read_sections : list
        Sections to output from the complete report

    Returns
    -------
    pandas.DataFrame
        Dataframe with the report sections split
        along the columns.

    """
    global sentinals, section_lens, sentinals_lens
    global parsing_order
    # Proceed to split sections if more than one
...
...
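A hedged sketch of driving this entry point (the report content and section names are hypothetical, and 'schema' is assumed to be the data model schema dict loaded upstream from data_models/schemas.py):

import pandas as pd

# One report per row, a single block-string column (content hypothetical)
string_df = pd.DataFrame(['core-part supp-part'])
sections_df = main(string_df, schema, read_sections=['core', 'supplemental'])
# sections_df now has one column per requested section, as block strings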
reader/import_data.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:17:43 2020

FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS:
AN ITERABLE WITH DATAFRAMES

INPUT IS NOW ONLY A FILE PATH. COULD OPTIONALLY GET OTHER TYPE OBJECTS...

OUTPUT IS AN ITERABLE, DEPENDING ON CHUNKSIZE BEING SET:
    - a single dataframe in a list
    - a pd.io.parsers.TextFileReader

WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) IN ONE LINE

The delimiter="\t" option in pandas.read_fwf prevents white space at the
tails from being stripped.

@author: iregon

OPTIONS IN OLD DEVELOPMENT:

1. DLMT: delimiter = ',' default
    names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
    missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
    TextParser = pd.read_csv(source, header=None, delimiter=delimiter, encoding='utf-8',
                             dtype='object', skip_blank_lines=False, chunksize=chunksize,
                             skiprows=skiprows, names=names, na_values=missing)

2. FWF: delimiter = '\t' so that it reads blanks as blanks, otherwise reads as empty: NaN.
    This applies mainly when reading elements from sections, but we leave it also here.
    TextParser = pd.read_fwf(source, widths=[FULL_WIDTH], header=None, skiprows=skiprows,
                             delimiter="\t", chunksize=chunksize)
"""
import pandas as pd
import os
...
...
@@ -42,7 +42,38 @@ import os
from .. import properties
def main(source, chunksize=None, skiprows=None):
    """
    Returns an iterable object with a pandas dataframe from
    an input data source. The pandas dataframe has a report
    per row and a single column with the full report as a
    block string.

    Currently only supports a data file path as source data,
    but could be easily extended to accept a different
    source object.

    Parameters
    ----------
    source : str
        Path to data file

    Keyword Arguments
    -----------------
    chunksize : int, opt
        Number of lines to chunk the input data into
    skiprows : int, opt
        Number of lines to skip from input file

    Returns
    -------
    iterable
        List with a single pandas dataframe
        or pandas.io.parsers.TextFileReader

    """
    if os.path.isfile(source):
        TextParser = pd.read_fwf(source, widths=[properties.MAX_FULL_REPORT_WIDTH],
                                 header=None, delimiter="\t", skiprows=skiprows,
                                 chunksize=chunksize)
        if not chunksize:
...
...
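A hedged usage sketch (the file name is hypothetical):

# With chunksize set: a TextFileReader to iterate over
for string_df in main('source.imma', chunksize=10000, skiprows=0):
    print(len(string_df))      # reports in this chunk, one per row

# Without chunksize: a list holding a single dataframe
string_df = main('source.imma')[0]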
reader/read_sections.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:17:43 2020

Extracts and reads (decodes, scales, etc...) the elements of data sections.
Each column of the input dataframe is a section with all its elements stored
as a single string.

Working on a section by section basis, this module uses the data model
information provided in the schema to split the elements, decode and scale
them where appropriate and ensure their data type consistency.

Output is a dataframe with columns as follows, depending on the data model
structure:
    1) Data model with sections (1 or more): [(section0,element0), ..., (sectionN,elementM)]
    2) Data model with no sections: [element0, ..., elementM]

DEV NOTES:

1) The 'quoted' issue, in version 1.0:
    # Writing options from quoting on to prevent supp buoy data to be quoted:
    # maybe this happened because buoy data has commas, and pandas makes its own decision about
    # how to write that.....
    # https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
    # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
    sections_df[section].to_csv(section_buffer, header=False, encoding='utf-8', index=False,
                                quoting=csv.QUOTE_NONE, escapechar="\\", sep="\t")
But we were still experiencing problems when reading fully empty sections; now
we only write to the section buffer reports that are not empty. We afterwards
recover the indexes....

@author: iregon
"""
import pandas as pd
from io import StringIO as StringIO
...
...
@@ -80,7 +80,36 @@ def read_data(section_df,section_schema):
    return section_df, section_valid
def main(sections_df, schema):
    """
    Returns the elements of the report sections split along
    the columns, decoded, scaled and validated according to
    the data model schema.

    Parameters
    ----------
    sections_df : pandas.DataFrame
        Pandas dataframe with a column per report section.
        The sections are in the columns as block strings.
    schema : dict
        Data source data model schema

    Returns
    -------
    data : pandas.DataFrame
        Dataframe with the report section elements split
        along the columns. MultiIndexed (section, element)
        if the data model has sections, regular index
        otherwise.
    mask : pandas.DataFrame
        Dataframe with the validation mask of the report
        section elements, with the same column layout as
        data.
    dtypes : dict
        Dictionary with pandas data types for each of the
        output elements

    """
    multiindex = True if len(sections_df.columns) > 1 or sections_df.columns[0] != properties.dummy_level else False
    data_df = pd.DataFrame(index=sections_df.index)
    valid_df = pd.DataFrame(index=sections_df.index)
...
...
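Putting the three reader stages together, a hedged end-to-end sketch (the source path is hypothetical, 'schema' is assumed loaded from the data model, and the section list is illustrative):

from reader import import_data, get_sections, read_sections

for string_df in import_data.main('source.imma', chunksize=10000):
    sections_df = get_sections.main(string_df, schema, ['core', 'supplemental'])
    data, mask, dtypes = read_sections.main(sections_df, schema)
    # data: decoded elements; mask: validation flags; dtypes: column dtypes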