Commit 6b4baf21 authored by iregon's avatar iregon
Browse files

Added docstrings

parent 64acfe65
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Created on Thu Sep 13 15:14:51 2018
This module has functions to manage data model
code table files and objects according to the
requirements of the data reader tool
""" """
import sys import sys
...@@ -16,12 +20,6 @@ from copy import deepcopy ...@@ -16,12 +20,6 @@ from copy import deepcopy
from pandas.io.json.normalize import nested_to_record from pandas.io.json.normalize import nested_to_record
import ast import ast
# True when running under Python 3 or later; kept for legacy branching elsewhere.
# Single boolean expression replaces the verbose if/else assignment.
py3 = sys.version_info[0] >= 3
#https://stackoverflow.com/questions/10756427/loop-through-all-nested-dictionary-values #https://stackoverflow.com/questions/10756427/loop-through-all-nested-dictionary-values
#def print_nested(d): #def print_nested(d):
# if isinstance(d, dict): # if isinstance(d, dict):
...@@ -35,15 +33,87 @@ else: ...@@ -35,15 +33,87 @@ else:
# #
# else: # else:
# print(d) # print(d)
# Absolute directory of this module; anchors all bundled resource paths
toolPath = os.path.dirname(os.path.abspath(__file__))
# Library directory shipped with the tool
table_lib = os.path.join(toolPath,'lib')
# Directory holding the code table template files (*.json)
templates_path = os.path.join(table_lib,'templates','code_tables')
def read_table(table_path):
    """
    Read a data model code table file into a dictionary.

    Completes the code table to the full complexity the data
    reader expects: secondary key information from an optional
    sibling '.keys' file is appended and range keys are expanded.

    Parameters
    ----------
    table_path : str
        The file path of the code table.

    Returns
    -------
    dict
        Code table
    """
    with open(table_path) as source:
        code_table = json.load(source)
    # Optional secondary-keys file: same base name, '.keys' extension
    keys_file = ".".join([".".join(table_path.split('.')[:-1]), 'keys'])
    if os.path.isfile(keys_file):
        with open(keys_file) as source:
            raw_keys = json.load(source)
        # Evaluate stored key strings to native types before storing
        code_table['_keys'] = {
            eval_dict_items(key): [eval_dict_items(value) for value in values]
            for key, values in raw_keys.items()
        }
    # Expand range keys in place
    expand_integer_range_key(code_table)
    return code_table
def templates():
    """
    List the names of the available code table templates.

    Returns
    -------
    list
        Code table template aliases
    """
    pattern = os.path.join(templates_path, '*.json')
    # Alias is the file name up to its first dot
    return [os.path.basename(path).split(".")[0] for path in glob.glob(pattern)]
def copy_template(table, out_dir = None,out_path = None): def copy_template(table, out_dir = None,out_path = None):
"""
Copies a code table template to an output
file or path
Parameters
----------
table : str
Code table template name to copy
Keyword Arguments
-----------------
out_dir : dict, opt
Directory to copy code table file template to
out_path : dict, opt
Full filename to copy code table file template to
Either out_dir or out_path must be provided
"""
tables = templates() tables = templates()
if table in tables: if table in tables:
table_path = os.path.join(templates_path,table + '.json') table_path = os.path.join(templates_path,table + '.json')
...@@ -110,21 +180,6 @@ def eval_dict_items(item): ...@@ -110,21 +180,6 @@ def eval_dict_items(item):
except: except:
return item return item
def read_table(table_path):
    """
    Read a data model code table file into a dictionary.

    Appends secondary key information from an optional sibling
    '.keys' file and expands range keys.

    Parameters
    ----------
    table_path : str
        The file path of the code table.

    Returns
    -------
    dict
        Code table
    """
    with open(table_path) as fileObj:
        table = json.load(fileObj)
    # Optional secondary-keys file: same base name, '.keys' extension
    keys_path = ".".join([".".join(table_path.split('.')[:-1]),'keys'])
    if os.path.isfile(keys_path):
        with open(keys_path) as fileObj:
            table_keys = json.load(fileObj)
        table['_keys'] = {}
        for x,y in table_keys.items():
            # Keys are stored as strings; evaluate to native types
            key = eval_dict_items(x)
            values = [ eval_dict_items(k) for k in y ]
            table['_keys'][key] = values
    # Expand range keys in place
    expand_integer_range_key(table)
    return table
def table_keys(table): def table_keys(table):
separator = '∿' # something hopefully not in keys... separator = '∿' # something hopefully not in keys...
if table.get('_keys'): if table.get('_keys'):
......
...@@ -209,8 +209,8 @@ def templates(): ...@@ -209,8 +209,8 @@ def templates():
def copy_template(schema, out_dir = None,out_path = None): def copy_template(schema, out_dir = None,out_path = None):
""" """
Creates a simple attribute dictionary for the elements Copies a schema file template to an output
in a dataframe from its data model schema file or path
Parameters Parameters
---------- ----------
......
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Manages the integral sequence in data file reading Manages the integral sequence in data file reading
from a data model: from a data model:
- Access to data model - Access to data model
...@@ -11,7 +11,6 @@ from a data model: ...@@ -11,7 +11,6 @@ from a data model:
- Output - Output
Contains the following functions: Contains the following functions:
* ERV - does the actual extraction, read and validation of data input data * ERV - does the actual extraction, read and validation of data input data
* main - the main function of the script * main - the main function of the script
......
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" #"""
Created on Tue Apr 30 09:38:17 2019 #Created on Tue Apr 30 09:38:17 2019
#
Splits string reports in sections using a data model layout. #Splits string reports in sections using a data model layout.
#
Input and output are simple pandas dataframes, with the output dataframe #Input and output are simple pandas dataframes, with the output dataframe
column names being the section names #column names being the section names
#
To work with a pandas TextParser, loop through this module. #To work with a pandas TextParser, loop through this module.
#
Internally works assuming highest complexity in the input data model: #Internally works assuming highest complexity in the input data model:
multiple non sequential sections #multiple non sequential sections
#
DEV NOTES: #DEV NOTES:
#
1) make sure we use Series when working with Series, DataFrames otherwise... #1) make sure we use Series when working with Series, DataFrames otherwise...
like now: #like now:
threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len]) # threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
instead of: #instead of:
threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len]) # threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])
#
on data import in import_data.py, we use pd.read_fwf because is more general #on data import in import_data.py, we use pd.read_fwf because is more general
use, also support to chunking would make converting to series a bit dirty... #use, also support to chunking would make converting to series a bit dirty...
#
2) Can we extend (do we need to?) this to reading sequential sections with #2) Can we extend (do we need to?) this to reading sequential sections with
no sentinals? apparently (see td11) we are already able to do that: # no sentinals? apparently (see td11) we are already able to do that:
provided the section is in a sequential parsing_order group # provided the section is in a sequential parsing_order group
#
@author: iregon #@author: iregon
#
Have to documents the threads approach!!!! #Have to documents the threads approach!!!!
#
""" #"""
import pandas as pd import pandas as pd
from copy import deepcopy from copy import deepcopy
...@@ -200,6 +200,34 @@ def extract_sections(string_df): ...@@ -200,6 +200,34 @@ def extract_sections(string_df):
# MAIN # MAIN
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def main(string_df, schema, read_sections): def main(string_df, schema, read_sections):
"""
Returns a pandas dataframe with a report per row
and the report sections split along the columns.
Each section is a block string and only the sections
listed in read_sections parameter are output.
Parameters
----------
string_df : pandas.DataFrame
Pandas dataframe with a unique column with
the reports as a block string
schema : dict
Data source data model schema
read_sections : list
Sections to output from the complete report
Returns
-------
pandas.DataFrame
Dataframe with the report sections split
along the columns.
"""
global sentinals, section_lens, sentinals_lens global sentinals, section_lens, sentinals_lens
global parsing_order global parsing_order
# Proceed to split sections if more than one # Proceed to split sections if more than one
......
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" #"""
Created on Fri Jan 10 13:17:43 2020 #Created on Fri Jan 10 13:17:43 2020
#
FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS: #FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS:
AN ITERABLE WITH DATAFRAMES # AN ITERABLE WITH DATAFRAMES
#
INPUT IS NOW ONLY A FILE PATH. COULD OPTIONALLY GET OTHER TYPE OBJECTS... #INPUT IS NOW ONLY A FILE PATH. COULD OPTIONALLY GET OTHER TYPE OBJECTS...
#
OUTPUT IS AN ITERABLE, DEPENDING ON CHUNKSIZE BEING SET: #OUTPUT IS AN ITERABLE, DEPENDING ON CHUNKSIZE BEING SET:
- a single dataframe in a list # - a single dataframe in a list
- a pd.io.parsers.textfilereader # - a pd.io.parsers.textfilereader
#
#
WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) IN ONE LINE #WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) IN ONE LINE
#
delimiter="\t" option in pandas.read_fwf avoids white spaces at tails #delimiter="\t" option in pandas.read_fwf avoids white spaces at tails
to be stripped #to be stripped
#
@author: iregon #@author: iregon
#
#
#
OPTIONS IN OLD DEVELOPMENT: #OPTIONS IN OLD DEVELOPMENT:
1. DLMT: delimiter = ',' default # 1. DLMT: delimiter = ',' default
names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()] # names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names } # missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8', # TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8',
dtype = 'object', skip_blank_lines = False, chunksize = chunksize, # dtype = 'object', skip_blank_lines = False, chunksize = chunksize,
skiprows = skiprows, names = names, na_values = missing) # skiprows = skiprows, names = names, na_values = missing)
#
2. FWF:# delimiter = '\t' so that it reads blanks as blanks, otherwise reads as empty: NaN # 2. FWF:# delimiter = '\t' so that it reads blanks as blanks, otherwise reads as empty: NaN
this applies mainly when reading elements from sections, but we leave it also here # this applies mainly when reading elements from sections, but we leave it also here
TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="\t", chunksize = chunksize) # TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="\t", chunksize = chunksize)
#
""" #"""
import pandas as pd import pandas as pd
import os import os
...@@ -42,7 +42,38 @@ import os ...@@ -42,7 +42,38 @@ import os
from .. import properties from .. import properties
def main(source,chunksize = None, skiprows = None): def main(source,chunksize = None, skiprows = None):
"""
Returns an iterable object with a pandas dataframe from
an input data source. The pandas dataframe has a report
per row and a single column with the full report as a
block string.
Currently only supports a data file path as source data,
but could be easily extended to accept a different
source object.
Parameters
----------
source : str
Path to data file
Keyword Arguments
-----------------
chunksize : int, opt
Number of lines to chunk the input data into
skiprows : int, opt
Number of lines to skip from input file
Returns
-------
iterable
List of with a single pandas dataframe
or pandas.io.parsers.textfilereader
"""
if os.path.isfile(source): if os.path.isfile(source):
TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize) TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize)
if not chunksize: if not chunksize:
......
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" #"""
Created on Fri Jan 10 13:17:43 2020 #Created on Fri Jan 10 13:17:43 2020
#
Extracts and reads (decodes, scales, etc...) the elements of data sections. #Extracts and reads (decodes, scales, etc...) the elements of data sections.
Each column of the input dataframe is a section with all its elements stored #Each column of the input dataframe is a section with all its elements stored
as a single string. #as a single string.
#
Working on a section by section basis, this module uses the data model #Working on a section by section basis, this module uses the data model
information provided in the schema to split the elements, decode and scale them #information provided in the schema to split the elements, decode and scale them
where appropriate and ensure its data type consistency. #where appropriate and ensure its data type consistency.
#
Output is a dataframe with columns as follows depending on the data model #Output is a dataframe with columns as follows depending on the data model
structure: #structure:
1) Data model with sections (1 or more): [(section0,element0),.......(sectionN,elementM)] # 1) Data model with sections (1 or more): [(section0,element0),.......(sectionN,elementM)]
2) Data model with no sections[element0...element1] # 2) Data model with no sections[element0...element1]
#
#
DEV NOTES: #DEV NOTES:
1) the 'quoted' issue: in version 1.0: #1) the 'quoted' issue: in version 1.0:
# Writing options from quoting on to prevent supp buoy data to be quoted: # # Writing options from quoting on to prevent supp buoy data to be quoted:
# maybe this happenned because buoy data has commas, and pandas makes its own decission about # # maybe this happenned because buoy data has commas, and pandas makes its own decission about
# how to write that..... # # how to write that.....
#https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue # #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
# quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...) # # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t") # sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t")
#
But we were still experiencing problems when reading fully empty sections, now # But we were still experiencing problems when reading fully empty sections, now
we only write to the section buffer reports that are not empty. We afterwards # we only write to the section buffer reports that are not empty. We afterwards
recover the indexes.... # recover the indexes....
#
@author: iregon #@author: iregon
""" #"""
import pandas as pd import pandas as pd
from io import StringIO as StringIO from io import StringIO as StringIO
...@@ -80,7 +80,36 @@ def read_data(section_df,section_schema): ...@@ -80,7 +80,36 @@ def read_data(section_df,section_schema):
return section_df,section_valid return section_df,section_valid
def main(sections_df, schema): def main(sections_df, schema):
"""
Returns a pandas dataframe with a report per row
and the report sections split along the columns.
Each section is a block string and only the sections
listed in read_sections parameter are output.
Parameters
----------
sections_df : pandas.DataFrame
Pandas dataframe with a column per report sections.
The sections in the columns as a block strings.
schema : dict
Data source data model schema
Returns
-------
data : pandas.DataFrame
Dataframe with the report section elements split
along the columns. Column multiindex if the data
model has named sections, plain index otherwise
mask : pandas.DataFrame
Dataframe with the report section elements split
along the columns. Column multiindex if the data
model has named sections, plain index otherwise
dtypes : dict
Dictionary with pandas data types for each of the
output elements
"""
multiindex = True if len(sections_df.columns) > 1 or sections_df.columns[0] != properties.dummy_level else False multiindex = True if len(sections_df.columns) > 1 or sections_df.columns[0] != properties.dummy_level else False
data_df = pd.DataFrame(index = sections_df.index) data_df = pd.DataFrame(index = sections_df.index)
valid_df = pd.DataFrame(index = sections_df.index) valid_df = pd.DataFrame(index = sections_df.index)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment