Commit 6b4baf21 authored by iregon's avatar iregon

Added docstrings

parent 64acfe65
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 13 15:14:51 2018
This module provides functions to manage data model
code table files and objects according to the
requirements of the data reader tool.
"""
import sys
......@@ -16,12 +20,6 @@ from copy import deepcopy
from pandas.io.json.normalize import nested_to_record
import ast
if sys.version_info[0] >= 3:
py3 = True
else:
py3 = False
#https://stackoverflow.com/questions/10756427/loop-through-all-nested-dictionary-values
#def print_nested(d):
# if isinstance(d, dict):
......@@ -35,15 +33,87 @@ else:
#
# else:
# print(d)
toolPath = os.path.dirname(os.path.abspath(__file__))
table_lib = os.path.join(toolPath,'lib')
templates_path = os.path.join(table_lib,'templates','code_tables')
def read_table(table_path):
"""
Reads a data model code table file into a dictionary.

It completes the code table to the full complexity the data reader
expects, by appending information on secondary keys and expanding
range keys.

Parameters
----------
table_path : str
The file path of the code table.

Returns
-------
dict
Code table
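
Examples
--------
A minimal sketch; wind_force.json and its optional companion
wind_force.keys are hypothetical file names, not files shipped
with this commit:

>>> table = read_table('/path/to/code_tables/wind_force.json')
>>> keys = list(table.keys())  # code table keys, plus '_keys' when a companion .keys file exists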
"""
with open(table_path) as fileObj:
table = json.load(fileObj)
# Add keys for nested code tables
keys_path = ".".join([".".join(table_path.split('.')[:-1]),'keys'])
if os.path.isfile(keys_path):
with open(keys_path) as fileObj:
table_keys = json.load(fileObj)
table['_keys'] = {}
for x,y in table_keys.items():
key = eval_dict_items(x)
values = [ eval_dict_items(k) for k in y ]
table['_keys'][key] = values
# Expand range keys
expand_integer_range_key(table)
return table
def templates():
"""
Lists the names of the available code table templates.

Returns
-------
list
Code table template aliases
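
Examples
--------
A minimal sketch (the aliases returned depend on the templates shipped
under the library's templates/code_tables directory):

>>> available = templates()  # list of template aliases (file names without the .json extension)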
"""
tables = glob.glob(os.path.join(templates_path,'*.json'))
return [ os.path.basename(x).split(".")[0] for x in tables ]
def copy_template(table, out_dir = None,out_path = None):
"""
Copies a code table template to an output
file or path.

Parameters
----------
table : str
Code table template name to copy

Keyword Arguments
-----------------
out_dir : str, optional
Directory to copy the code table file template to
out_path : str, optional
Full file path to copy the code table file template to

Either out_dir or out_path must be provided.
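
Examples
--------
A minimal sketch; 'my_template' and the output directory are
hypothetical names:

>>> copy_template('my_template', out_dir='/tmp/code_tables')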
"""
tables = templates()
if table in tables:
table_path = os.path.join(templates_path,table + '.json')
......@@ -110,21 +180,6 @@ def eval_dict_items(item):
except:
return item
def table_keys(table):
separator = '∿' # something hopefully not in keys...
if table.get('_keys'):
......
......@@ -209,8 +209,8 @@ def templates():
def copy_template(schema, out_dir = None,out_path = None):
"""
Copies a schema file template to an output
file or path
Parameters
----------
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Manages the full sequence of reading a data file
according to a data model:
- Access to data model
......@@ -11,7 +11,6 @@ from a data model:
- Output
Contains the following functions:
* ERV - does the actual extraction, reading and validation of the input data
* main - the main function of the script
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 09:38:17 2019
Splits string reports into sections using a data model layout.

Input and output are simple pandas dataframes, with the output dataframe
column names being the section names.

To work with a pandas TextParser, loop through this module.

Internally works assuming the highest complexity in the input data model:
multiple non-sequential sections.

DEV NOTES:

1) Make sure we use Series when working with Series, DataFrames otherwise,
as in:
threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
instead of:
threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])

On data import in import_data.py we use pd.read_fwf because it is more
general; also, supporting chunking would make converting to a Series a bit dirty.

2) Can we extend (do we need to?) this to reading sequential sections with
no sentinels? Apparently (see td11) we are already able to do that,
provided the section is in a sequential parsing_order group.

@author: iregon

Still have to document the threads approach!
"""
#"""
#Created on Tue Apr 30 09:38:17 2019
#
#Splits string reports in sections using a data model layout.
#
#Input and output are simple pandas dataframes, with the output dataframe
#column names being the section names
#
#To work with a pandas TextParser, loop through this module.
#
#Internally works assuming highest complexity in the input data model:
#multiple non sequential sections
#
#DEV NOTES:
#
#1) make sure we use Series when working with Series, DataFrames otherwise...
#like now:
# threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
#instead of:
# threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])
#
#on data import in import_data.py, we use pd.read_fwf because is more general
#use, also support to chunking would make converting to series a bit dirty...
#
#2) Can we extend (do we need to?) this to reading sequential sections with
# no sentinals? apparently (see td11) we are already able to do that:
# provided the section is in a sequential parsing_order group
#
#@author: iregon
#
#Have to documents the threads approach!!!!
#
#"""
import pandas as pd
from copy import deepcopy
......@@ -200,6 +200,34 @@ def extract_sections(string_df):
# MAIN
# ---------------------------------------------------------------------------
def main(string_df, schema, read_sections):
"""
Returns a pandas dataframe with a report per row
and the report sections split along the columns.
Each section is a block string and only the sections
listed in the read_sections parameter are output.

Parameters
----------
string_df : pandas.DataFrame
Pandas dataframe with a unique column with
the reports as a block string
schema : dict
Data source data model schema
read_sections : list
Sections to output from the complete report

Returns
-------
pandas.DataFrame
Dataframe with the report sections split
along the columns.
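
Examples
--------
A minimal sketch; string_df is a single-column dataframe of raw
reports and schema a data model schema dict, both produced elsewhere;
the section names are hypothetical:

>>> sections_df = main(string_df, schema, ['core', 'c99'])
>>> section_names = list(sections_df.columns)  # one column per requested section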
"""
global sentinals, section_lens, sentinals_lens
global parsing_order
# Proceed to split sections if more than one
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:17:43 2020
Prepares the source data into what get_sections() expects:
an iterable with dataframes.

Input is currently only a file path; it could optionally accept
other object types.

Output is an iterable and, depending on chunksize being set, either:
- a single dataframe in a list
- a pd.io.parsers.TextFileReader

Each line holds basically one record (one or multiple reports).

The delimiter="\t" option in pandas.read_fwf prevents trailing white
space from being stripped.
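
A sketch of the behaviour this note refers to (StringIO stands in for
a real data file):

>>> import pandas as pd
>>> from io import StringIO
>>> df = pd.read_fwf(StringIO('AB  ZZ\n'), widths=[4, 2], header=None, delimiter='\t')
>>> # with delimiter='\t' the trailing blanks in the first field ('AB  ')
>>> # are kept, rather than stripped as with the default whitespace delimiter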
@author: iregon
OPTIONS IN OLD DEVELOPMENT:
1. DLMT: delimiter = ',' default
names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8',
dtype = 'object', skip_blank_lines = False, chunksize = chunksize,
skiprows = skiprows, names = names, na_values = missing)
2. FWF:# delimiter = '\t' so that it reads blanks as blanks, otherwise reads as empty: NaN
this applies mainly when reading elements from sections, but we leave it also here
TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="\t", chunksize = chunksize)
"""
#"""
#Created on Fri Jan 10 13:17:43 2020
#
#FUNCTION TO PREPARE SOURCE DATA TO WHAT GET_SECTIONS() EXPECTS:
# AN ITERABLE WITH DATAFRAMES
#
#INPUT IS NOW ONLY A FILE PATH. COULD OPTIONALLY GET OTHER TYPE OBJECTS...
#
#OUTPUT IS AN ITERABLE, DEPENDING ON CHUNKSIZE BEING SET:
# - a single dataframe in a list
# - a pd.io.parsers.textfilereader
#
#
#WITH BASICALLY 1 RECORD (ONE OR MULTIPLE REPORTS) IN ONE LINE
#
#delimiter="\t" option in pandas.read_fwf avoids white spaces at tails
#to be stripped
#
#@author: iregon
#
#
#
#OPTIONS IN OLD DEVELOPMENT:
# 1. DLMT: delimiter = ',' default
# names = [ (x,y) for x in schema['sections'].keys() for y in schema['sections'][x]['elements'].keys()]
# missing = { x:schema['sections'][x[0]]['elements'][x[1]].get('missing_value') for x in names }
# TextParser = pd.read_csv(source,header = None, delimiter = delimiter, encoding = 'utf-8',
# dtype = 'object', skip_blank_lines = False, chunksize = chunksize,
# skiprows = skiprows, names = names, na_values = missing)
#
# 2. FWF:# delimiter = '\t' so that it reads blanks as blanks, otherwise reads as empty: NaN
# this applies mainly when reading elements from sections, but we leave it also here
# TextParser = pd.read_fwf(source,widths=[FULL_WIDTH],header = None, skiprows = skiprows, delimiter="\t", chunksize = chunksize)
#
#"""
import pandas as pd
import os
......@@ -42,7 +42,38 @@ import os
from .. import properties
def main(source,chunksize = None, skiprows = None):
"""
Returns an iterable object with a pandas dataframe from
an input data source. The pandas dataframe has a report
per row and a single column with the full report as a
block string.

Currently only supports a data file path as the source data,
but could be easily extended to accept other source objects.

Parameters
----------
source : str
Path to the data file

Keyword Arguments
-----------------
chunksize : int, optional
Number of lines to chunk the input data into
skiprows : int, optional
Number of lines to skip from the input file

Returns
-------
iterable
List with a single pandas dataframe, or a
pandas.io.parsers.TextFileReader
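
Examples
--------
A minimal sketch; /data/reports.txt is a hypothetical file with one
report per line:

>>> data = main('/data/reports.txt')                  # list with a single dataframe
>>> data = main('/data/reports.txt', chunksize=1000)  # pandas TextFileReader to loop over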
"""
if os.path.isfile(source):
TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize)
if not chunksize:
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:17:43 2020
Extracts and reads (decodes, scales, etc.) the elements of data sections.
Each column of the input dataframe is a section with all its elements stored
as a single string.

Working on a section by section basis, this module uses the data model
information provided in the schema to split the elements, decode and scale
them where appropriate, and ensure their data type consistency.

Output is a dataframe whose columns depend on the data model structure:
1) Data model with sections (1 or more): [(section0, element0), ..., (sectionN, elementM)]
2) Data model with no sections: [element0, ..., elementN]
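
A sketch of the two column layouts above (section and element names are
hypothetical):

>>> # 1) data model with sections: (section, element) column multiindex
>>> cols_with_sections = [('core', 'YR'), ('core', 'MO'), ('c99', 'ID')]
>>> # 2) data model with no sections: plain element index
>>> cols_no_sections = ['YR', 'MO', 'ID']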
DEV NOTES:

1) The 'quoted' issue: in version 1.0:
# Writing options with quoting on to prevent supp buoy data from being quoted:
# maybe this happened because buoy data has commas, and pandas makes its own decision about
# how to write that.
# https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
# quoting=csv.QUOTE_NONE was failing when a section is empty (or there is just one record in a section, ...)
sections_df[section].to_csv(section_buffer, header=False, encoding='utf-8', index=False, quoting=csv.QUOTE_NONE, escapechar="\\", sep="\t")

But we were still experiencing problems when reading fully empty sections;
now we only write to the section buffer reports that are not empty, and
afterwards recover the indexes.

@author: iregon
#"""
#Created on Fri Jan 10 13:17:43 2020
#
#Extracts and reads (decodes, scales, etc...) the elements of data sections.
#Each column of the input dataframe is a section with all its elements stored
#as a single string.
#
#Working on a section by section basis, this module uses the data model
#information provided in the schema to split the elements, decode and scale them
#where appropriate and ensure its data type consistency.
#
#Output is a dataframe with columns as follows depending on the data model
#structure:
# 1) Data model with sections (1 or more): [(section0,element0),.......(sectionN,elementM)]
# 2) Data model with no sections[element0...element1]
#
#
#DEV NOTES:
#1) the 'quoted' issue: in version 1.0:
# # Writing options from quoting on to prevent supp buoy data to be quoted:
# # maybe this happenned because buoy data has commas, and pandas makes its own decission about
# # how to write that.....
# #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
# # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
# sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t")
#
# But we were still experiencing problems when reading fully empty sections, now
# we only write to the section buffer reports that are not empty. We afterwards
# recover the indexes....
#
#@author: iregon
#"""
import pandas as pd
from io import StringIO as StringIO
......@@ -80,7 +80,36 @@ def read_data(section_df,section_schema):
return section_df,section_valid
def main(sections_df, schema):
"""
Returns a pandas dataframe with a report per row
and the report section elements split along the columns,
decoded and scaled according to the data model schema.

Parameters
----------
sections_df : pandas.DataFrame
Pandas dataframe with a column per report section.
The sections are in the columns as block strings.
schema : dict
Data source data model schema

Returns
-------
data : pandas.DataFrame
Dataframe with the report section elements split
along the columns. The column index is a (section, element)
multiindex if the data model has sections, a plain element
index otherwise.
mask : pandas.DataFrame
Validity mask for the elements in data, with the same
column layout as data.
dtypes : dict
Dictionary with pandas data types for each of the
output elements
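
Examples
--------
A minimal sketch; sections_df is the output of the section splitting
step and schema the data model schema dict, both produced elsewhere:

>>> data, mask, dtypes = main(sections_df, schema)
>>> n_elements = len(data.columns)  # one column per decoded element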
"""
multiindex = True if len(sections_df.columns) > 1 or sections_df.columns[0] != properties.dummy_level else False
data_df = pd.DataFrame(index = sections_df.index)
valid_df = pd.DataFrame(index = sections_df.index)
......