#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#"""
#Created on Tue Apr 30 09:38:17 2019
#
#Splits string reports in sections using a data model layout.
#
#Input and output are simple pandas dataframes, with the output dataframe
#column names being the section names
#
#To work with a pandas TextParser, loop through this module.
#
#Internally works assuming highest complexity in the input data model:
#multiple non sequential sections
#
#DEV NOTES:
#    
#1) make sure we use Series when working with Series, DataFrames otherwise...
#like now:
#  threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len])
#instead of:
#  threads[thread_id]['data'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[0:section_len])
#
#on data import in import_data.py, we use pd.read_fwf because is more general
#use, also support to chunking would make converting to series a bit dirty...
#
#2) Can we extend (do we need to?) this to reading sequential sections with
#    no sentinals? apparently (see td11) we are already able to do that:
#        provided the section is in a sequential parsing_order group
#
#@author: iregon
#
#Have to documents the threads approach!!!!
#
#"""

import pandas as pd
from copy import deepcopy
import logging

#   ---------------------------------------------------------------------------
#   FUNCTIONS TO PERFORM INITIAL SEPARATION OF SECTIONS: MAIN IS GET_SECTIONS()
#   ---------------------------------------------------------------------------
def extract_data():
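    # Splits the current thread's parent_data into the section string ('data')
    # and the remainder of each report ('modulo'), using the section length from
    # section_lens; operates on module-level globals (threads, thread_id, section_lens).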
    section_len = section_lens.get(threads[thread_id]['section'])
    if section_len:
        threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:section_len]) # object consistency needed here
        threads[thread_id]['modulo'] = pd.DataFrame(threads[thread_id]['parent_data'][0].str[section_len:]) # object consistency needed here
    else:
        threads[thread_id]['data'] = pd.Series(threads[thread_id]['parent_data'][0].str[0:]) #threads[thread_id]['parent_data'].copy()
        # Could even be like with section_len (None in section_len will read to the end)
        threads[thread_id]['modulo'] = pd.DataFrame(columns = [0]) # Just for consistency
    del threads[thread_id]['parent_data']

def add_next_children():
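    # Builds the child threads of the current thread from the first group in its
    # parsing_order; dispatches on the group type via add_children().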
    global children_parsing_order, branch_parsing_order, children_group_type, children_group_number
    children_parsing_order = deepcopy(threads[thread_id]['parsing_order'])
    branch_parsing_order = deepcopy(threads[thread_id]['parsing_order'])
    children_group_type = list(children_parsing_order[0])[0]
    children_group_number = threads[thread_id]['children_group_number']
    threads[thread_id]['children_no'] = 0
    threads[thread_id]['children'] = []
    add_children()

def add_higher_group_children():
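    # Skips the current (first) group and, if further groups remain, builds child
    # threads from the next group in the parsing order; used for the reports left
    # over after a dynamic group has been processed.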
    global children_parsing_order, branch_parsing_order, children_group_type, children_group_number
    children_parsing_order = deepcopy(threads[thread_id]['parsing_order'])
    children_parsing_order.pop(0) # Move to next group of sections
    if len(children_parsing_order) > 0:
        branch_parsing_order = deepcopy(threads[thread_id]['parsing_order'])
        branch_parsing_order.pop(0)
        children_group_type = list(children_parsing_order[0])[0]
        children_group_number = threads[thread_id]['children_group_number'] + 1
        add_children()

def add_children():
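    # Dispatch on group type: sequential ('s') groups get a single static child,
    # any other group type is handled dynamically (one child per candidate section).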
    if children_group_type == 's':
        add_static_children()
    else:
        add_dynamic_children()

def add_static_children():
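    # A sequential group yields one child thread: the next section of the group is
    # extracted from every report in the parent's modulo.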
    threads[thread_id]['children_no'] += 1
    children_thread_id = str(children_group_number) + str(0) + thread_id
    threads[thread_id]['children'].append(children_thread_id)
    # Now build children's thread
    children_section = children_parsing_order[0][children_group_type].pop(0)
    grandchildren_group_number = children_group_number
    if len(children_parsing_order[0][children_group_type]) == 0:
        children_parsing_order.pop(0)
        if len(children_parsing_order) > 0:
            grandchildren_group_number += 1
        else:
            grandchildren_group_number = None
    threads[children_thread_id] = {'parsing_order':children_parsing_order}
    threads[children_thread_id]['group_number'] = children_group_number
    threads[children_thread_id]['group_type'] = children_group_type
    threads[children_thread_id]['section'] = children_section
    threads[children_thread_id]['parent_data'] = threads[thread_id]['modulo']
    threads[thread_id]['modulo'] = threads[thread_id]['modulo'].iloc[0:0] # Remove reports from modulo (the child thread now owns them)
    threads[children_thread_id]['children_group_number'] = grandchildren_group_number

def add_dynamic_children():
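    # Exclusive/optional groups yield one child thread per candidate section:
    # reports are routed to a child when their leading characters match that
    # section's sentinal; any leftover reports are passed to the next group.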
    for i in range(0,len(children_parsing_order[0][children_group_type])):
        branch_i_parsing_order = deepcopy(branch_parsing_order)
        children_thread_id = str(children_group_number) + str(i+1) + thread_id
        # Now build children's thread
        children_section = children_parsing_order[0][children_group_type].pop(0)
        children_idx = threads[thread_id]['modulo'].loc[threads[thread_id]['modulo'][0].str[0:sentinals_lens.get(children_section)] == sentinals.get(children_section)].index
        if len(children_idx) == 0:
            continue
        threads[thread_id]['children'].append(children_thread_id)
        threads[thread_id]['children_no'] += 1
        branch_i_parsing_order[0][children_group_type].remove(children_section)
        grandchildren_group_number = children_group_number
        if len(branch_i_parsing_order[0][children_group_type]) == 0 or children_group_type == 'e':
            branch_i_parsing_order.pop(0)
            if len(branch_i_parsing_order) > 0: # Check the branch's own parsing order, as in add_static_children()
                grandchildren_group_number += 1
            else:
                grandchildren_group_number = None
        threads[children_thread_id] = {'parsing_order':branch_i_parsing_order}
        threads[children_thread_id]['group_number'] = children_group_number
        threads[children_thread_id]['group_type'] = children_group_type
        threads[children_thread_id]['section'] = children_section
        threads[children_thread_id]['parent_data'] = threads[thread_id]['modulo'].loc[children_idx]
        threads[thread_id]['modulo'].drop(children_idx,inplace = True)
        threads[children_thread_id]['children_group_number'] = grandchildren_group_number
    if (len(threads[thread_id]['modulo'])) > 0:
        add_higher_group_children()

def extract_sections(string_df):
    # threads elements:
    #    'parsing_order'            What needs to be applied to current parent data
    #    'group_number'             Order in the global parsing order
    #    'group_type'               Is it sequential, exclusive or optional
    #    'section'                  Section name to be extracted from parent_data to data
    #    'parent_data'              Initial data from which the section must be extracted
    #    'data'                     Section data extracted from parent_data
    #    'modulo'                   Remainder of parent_data after extracting the section (data)
    #    'children_no'              Number of children threads to build, based on the next parsing order list element. Resets to the number of active children
    #    'children'                 Thread id of every child
    #    'children_group_number'    Group number (in the global parsing order) of the children
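    # Illustration (hypothetical names and values): a child thread handling the
    # first section of an exclusive group might look like:
    #   threads['1100'] = {'parsing_order': [...], 'group_number': 1, 'group_type': 'e',
    #                      'section': 'c1', 'parent_data': <DataFrame>, 'data': <Series>,
    #                      'modulo': <DataFrame>, 'children_group_number': 2, ...}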
    global sentinals, section_lens, sentinals_lens, parsing_order
    global children_group_type
    global threads
    global thread_id
    global group_type

    # Initial "node': input data
    threads = dict()
    thread_id = '00'
    threads_queue = [thread_id]
    threads[thread_id] = {'parsing_order':parsing_order}
    threads[thread_id]['group_number'] = 0
    threads[thread_id]['group_type'] = None
    threads[thread_id]['section'] = None
    threads[thread_id]['parent_data'] = string_df
    threads[thread_id]['data'] = None
    threads[thread_id]['modulo'] = threads[thread_id]['parent_data']
    del threads[thread_id]['parent_data']
    threads[thread_id]['children_group_number'] = 1
    add_next_children()
    threads_queue.extend(threads[thread_id]['children'])
    threads_queue.remove(thread_id)
    # And now, once initialized, let it grow:
    logging.info('Processing section partitioning threads')
    while threads_queue:
        thread_id = threads_queue[0]
        logging.info('{} ...'.format(thread_id))
        group_type = threads[thread_id]['group_type']
        # get section data
        extract_data()
        # kill thread if nothing there
        if len(threads[thread_id]['data']) == 0:
            del threads[thread_id]
            logging.info('{} deleted: no data'.format(thread_id))
            threads_queue.pop(0)
            continue
        # build children threads
        if len(threads[thread_id]['parsing_order']) > 0 and len(threads[thread_id]['modulo']) > 0:
            add_next_children()
            threads_queue.extend(threads[thread_id]['children'])
            #del threads[thread_id]['modulo'] # not until we control what to do with leftovers....
        threads_queue.pop(0)
        logging.info('done')
    section_dict = dict()
    section_groups = [ d[x] for d in parsing_order for x in d.keys() ]
    sections = [item for sublist in section_groups for item in sublist]
    
    for section in sections:
        thread_ids = [ x for x in threads.keys() if threads[x]['section'] == section ]
        # Concatenate the section data gathered by every thread for this section;
        # sections not present in the data end up as an empty Series
        # (pd.concat used instead of the deprecated Series.append)
        section_data = [ threads[x]['data'] for x in thread_ids ]
        section_dict[section] = pd.concat(section_data) if section_data else pd.Series(dtype='object')
        section_dict[section].sort_index(inplace=True)
    return section_dict

#   ---------------------------------------------------------------------------
#   MAIN
#   ---------------------------------------------------------------------------
def main(string_df, schema, read_sections):
    """

    Returns a pandas dataframe with a report per row
    and the report sections split along the columns.
    Each section is a block string and only the sections
    listed in read_sections parameter are output.
    
    Parameters
    ----------
    string_df : pandas.DataFrame
        Pandas dataframe with a unique column with
        the reports as a block string
        
    schema : dict 
        Data source data model schema 
    
    read_sections : list 
        Sections to output from the complete report
        

    Returns
    -------
    pandas.DataFrame 
        Dataframe with the report sections split 
        along the columns.
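
    Notes
    -----
    A minimal, hypothetical schema layout (the section names and values below
    are illustrative assumptions, not a real data model) would be::

        schema = {
            'header': {'parsing_order': [{'s': ['core']}, {'e': ['c1', 'c5']}]},
            'sections': {
                'core': {'header': {'length': 45, 'sentinal': None}},
                'c1':   {'header': {'length': None, 'sentinal': ' 1'}},
                'c5':   {'header': {'length': None, 'sentinal': ' 5'}},
            },
        }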
    

    """
    global sentinals, section_lens, sentinals_lens
    global parsing_order
    # Proceed to split sections if more than one
    # else return section in a named column
    if len(schema['sections'].keys()) > 1:
        section_lens = { section: schema['sections'][section]['header'].get('length') for section in schema['sections'].keys()}
        sentinals = { section: schema['sections'][section]['header'].get('sentinal') for section in schema['sections'].keys()}
        sentinals_lens = { section: len(sentinals.get(section)) if sentinals.get(section) else 0 for section in sentinals.keys()}
        parsing_order = schema['header']['parsing_order']
        # Get sections separated: section dict has a key:value pair for each
        # section in the data model. If the section does not exist in the data,
        # the value is an empty pd.Series
        section_dict = extract_sections(string_df)
        # Paste in order (as read_sections) in a single dataframe with columns
        # named as sections:
        # - Drop unwanted sections
        # - Keep requested but non-existent sections
        df_out = pd.DataFrame()
        for section in read_sections:
            df_out = pd.concat([df_out, section_dict[section].rename(section)], sort=False, axis=1)
    else:
        df_out = string_df
        df_out.columns = read_sections

    return df_out