From c0251df1431f68c2f7e762e3baa6ef704fda19be Mon Sep 17 00:00:00 2001
From: perezgonzalez-irene <iregon@noc.ac.uk>
Date: Thu, 27 Feb 2020 10:05:24 +0000
Subject: [PATCH] Rename the entry-point functions of the reader modules to main

---
 read.py                 | 12 +++++-----
 reader/get_sections.py  |  2 +-
 reader/import_data.py   |  2 +-
 reader/read_sections.py | 51 +++++++++++++++++++++--------------------
 4 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/read.py b/read.py
index efb41f0..0c0e57e 100644
--- a/read.py
+++ b/read.py
@@ -31,9 +31,7 @@ from io import StringIO as StringIO
 from .data_models import schemas
 from . import properties
 from .common import pandas_TextParser_hdlr
-from .reader import import_data
-from .reader import get_sections
-from .reader.read_sections import main as read_sections
+from .reader import import_data, get_sections, read_sections
 from .validator import validate
 
 toolPath = os.path.dirname(os.path.abspath(__file__))
@@ -77,13 +77,13 @@ def ERV(TextParser,read_sections_list, schema, code_tables_path):
         # - requested NA sections as NaN columns
         # - columns(sections) order as in read_sections_list
         
-        sections_df = get_sections.get_sections(string_df, schema, read_sections_list)
+        sections_df = get_sections.main(string_df, schema, read_sections_list)
 
         # 2. Read elements from sections: along data chunks, resulting data types
         # may vary if gaps, keep track of data types: add Intxx pandas classes rather than intxx to avoid this!
         # Sections are parsed in the same order as sections_df.columns
         
-        [data_df, valid_df, out_dtypesi ] = read_sections(sections_df, schema)
+        [data_df, valid_df, out_dtypesi ] = read_sections.main(sections_df, schema)
         if i_chunk == 0:
             out_dtypes = copy.deepcopy(out_dtypesi)
 
@@ -271,7 +271,7 @@ def main(source, data_model = None, data_model_path = None, sections = None,chun
     # 2.2 Homogeneize input data to an iterable with dataframes:
     # a list with a single dataframe or a pd.io.parsers.TextFileReader
     logging.info("Getting data string from source...")
-    TextParser = import_data.import_data(source, chunksize = chunksize, skiprows = skiprows)
+    TextParser = import_data.main(source, chunksize = chunksize, skiprows = skiprows)
 
     # 2.3. Extract, read and validate data in same loop
     logging.info("Extracting and reading sections")
diff --git a/reader/get_sections.py b/reader/get_sections.py
index acb0f2b..e623475 100644
--- a/reader/get_sections.py
+++ b/reader/get_sections.py
@@ -199,7 +199,7 @@ def extract_sections(string_df):
 #   ---------------------------------------------------------------------------
 #   MAIN
 #   ---------------------------------------------------------------------------
-def get_sections(string_df, schema, read_sections):
+def main(string_df, schema, read_sections):
     global sentinals, section_lens, sentinals_lens
     global parsing_order
     # Proceed to split sections if more than one
diff --git a/reader/import_data.py b/reader/import_data.py
index 889c5f5..0d25197 100644
--- a/reader/import_data.py
+++ b/reader/import_data.py
@@ -41,7 +41,7 @@ import os
 
 from .. import properties
 
-def import_data(source,chunksize = None, skiprows = None):
+def main(source,chunksize = None, skiprows = None):
 
     if os.path.isfile(source):
         TextParser = pd.read_fwf(source,widths=[properties.MAX_FULL_REPORT_WIDTH],header = None, delimiter="\t", skiprows = skiprows, chunksize = chunksize)
diff --git a/reader/read_sections.py b/reader/read_sections.py
index 5beab89..3bbda03 100644
--- a/reader/read_sections.py
+++ b/reader/read_sections.py
@@ -7,7 +7,7 @@ Extracts and reads (decodes, scales, etc...) the elements of data sections.
 Each column of the input dataframe is a section with all its elements stored
 as a single string.
 
-Working on a section by section basis, this module uses the data model 
+Working on a section by section basis, this module uses the data model
 information provided in the schema to split the elements, decode and scale them
 where appropriate and ensure its data type consistency.
 
@@ -24,12 +24,12 @@ DEV NOTES:
  # how to write that.....
  #https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue
  # quoting=csv.QUOTE_NONE was failing when a section is empty (or just one record in a section,...)
- sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t")   
+ sections_df[section].to_csv(section_buffer,header=False, encoding = 'utf-8',index = False,quoting=csv.QUOTE_NONE,escapechar="\\",sep="\t")
 
  But we were still experiencing problems when reading fully empty sections, now
  we only write to the section buffer reports that are not empty. We afterwards
  recover the indexes....
-    
+
 @author: iregon
 """
 
@@ -50,24 +50,24 @@ def extract_fixed_width(section_serie_bf,section_schema):
     section_elements = pd.read_fwf(section_serie_bf, widths = section_widths, header = None, names = section_names , na_values = section_missing, delimiter="\t", encoding = 'utf-8', dtype = 'object', skip_blank_lines = False )
     return section_elements
 
-def extract_delimited(section_serie_bf,section_schema): 
+def extract_delimited(section_serie_bf,section_schema):
     delimiter = section_schema['header'].get('delimiter')
     section_names = section_schema['elements'].keys()
     section_missing = { x:section_schema['elements'][x].get('missing_value') for x in section_names }
     section_elements = pd.read_csv(section_serie_bf,header = None, delimiter = delimiter, encoding = 'utf-8',
                                  dtype = 'object', skip_blank_lines = False,
                                  names = section_names, na_values = section_missing)
-    
+
     return section_elements
 
-def read_data(section_df,section_schema): 
+def read_data(section_df,section_schema):
     section_names = section_df.columns
     section_dtypes = { i:section_schema['elements'][i]['column_type'] for i in section_names }
     encoded = [ (x) for x in section_names if 'encoding' in section_schema['elements'][x]]
     section_encoding = { i:section_schema['elements'][i]['encoding'] for i in encoded }
     section_valid = pd.DataFrame(index = section_df.index, columns = section_df.columns)
-    
+
     for element in section_dtypes.keys():
         missing = section_df[element].isna()
         if element in encoded:
             section_df[element] = decoders.get(section_encoding.get(element)).get(section_dtypes.get(element))(section_df[element])
@@ -76,29 +77,29 @@ def read_data(section_df,section_schema):
         section_df[element] = converters.get(section_dtypes.get(element))(section_df[element], **kwargs)
 
         section_valid[element] = missing | section_df[element].notna()
-             
+
     return section_df,section_valid
 
-def read_sections(sections_df, schema):
-    
+def main(sections_df, schema):
+
     multiindex = True if len(sections_df.columns) > 1 or sections_df.columns[0] != properties.dummy_level else False
     data_df = pd.DataFrame(index = sections_df.index)
     valid_df = pd.DataFrame(index = sections_df.index)
     out_dtypes = dict()
-    
-    for section in sections_df.columns: 
+
+    for section in sections_df.columns:
         print('Reading section {}'.format(section))
         section_schema = schema['sections'].get(section)
         disable_read = section_schema.get('header').get('disable_read')
-        
-        if not disable_read:     
+
+        if not disable_read:
             field_layout = section_schema.get('header').get('field_layout')
             ignore = [ i for i in section_schema['elements'].keys() if section_schema['elements'][i].get('ignore') ] # evals to True if set and true, evals to False if not set or set and false
              # Get rid of false delimiters in fixed_width
             delimiter = section_schema['header'].get('delimiter')
             if delimiter and field_layout == 'fixed_width':
                 sections_df[section] = sections_df[section].str.replace(delimiter,'')
-        
+
             section_buffer = StringIO()
             # Here indices are lost, have to give the real ones, those in section_strings:
             # we'll see if we do that in the caller module or here....
@@ -112,9 +113,9 @@ def read_sections(sections_df, schema):
                 section_elements_obj = extract_fixed_width(section_buffer,section_schema)
             elif field_layout == 'delimited':
                 section_elements_obj = extract_delimited(section_buffer,section_schema)
-                
+
             section_elements_obj.drop(ignore, axis = 1, inplace = True)
-            
+
             # Read the objects to their data types and apply decoding, scaling and so on...
             # Give them their actual indexes back
             section_elements, section_valid = read_data(section_elements_obj,section_schema)
@@ -124,30 +125,30 @@ def read_sections(sections_df, schema):
         else:
             section_elements = pd.DataFrame(sections_df[section],columns = [section])
             section_valid = pd.DataFrame(index = section_elements.index,data = True, columns = [section])
-      
-        
+
+
         section_elements.columns = [ (section, x) for x in section_elements.columns] if multiindex else section_elements.columns
         section_valid.columns = section_elements.columns
         data_df = pd.concat([data_df,section_elements],sort = False,axis=1)
         valid_df = pd.concat([valid_df,section_valid],sort = False,axis=1)
-        
+
     # We do the actual out_dtypes here: because the full indexing occurs only
     # after concat, NaN values may arise only in data_df if a section is
     # not existing in a given report!
     for section in sections_df.columns:
         section_schema = schema['sections'].get(section)
-        if not section_schema.get('header').get('disable_read'): 
+        if not section_schema.get('header').get('disable_read'):
             elements = [ x[1] for x in data_df.columns if x[0] == section ]
             if multiindex:
                 out_dtypes.update({ (section,i):properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in elements } )
                 out_dtypes.update({ (section,i):data_df[(section,i)].dtype.name for i in elements if data_df[(section,i)].dtype.name in properties.numpy_floats})
             else:
-                out_dtypes.update({ i:properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in elements } ) 
+                out_dtypes.update({ i:properties.pandas_dtypes.get(section_schema['elements'][i].get('column_type')) for i in elements } )
                 out_dtypes.update({ i:data_df[i].dtype.name for i in section_elements if data_df[i].dtype.name in properties.numpy_floats})
         else:
             if multiindex:
                     out_dtypes.update({ (section,section):'object' } )
             else:
-                out_dtypes.update({ section:'object' } ) 
-                
-    return data_df, valid_df, out_dtypes
+                out_dtypes.update({ section:'object' } )
+
+    return data_df, valid_df, out_dtypes
-- 
GitLab