From bc30184ab7c9f09ba421fdc7eb24e67f0b1adb56 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 10 Feb 2021 19:07:36 +0000 Subject: [PATCH 1/9] Saving tables as parquet --- conda-recipe/meta.yaml | 3 ++- src/reportengine/table.py | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 78215e6..e7eff99 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -20,11 +20,12 @@ requirements: - jinja2 - ruamel_yaml =0.15 - matplotlib - - pandas >=1 + - pandas >=1.2.0 - pygments - blessings - curio - pandoc >=2 + - pyarrow test: requires: diff --git a/src/reportengine/table.py b/src/reportengine/table.py index c5ec6b8..5343a83 100644 --- a/src/reportengine/table.py +++ b/src/reportengine/table.py @@ -64,10 +64,28 @@ def prepare_path(*, spec, namespace,environment, **kwargs): path = environment.table_folder / (name + '.csv') return {'path': path} -def savetable(df, path): +def savetable(df, path, format=None): """Final action to save figures, with a nice filename""" log.debug("Writing table %s" % path) - df.to_csv(str(path), sep='\t', na_rep='nan') + + if format in (None, "parquet"): # Default to parquet format + # Need to change the type of each level to str + log.debug("Changing column types to str") + cols = df.columns + for i in range(cols.nlevels): + str_col = cols.levels[i].astype(str) + # Could use inplace but it's + # going to bedeprecated + cols = cols.set_levels(str_col, i) + df.columns = cols + df.to_parquet(str(path)) + elif format == "csv": + df.to_csv(str(path), sep='\t', na_rep='nan') + else: + raise NotImplementedError( + f"Unrecognised format {format}", + "choose one of parquet or csv" + ) return Table.fromdf(df, path=path) def savetablelist(dfs, path): From a83536d8b35bc3d0182823cc78d69ef3d850de75 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 10 Feb 2021 19:41:23 +0000 Subject: [PATCH 2/9] Sometimes we don't have a multiindex --- src/reportengine/table.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/reportengine/table.py b/src/reportengine/table.py index 5343a83..b912824 100644 --- a/src/reportengine/table.py +++ b/src/reportengine/table.py @@ -72,11 +72,14 @@ def savetable(df, path, format=None): # Need to change the type of each level to str log.debug("Changing column types to str") cols = df.columns - for i in range(cols.nlevels): - str_col = cols.levels[i].astype(str) - # Could use inplace but it's - # going to bedeprecated - cols = cols.set_levels(str_col, i) + if isinstance(cols, pd.MultiIndex): + for i in range(cols.nlevels): + str_col = cols.levels[i].astype(str) + # Could use inplace but it's + # going to bedeprecated + cols = cols.set_levels(str_col, i) + else: + cols = cols.astype(str) df.columns = cols df.to_parquet(str(path)) elif format == "csv": From 85e689501743ec971627f99e6b93e9a3a00f4692 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 16 Feb 2021 13:00:44 +0000 Subject: [PATCH 3/9] refactoring --- src/reportengine/table.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/reportengine/table.py b/src/reportengine/table.py index b912824..f2fa4d6 100644 --- a/src/reportengine/table.py +++ b/src/reportengine/table.py @@ -57,7 +57,19 @@ def as_markdown(self): res = re.sub('\n\s+', '\n', res) return res - +def str_columns(df): + log.debug("Changing column types to str") + cols = df.columns + if isinstance(cols, pd.MultiIndex): + for i in range(cols.nlevels): + str_col = cols.levels[i].astype(str) + # Could use inplace but it's + # going to bedeprecated + cols = cols.set_levels(str_col, i) + else: + cols = cols.astype(str) + df.columns = cols + return df def prepare_path(*, spec, namespace,environment, **kwargs): name = spec_to_nice_name(namespace, spec) @@ -70,17 +82,7 @@ def savetable(df, path, format=None): if format in (None, "parquet"): # Default to parquet format # Need to change the type of each level to str - log.debug("Changing column types to str") - cols = df.columns - if isinstance(cols, pd.MultiIndex): - for i in range(cols.nlevels): - str_col = cols.levels[i].astype(str) - # Could use inplace but it's - # going to bedeprecated - cols = cols.set_levels(str_col, i) - else: - cols = cols.astype(str) - df.columns = cols + df = str_columns(df) df.to_parquet(str(path)) elif format == "csv": df.to_csv(str(path), sep='\t', na_rep='nan') From 0de36c997d29b12d11455649314b3403d5b02722 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 18 Feb 2021 12:15:37 +0000 Subject: [PATCH 4/9] Raising exception instead of silent conversion --- src/reportengine/table.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/reportengine/table.py b/src/reportengine/table.py index f2fa4d6..d58e03a 100644 --- a/src/reportengine/table.py +++ b/src/reportengine/table.py @@ -81,9 +81,15 @@ def savetable(df, path, format=None): log.debug("Writing table %s" % path) if format in (None, "parquet"): # Default to parquet format - # Need to change the type of each level to str - df = str_columns(df) - df.to_parquet(str(path)) + try: + df.to_parquet(str(path)) + except ValueError as e: + # Need to change the type of each level to str + raise ValueError( + "To save a table in parquet format the column entries must all be of type str. " + "Consider using the helper function reportengine.table.str_columns before passing the " + "dataframe to the savetable function." + ) from e elif format == "csv": df.to_csv(str(path), sep='\t', na_rep='nan') else: From e6e5d69d936d83b8498f3293d050bcbb58905727 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 18 Feb 2021 12:52:10 +0000 Subject: [PATCH 5/9] Handling table format in CLI --- src/reportengine/app.py | 3 +++ src/reportengine/table.py | 11 ++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/reportengine/app.py b/src/reportengine/app.py index 84e0565..8b71d35 100644 --- a/src/reportengine/app.py +++ b/src/reportengine/app.py @@ -182,6 +182,9 @@ def argparser(self): help="additional providers from which to " "load actions. Must be an importable specifiaction.") + parser.add_argument('--table-format', default='parquet', choices=["parquet", "csv"], + help="Format to save tables as. Note csv is the only human readable format.") + parallel = parser.add_mutually_exclusive_group() parallel.add_argument('--parallel', action='store_true', help="execute actions in parallel") diff --git a/src/reportengine/table.py b/src/reportengine/table.py index d58e03a..8026759 100644 --- a/src/reportengine/table.py +++ b/src/reportengine/table.py @@ -71,16 +71,17 @@ def str_columns(df): df.columns = cols return df -def prepare_path(*, spec, namespace,environment, **kwargs): +def prepare_path(*, spec, namespace, environment, **kwargs): + suffix = environment.extra_args['table_format'] name = spec_to_nice_name(namespace, spec) - path = environment.table_folder / (name + '.csv') + path = environment.table_folder / (name + '.' + suffix) return {'path': path} -def savetable(df, path, format=None): +def savetable(df, path): """Final action to save figures, with a nice filename""" log.debug("Writing table %s" % path) - - if format in (None, "parquet"): # Default to parquet format + format = path.suffix[1:] + if format == "parquet": # Default to parquet format try: df.to_parquet(str(path)) except ValueError as e: From c3b6c07ea3699aed2053d623a42cb1a15558222b Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 18 Feb 2021 14:11:26 +0000 Subject: [PATCH 6/9] Adding option to save multiple table formats --- src/reportengine/app.py | 8 +++--- src/reportengine/environment.py | 11 ++++++-- src/reportengine/table.py | 47 ++++++++++++++++----------------- 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/src/reportengine/app.py b/src/reportengine/app.py index 8b71d35..c9952d7 100644 --- a/src/reportengine/app.py +++ b/src/reportengine/app.py @@ -175,16 +175,16 @@ def argparser(self): help='matplotlib style file to override the built-in one.', default=None) - parser.add_argument('--formats', nargs='+', help="formats of the output figures", + parser.add_argument('--figure-formats', nargs='+', help="formats of the output figures", default=('png', 'pdf',)) + parser.add_argument('--table-formats', nargs='+', default=('parquet',), choices=["parquet", "csv"], + help="Format to save tables as. Note csv is the only human readable format.") + parser.add_argument('-x', '--extra-providers', nargs='+', help="additional providers from which to " "load actions. Must be an importable specifiaction.") - parser.add_argument('--table-format', default='parquet', choices=["parquet", "csv"], - help="Format to save tables as. Note csv is the only human readable format.") - parallel = parser.add_mutually_exclusive_group() parallel.add_argument('--parallel', action='store_true', help="execute actions in parallel") diff --git a/src/reportengine/environment.py b/src/reportengine/environment.py index 4e567a8..1a3a71c 100644 --- a/src/reportengine/environment.py +++ b/src/reportengine/environment.py @@ -30,7 +30,8 @@ class EnvironmentError_(Exception): pass } class Environment: - def __init__(self, *, output=None, formats=('pdf',), + def __init__(self, *, output=None, + figure_formats=('pdf',), table_formats=('parquet',), default_figure_format=None, loglevel=logging.DEBUG, config_yml = None, **kwargs): @@ -38,7 +39,9 @@ def __init__(self, *, output=None, formats=('pdf',), self.output_path = pathlib.Path(output).absolute() else: self.output_path = output - self.figure_formats = formats + + self.table_formats = table_formats + self.figure_formats = figure_formats self._default_figure_format = default_figure_format self.loglevel = loglevel self.extra_args = kwargs @@ -111,6 +114,10 @@ def get_figure_paths(self, handle): for fmt in self.figure_formats: yield self.figure_folder / (handle + '.' + fmt) + def get_table_paths(self, handle): + for fmt in self.table_formats: + yield self.table_folder / (handle + '.' + fmt) + @classmethod def ns_dump_description(cls): return dict( diff --git a/src/reportengine/table.py b/src/reportengine/table.py index 8026759..18eb3a1 100644 --- a/src/reportengine/table.py +++ b/src/reportengine/table.py @@ -72,32 +72,31 @@ def str_columns(df): return df def prepare_path(*, spec, namespace, environment, **kwargs): - suffix = environment.extra_args['table_format'] - name = spec_to_nice_name(namespace, spec) - path = environment.table_folder / (name + '.' + suffix) - return {'path': path} + paths = environment.get_table_paths(spec_to_nice_name(namespace, spec)) + return {'paths': list(paths)} -def savetable(df, path): +def savetable(df, paths): """Final action to save figures, with a nice filename""" - log.debug("Writing table %s" % path) - format = path.suffix[1:] - if format == "parquet": # Default to parquet format - try: - df.to_parquet(str(path)) - except ValueError as e: - # Need to change the type of each level to str - raise ValueError( - "To save a table in parquet format the column entries must all be of type str. " - "Consider using the helper function reportengine.table.str_columns before passing the " - "dataframe to the savetable function." - ) from e - elif format == "csv": - df.to_csv(str(path), sep='\t', na_rep='nan') - else: - raise NotImplementedError( - f"Unrecognised format {format}", - "choose one of parquet or csv" - ) + for path in paths: + log.debug("Writing table %s" % path) + format = path.suffix[1:] + if format == "parquet": # Default to parquet format + try: + df.to_parquet(str(path)) + except ValueError as e: + # Need to change the type of each level to str + raise ValueError( + "To save a table in parquet format the column entries must all be of type str. " + "Consider using the helper function reportengine.table.str_columns before passing the " + "dataframe to the savetable function." + ) from e + elif format == "csv": + df.to_csv(str(path), sep='\t', na_rep='nan') + else: + raise NotImplementedError( + f"Unrecognised format {format}", + "choose one of parquet or csv" + ) return Table.fromdf(df, path=path) def savetablelist(dfs, path): From 869ad511cdc242ee9813ff3a2fe8c4323d6529b6 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 31 Mar 2021 15:39:01 +0100 Subject: [PATCH 7/9] Removing pyarrow as a reportengine dependency Reverting pandas version to 1 --- conda-recipe/meta.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index e7eff99..78215e6 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -20,12 +20,11 @@ requirements: - jinja2 - ruamel_yaml =0.15 - matplotlib - - pandas >=1.2.0 + - pandas >=1 - pygments - blessings - curio - pandoc >=2 - - pyarrow test: requires: From 76bd0a0822b08cc0b32fcc5b7898b119591b2dba Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 31 Mar 2021 15:41:34 +0100 Subject: [PATCH 8/9] Changing default table format to CSV --- src/reportengine/app.py | 2 +- src/reportengine/environment.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/reportengine/app.py b/src/reportengine/app.py index c9952d7..95c6b72 100644 --- a/src/reportengine/app.py +++ b/src/reportengine/app.py @@ -178,7 +178,7 @@ def argparser(self): parser.add_argument('--figure-formats', nargs='+', help="formats of the output figures", default=('png', 'pdf',)) - parser.add_argument('--table-formats', nargs='+', default=('parquet',), choices=["parquet", "csv"], + parser.add_argument('--table-formats', nargs='+', default=('csv',), choices=["parquet", "csv"], help="Format to save tables as. Note csv is the only human readable format.") parser.add_argument('-x', '--extra-providers', nargs='+', diff --git a/src/reportengine/environment.py b/src/reportengine/environment.py index 1a3a71c..004dd1a 100644 --- a/src/reportengine/environment.py +++ b/src/reportengine/environment.py @@ -31,7 +31,7 @@ class EnvironmentError_(Exception): pass class Environment: def __init__(self, *, output=None, - figure_formats=('pdf',), table_formats=('parquet',), + figure_formats=('pdf',), table_formats=('csv',), default_figure_format=None, loglevel=logging.DEBUG, config_yml = None, **kwargs): From 7d759ba38127aa2cbc6cdf771f3a650cdd13fc6e Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 31 Mar 2021 15:57:29 +0100 Subject: [PATCH 9/9] Adding check to see if parquet is installed --- src/reportengine/app.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/reportengine/app.py b/src/reportengine/app.py index 95c6b72..714da5e 100644 --- a/src/reportengine/app.py +++ b/src/reportengine/app.py @@ -284,6 +284,16 @@ def init(self, cmdline=None): import faulthandler faulthandler.enable() args = self.get_commandline_arguments(cmdline) + if 'parquet' in args['table_formats']: + try: + import pyarrow + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Failed to import module pyarrow. " + "This is a required dependency to save " + "tables in the parquet format. " + "Please run conda install pyarrow and try again." + ) self.init_logging(args) sys.excepthook = self.excepthook try: