Source code for isaricanalytics.utils

from __future__ import annotations

__all__ = [
    "clean_figure_table",
    "strip_html",
    "strip_nonstandard_unicode_chars",
]

# -- IMPORTS --

# -- Standard libraries --
import re
import typing

# -- 3rd party libraries --
import pandas

# -- Internal libraries --


# Matches HTML comments, plus anything shaped like a tag: ``<`` optionally
# followed by ``/``, ``!`` or ``?``, then an ASCII letter, then everything up
# to the next ``>``. Requiring a letter keeps comparison text such as
# ``<0.001 vs >0.5`` intact, and the negated character class (unlike ``.``)
# also spans newlines, so tags whose attributes wrap onto another line are
# still removed. Compiled once because the function is applied element-wise.
_HTML_ELEMENT_PATTERN = re.compile(
    r"<!--.*?-->|<[/!?]?[A-Za-z][^<>]*>",
    re.DOTALL,
)


def strip_html(value: typing.Any) -> str | typing.Any:
    """:py:class:`typing.Any` : Strip HTML elements from a value.

    Only the markup itself (opening, closing and self-closing tags,
    declarations and comments) is removed; the text between tags is kept.
    A ``<`` that is not followed by a tag name, e.g. in ``"<0.001"``, is
    not treated as markup and is left untouched.

    Parameters
    ----------
    value : typing.Any
        A value.

    Returns
    -------
    str, typing.Any
        Either a string stripped of all HTML elements, or the original non-
        string value.
    """
    if isinstance(value, str):
        return _HTML_ELEMENT_PATTERN.sub("", value)

    # Non-string values (numbers, ``None``, NaN, ...) are passed through
    # unchanged so the function can be mapped over mixed-type data.
    return value
def strip_nonstandard_unicode_chars(value: typing.Any) -> str | typing.Any:
    """:py:class:`typing.Any` : Strip non-standard Unicode characters from a value.

    The non-standard Unicode characters of interest are defined within the
    function itself, and are currently limited to the "↳" (U+21B3)
    character, but may be extended to include other characters.

    Parameters
    ----------
    value : typing.Any
        A value.

    Returns
    -------
    str, typing.Any
        Either a string stripped of all non-standard Unicode characters, or
        the original non-string value.
    """
    # Every character in this string is deleted from string values. To
    # extend the set, simply append further characters to it.
    nonstandard_unicode_chars = "↳"

    if not isinstance(value, str):
        # Non-string values are passed through unchanged.
        return value

    # A translation table that maps each listed character to ``None``
    # deletes it. Unlike interpolating the characters into a regex
    # character class, this needs no escaping, so adding characters such
    # as "]", "^", "-" or "\\" to the set above cannot break the removal.
    deletion_table = str.maketrans("", "", nonstandard_unicode_chars)
    return value.translate(deletion_table)
def clean_figure_table(figure_table: pandas.DataFrame) -> pandas.DataFrame:
    """:py:class:`pandas.DataFrame` : A cleaned figure table dataframe.

    The cleaning steps are specific to the Plotly graph object table format
    from which the table CSVs were originally derived. Tables in that format
    contain HTML styling elements and non-standard (non-alphabetic) Unicode
    characters, and cleaning consists of removing both from every cell.

    Parameters
    ----------
    figure_table : pandas.DataFrame
        The original figure table as a Pandas dataframe.

    Returns
    -------
    pandas.DataFrame
        The cleaned figure table.
    """
    # Cleaners are applied in order, each as its own element-wise pass over
    # the whole dataframe.
    #
    # `pandas.DataFrame.map` is not the fastest option, because it visits
    # every cell individually. It is the safer one, though: the dataframe may
    # hold any number of non-string columns that cannot be known in advance,
    # and each cleaner only alters string values, returning anything else
    # unchanged.
    cell_cleaners = (strip_html, strip_nonstandard_unicode_chars)

    cleaned_table = figure_table
    for cell_cleaner in cell_cleaners:
        cleaned_table = cleaned_table.map(cell_cleaner)

    return cleaned_table