291 lines
14 KiB
Python
291 lines
14 KiB
Python
"""
|
|
Functions for serializing a Pandas DataFrame.
|
|
|
|
Much of the code here is inspired by that in the aioinflux package found here: https://github.com/gusutabopb/aioinflux
|
|
"""
|
|
|
|
import logging
|
|
import math
|
|
import re
|
|
|
|
from influxdb_client import WritePrecision
|
|
from influxdb_client.client.write.point import _ESCAPE_KEY, _ESCAPE_STRING, _ESCAPE_MEASUREMENT, DEFAULT_WRITE_PRECISION
|
|
|
|
# Module-level logger; its name is spelled out as a literal rather than derived from __name__.
logger = logging.getLogger('influxdb_client.client.write.dataframe_serializer')
|
|
|
|
|
|
def _itertuples(data_frame):
|
|
cols = [data_frame.iloc[:, k] for k in range(len(data_frame.columns))]
|
|
return zip(data_frame.index, *cols)
|
|
|
|
|
|
class DataframeSerializer:
    """
    Serialize DataFrame into LineProtocols.

    The constructor inspects the DataFrame once and compiles a single per-row
    formatting function; :meth:`serialize` then applies that function to every
    row (or to every row of one chunk) and returns the resulting list of strings.
    """

    def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION, chunk_size: int = None,
                 **kwargs) -> None:
        """
        Init serializer.

        :param data_frame: Pandas DataFrame to serialize
        :param point_settings: Default Tags
        :param precision: The precision for the unix timestamps within the body line-protocol.
        :param chunk_size: The size of chunk for serializing into chunks.
        :key data_frame_measurement_name: name of measurement for writing Pandas DataFrame
        :key data_frame_tag_columns: list of DataFrame columns which are tags, rest columns will be fields
        :key data_frame_timestamp_column: name of DataFrame column which contains a timestamp. The column can be defined as a :class:`~str` value
                                          formatted as `2018-10-26`, `2018-10-26 12:00`, `2018-10-26 12:00:00-05:00`
                                          or other formats and types supported by `pandas.to_datetime <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html#pandas.to_datetime>`_ - ``DataFrame``
        :key data_frame_timestamp_timezone: name of the timezone which is used for timestamp column - ``DataFrame``
        :raises TypeError: if ``data_frame`` is not a DataFrame or ``data_frame_measurement_name`` is missing
        """  # noqa: E501
        # This function is hard to understand but for good reason:
        # the approach used here is considerably more efficient
        # than the alternatives.
        #
        # We build up a Python expression that efficiently converts a data point
        # tuple into line-protocol entry, and then evaluate the expression
        # as a lambda so that we can call it. This avoids the overhead of
        # invoking a function on every data value - we only have one function
        # call per row instead. The expression consists of exactly
        # one f-string, so we build up the parts of it as segments
        # that are concatenated together to make the full f-string inside
        # the lambda.
        #
        # Things are made a little more complex because fields and tags with NaN
        # values and empty tags are omitted from the generated line-protocol
        # output.
        #
        # As an example, say we have a data frame with two value columns:
        #        a float
        #        b int
        #
        # This will generate a lambda expression to be evaluated that looks like
        # this:
        #
        #     lambda p: f"""{measurement_name} {keys[0]}={p[1]},{keys[1]}={p[2]}i {p[0].value}"""
        #
        # This lambda is then executed for each row p.
        #
        # When NaNs are present, the expression looks like this (split
        # across two lines to satisfy the code-style checker)
        #
        #    lambda p: f"""{measurement_name} {"" if pd.isna(p[1])
        #    else f"{keys[0]}={p[1]}"},{keys[1]}={p[2]}i {p[0].value}"""
        #
        # When there's a NaN value in column a, we'll end up with a comma at the start of the
        # fields, so we run a regexp substitution after generating the line-protocol entries
        # to remove this.
        #
        # We're careful to run these potentially costly extra steps only when NaN values actually
        # exist in the data.

        from ...extras import pd, np
        if not isinstance(data_frame, pd.DataFrame):
            raise TypeError('Must be DataFrame, but type was: {0}.'
                            .format(type(data_frame)))

        data_frame_measurement_name = kwargs.get('data_frame_measurement_name')
        if data_frame_measurement_name is None:
            raise TypeError('"data_frame_measurement_name" is a Required Argument')

        timestamp_column = kwargs.get('data_frame_timestamp_column', None)
        timestamp_timezone = kwargs.get('data_frame_timestamp_timezone', None)
        # Shallow copy: columns/index assigned below land on the copy, not on the caller's frame.
        data_frame = data_frame.copy(deep=False)
        data_frame_timestamp = data_frame.index if timestamp_column is None else data_frame[timestamp_column]
        if isinstance(data_frame_timestamp, pd.PeriodIndex):
            data_frame_timestamp = data_frame_timestamp.to_timestamp()
        else:
            # TODO: this is almost certainly not what you want
            # when the index is the default RangeIndex.
            # Instead, it would probably be better to leave
            # out the timestamp unless a time column is explicitly
            # enabled.
            data_frame_timestamp = pd.to_datetime(data_frame_timestamp, unit=precision)

        if timestamp_timezone:
            if isinstance(data_frame_timestamp, pd.DatetimeIndex):
                data_frame_timestamp = data_frame_timestamp.tz_localize(timestamp_timezone)
            else:
                data_frame_timestamp = data_frame_timestamp.dt.tz_localize(timestamp_timezone)

        # Naive timestamps exposing ``tzinfo`` are treated as UTC.
        if hasattr(data_frame_timestamp, 'tzinfo') and data_frame_timestamp.tzinfo is None:
            data_frame_timestamp = data_frame_timestamp.tz_localize('UTC')
        if timestamp_column is None:
            data_frame.index = data_frame_timestamp
        else:
            data_frame[timestamp_column] = data_frame_timestamp

        data_frame_tag_columns = kwargs.get('data_frame_tag_columns')
        data_frame_tag_columns = set(data_frame_tag_columns or [])

        # keys holds a list of string keys.
        keys = []
        # tags holds a list of tag f-string segments ordered alphabetically by tag key.
        tags = []
        # fields holds a list of field f-string segments ordered alphabetically by field key
        fields = []
        # field_indexes holds the index into each row of all the fields.
        field_indexes = []

        if point_settings.defaultTags:
            for key, value in point_settings.defaultTags.items():
                # Avoid overwriting existing data if there's a column
                # that already exists with the default tag's name.
                # Note: when a new column is added, the old DataFrame
                # that we've made a shallow copy of is unaffected.
                # TODO: when there are NaN or empty values in
                # the column, we could make a deep copy of the
                # data and fill in those values with the default tag value.
                if key not in data_frame.columns:
                    data_frame[key] = value
                    data_frame_tag_columns.add(key)

        # Turn empty-string tag values into NaN *before* the null scan below.
        # Doing it afterwards (as this code previously did) meant a tag column
        # holding '' but no NaN was formatted unconditionally and emitted the
        # literal text "nan" as the tag value instead of omitting the tag.
        for column_label in dict(data_frame.dtypes):
            if column_label in data_frame_tag_columns:
                data_frame = data_frame.replace({column_label: ''}, np.nan)

        # Get a list of all the columns sorted by field/tag key.
        # We want to iterate through the columns in sorted order
        # so that we know when we're on the first field so we
        # can know whether a comma is needed for that
        # field.
        columns = sorted(enumerate(data_frame.dtypes.items()), key=lambda col: col[1][0])

        # null_columns has a bool value for each column holding
        # whether that column contains any null (NaN or None) values.
        null_columns = data_frame.isnull().any()
        timestamp_index = 0

        # Iterate through the columns building up the expression for each column.
        for index, (key, value) in columns:
            key = str(key)
            key_format = f'{{keys[{len(keys)}]}}'
            keys.append(key.translate(_ESCAPE_KEY))
            # The field index is one more than the column index because the
            # time index is at column zero in the finally zipped-together
            # result columns.
            field_index = index + 1
            val_format = f'p[{field_index}]'

            if key in data_frame_tag_columns:
                # This column is a tag column.
                # Tag values are escaped with _ESCAPE_KEY in both branches so that
                # the output does not depend on whether the column happens to
                # contain nulls.
                if null_columns.iloc[index]:
                    # pd.isna() is tested first so that a pd.NA value short-circuits
                    # before the equality comparison (whose truth value is ambiguous).
                    key_value = f"""{{
                            '' if pd.isna({val_format}) or {val_format} == '' else
                            f',{key_format}={{str({val_format}).translate(_ESCAPE_KEY)}}'
                        }}"""
                else:
                    key_value = f',{key_format}={{str({val_format}).translate(_ESCAPE_KEY)}}'
                tags.append(key_value)
                continue
            elif timestamp_column is not None and key == str(timestamp_column):
                # Exact match only: a substring test here would also swallow
                # field columns whose name merely occurs inside the timestamp
                # column's name (e.g. "time" inside "timestamp").
                timestamp_index = field_index
                continue

            # This column is a field column.
            # Note: no comma separator is needed for the first field.
            # It's important to omit it because when the first
            # field column has no nulls, we don't run the comma-removal
            # regexp substitution step.
            sep = '' if len(field_indexes) == 0 else ','
            if issubclass(value.type, (np.integer, np.floating, np.bool_)):
                # Numeric/bool fields are written bare; integers get the 'i' suffix.
                suffix = 'i' if issubclass(value.type, np.integer) else ''
                if null_columns.iloc[index]:
                    field_value = f"""{{"" if pd.isna({val_format}) else f"{sep}{key_format}={{{val_format}}}{suffix}"}}"""  # noqa: E501
                else:
                    field_value = f"{sep}{key_format}={{{val_format}}}{suffix}"
            else:
                # Everything else is written as a double-quoted, escaped string field.
                if null_columns.iloc[index]:
                    field_value = f"""{{
                            '' if pd.isna({val_format}) else
                            f'{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'
                        }}"""
                else:
                    field_value = f'''{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'''
            field_indexes.append(field_index)
            fields.append(field_value)

        measurement_name = str(data_frame_measurement_name).translate(_ESCAPE_MEASUREMENT)

        tags = ''.join(tags)
        fields = ''.join(fields)
        # ``.value`` on the row's timestamp is scaled down for coarser precisions.
        timestamp = '{p[%s].value}' % timestamp_index
        if precision == WritePrecision.US:
            timestamp = '{int(p[%s].value / 1e3)}' % timestamp_index
        elif precision == WritePrecision.MS:
            timestamp = '{int(p[%s].value / 1e6)}' % timestamp_index
        elif precision == WritePrecision.S:
            timestamp = '{int(p[%s].value / 1e9)}' % timestamp_index

        # The evaluated source only contains positional references (``keys[i]``,
        # ``p[i]``) plus fixed separators/suffixes; column names, the measurement
        # name and cell values are supplied through the namespace or the row
        # tuple, never spliced into the expression text.
        f = eval(f'lambda p: f"""{{measurement_name}}{tags} {fields} {timestamp}"""', {
            'measurement_name': measurement_name,
            '_ESCAPE_KEY': _ESCAPE_KEY,
            '_ESCAPE_STRING': _ESCAPE_STRING,
            'keys': keys,
            'pd': pd,
        })

        def _any_not_nan(p, indexes):
            # True when at least one of the given row positions holds a non-null value.
            return any(map(lambda x: not pd.isna(p[x]), indexes))

        self.data_frame = data_frame
        self.f = f
        self.field_indexes = field_indexes
        # Only the first field is emitted without a leading comma, so only its
        # nullability decides whether serialize() needs the comma clean-up pass.
        self.first_field_maybe_null = null_columns.iloc[field_indexes[0] - 1]
        self._any_not_nan = _any_not_nan

        #
        # prepare chunks
        #
        if chunk_size is not None:
            self.number_of_chunks = int(math.ceil(len(data_frame) / float(chunk_size)))
            self.chunk_size = chunk_size
        else:
            self.number_of_chunks = None

    def serialize(self, chunk_idx: int = None):
        """
        Serialize chunk into LineProtocols.

        :param chunk_idx: The index of chunk to serialize. If `None` then serialize whole dataframe.
        :return: list of line-protocol strings, one per serialized row
        """
        if chunk_idx is None:
            chunk = self.data_frame
        else:
            logger.debug("Serialize chunk %s/%s ...", chunk_idx + 1, self.number_of_chunks)
            chunk = self.data_frame[chunk_idx * self.chunk_size:(chunk_idx + 1) * self.chunk_size]

        if self.first_field_maybe_null:
            # When the first field is null (None/NaN), we'll have
            # a spurious leading comma which needs to be removed.
            # Rows in which every field is null are skipped entirely.
            lp = (re.sub('^(( |[^ ])* ),([a-zA-Z0-9])(.*)', '\\1\\3\\4', self.f(p))
                  for p in filter(lambda x: self._any_not_nan(x, self.field_indexes), _itertuples(chunk)))
            return list(lp)
        else:
            return list(map(self.f, _itertuples(chunk)))

    def number_of_chunks(self):
        """
        Return the number of chunks.

        NOTE: ``__init__`` assigns an instance attribute with this same name, so
        ``serializer.number_of_chunks`` on an instance evaluates to that value
        (an ``int`` or ``None``) and this method is never reached through an
        instance. It is kept so the class-level name stays defined.

        :return: number of chunks or None if chunk_size is not specified.
        """
        return self.number_of_chunks
|
|
|
|
|
|
def data_frame_to_list_of_points(data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION, **kwargs):
    """
    Convert a whole DataFrame into a list of line-protocol entries.

    Convenience wrapper: builds a :class:`DataframeSerializer` without chunking
    and serializes the complete frame in one go.

    :param data_frame: Pandas DataFrame to serialize
    :param point_settings: Default Tags
    :param precision: The precision for the unix timestamps within the body line-protocol.
    :key data_frame_measurement_name: name of measurement for writing Pandas DataFrame
    :key data_frame_tag_columns: list of DataFrame columns which are tags, rest columns will be fields
    :key data_frame_timestamp_column: name of DataFrame column which contains a timestamp. The column can be defined as a :class:`~str` value
                                      formatted as `2018-10-26`, `2018-10-26 12:00`, `2018-10-26 12:00:00-05:00`
                                      or other formats and types supported by `pandas.to_datetime <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html#pandas.to_datetime>`_ - ``DataFrame``
    :key data_frame_timestamp_timezone: name of the timezone which is used for timestamp column - ``DataFrame``
    :return: the result of :meth:`DataframeSerializer.serialize` for the full frame
    """  # noqa: E501
    serializer = DataframeSerializer(data_frame, point_settings, precision, **kwargs)
    return serializer.serialize()
|