filter_operator

Operator to filter rows or days from patient data

class tasrif.processing_pipeline.custom.filter_operator.FilterOperator(participant_identifier='id', date_feature_name='time', epoch_filter=None, day_filter=None, participant_filter=None, filter_type='include')

Examples

>>> import pandas as pd
>>> import numpy as np
>>> import datetime
>>>
>>> from tasrif.processing_pipeline.custom import FilterOperator
>>>
>>> df = pd.DataFrame({
...         'Hours': pd.date_range('2018-01-01', '2018-01-10', freq='1H', closed='left'),
...         'Steps': np.random.randint(100,10000, size=9*24),
...         }
...      )
>>>
>>> ids = []
>>> for i in range(1, 217):
...     ids.append(i%10 + 1)
>>>
>>> df["Id"] = ids
>>>
>>>
>>> # Add day for id 1
>>> df = df.append({'Hours': datetime.datetime(2020, 2, 2), 'Steps': 2000, 'Id': 1}, ignore_index=True)
>>>
>>> # Keep only the last 5 rows of id 10 (drop all of its earlier rows)
>>> id_10_indices = df.loc[df.Id == 10].index.values[:-5]
>>> df = df[~df.index.isin(id_10_indices)]
>>>
>>> operator = FilterOperator(participant_identifier="Id",
...                           date_feature_name="Hours",
...                           epoch_filter=lambda df: df['Steps'] > 10,
...                           day_filter={
...                               "column": "Hours",
...                               "filter": lambda x: x.count() < 10,
...                               "consecutive_days": (7, 12) # 7 minimum consecutive days, and 12 max
...                           },
...                           filter_type="include")
>>> operator.process(df)[0]
Hours   Steps   Id
0   2018-01-01 09:00:00     6232    1
1   2018-01-01 19:00:00     4623    1
2   2018-01-02 05:00:00     4094    1
3   2018-01-02 15:00:00     1800    1
4   2018-01-03 01:00:00     1861    1
...     ...     ...     ...
190     2018-01-07 23:00:00     9116    9
191     2018-01-08 09:00:00     7265    9
192     2018-01-08 19:00:00     4608    9
193     2018-01-09 05:00:00     8709    9
194     2018-01-09 15:00:00     8970    9
>>> df = pd.DataFrame([
...     [1, "2020-05-01 00:00:00", "1", "3"],
...     [1, "2020-05-01 01:00:00", "1", "5" ],
...     [2, "2020-05-01 03:00:00", "2", "3"],
...     [2, "2020-05-02 00:00:00", "1", "10"],
...     [3, "2020-05-02 01:00:00", "1", "0"],
...     [4, "2020-05-03 01:00:00", "1", "0"]],
...     columns=['logId', 'timestamp', 'sleep_level', 'awake_count'])
>>>
>>> op = FilterParticipantsOperator(participant_identifier="logId",
...                                 participants=[1, 3],
...                                 filter_type="include",)
>>> df1 = op.process(df)
>>> df1[0]
logId   timestamp   sleep_level     awake_count
0   1   2020-05-01 00:00:00     1   3
1   1   2020-05-01 01:00:00     1   5
4   3   2020-05-02 01:00:00     1   0
__init__(participant_identifier='id', date_feature_name='time', epoch_filter=None, day_filter=None, participant_filter=None, filter_type='include')

Initializes the operator

Parameters
  • participant_identifier (str) – patient identifier column

  • date_feature_name (str) – time series column

  • epoch_filter (str, callable) – row filter

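  • day_filter (dict) – filter the days per patient; in the example above this is a dict with a "column" to evaluate, a "filter" callable, and a "consecutive_days" (min, max) tuple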

  • participant_filter (list) – participants to include or exclude

  • filter_type (str) – whether to include the filtered epochs and days (the default, 'include') or exclude them.