# Source code for eBoruta.utils

import logging
import sys
import typing as t
from itertools import tee
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_regression

# Generic type variables used in the annotations of the helpers in this module.
A = t.TypeVar("A")
B = t.TypeVar("B")



# [docs]
def zip_partition(
    pred: t.Callable[[B], bool], a: t.Iterable[A], b: t.Iterable[B]
) -> t.Tuple[t.Iterator[A], t.Iterator[A]]:
    """
    Lazily split the elements of ``a`` in two groups, judging each element
    by the element of ``b`` it is zipped with.

    Pairing is done via ``zip``, so it stops at the shorter of the two inputs.

    :param pred: Predicate applied to elements of ``b``.
    :param a: Elements to partition.
    :param b: Elements the predicate is evaluated on.
    :return: A pair of iterators over elements of ``a``: first those whose
        partner in ``b`` failed the predicate, then those whose partner
        passed it.
    """
    verdicts = ((pred(key), item) for item, key in zip(a, b))
    # Both output iterators replay the same (verdict, item) stream, hence ``tee``.
    failed_src, passed_src = tee(verdicts)
    failed = (item for ok, item in failed_src if not ok)
    passed = (item for ok, item in passed_src if ok)
    return failed, passed




# [docs]
def convert_to_array(a: t.Any) -> np.ndarray:
    """
    Convert an arbitrary object into a numpy array.

    :param a: Any object.
    :return: The result of ``np.array(a)``.
    :raise TypeError: if ``np.array(a)`` raises any exception; the original
        exception is attached as the cause.
    """
    try:
        converted = np.array(a)
    except Exception as err:
        message = (
            f"Input type is not supported: failed to convert type {type(a)} "
            f"into an array due to {err}"
        )
        raise TypeError(message) from err
    return converted




# [docs]
def get_duplicates(it: t.Iterable[A]) -> t.Iterator[A]:
    """
    Lazily yield the elements that repeat an earlier element of the iterable.

    An element is yielded every time it occurs again after its first
    occurrence, so a value present three times is yielded twice.

    :param it: Any iterable; elements are compared with ``==`` and need not
        be hashable.
    :return: Iterator over repeated elements in the order they are met.
    """
    # A list (not a set) is kept on purpose so unhashable elements work too.
    seen: t.List[A] = []
    for x in it:
        if x in seen:
            yield x
        else:
            # Record first occurrences; without this nothing is ever "seen".
            seen.append(x)




# [docs]
def setup_logger(
    log_path: t.Optional[t.Union[str, Path]] = None,
    file_level: t.Optional[int] = None,
    stdout_level: t.Optional[int] = None,
    stderr_level: t.Optional[int] = None,
    logger: t.Optional[logging.Logger] = None,
) -> logging.Logger:
    """
    Attach file and/or stream handlers to a logger.

    Every handler added here shares the same message format. Handlers are
    added in the order: file, stderr, stdout. The level of the logger itself
    is not modified.

    :param log_path: If given, a file handler is added that opens this path
        in ``"w"`` mode.
    :param file_level: Level of the file handler; ``logging.DEBUG`` is used
        when it is ``None`` (or otherwise falsy).
    :param stdout_level: If given, a ``sys.stdout`` handler with this level
        is added.
    :param stderr_level: If given, a ``sys.stderr`` handler with this level
        is added.
    :param logger: Logger to configure. If ``None``, the logger named after
        this module is used.
    :return: The configured logger.
    """
    fmt = logging.Formatter(
        "%(asctime)s %(levelname)s [%(module)s--%(funcName)s]: %(message)s"
    )
    target = logging.getLogger(__name__) if logger is None else logger

    def attach(handler: logging.Handler, level: int) -> None:
        # Shared setup for every handler kind.
        handler.setFormatter(fmt)
        handler.setLevel(level)
        target.addHandler(handler)

    if log_path is not None:
        attach(logging.FileHandler(log_path, "w"), file_level or logging.DEBUG)
    if stderr_level is not None:
        attach(logging.StreamHandler(sys.stderr), stderr_level)
    if stdout_level is not None:
        attach(logging.StreamHandler(sys.stdout), stdout_level)

    return target




# [docs]
def sample_dataset(
    regression: bool = False, multiclass: bool = False, multitarget: bool = False
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Make a sample dataset with 30 features and 100 samples.

    :param regression: Regression objective. Otherwise, classification assumed.
    :param multiclass: Multiple (3) classes for classification; two classes
        otherwise. Ignored for regression.
    :param multitarget: Multiple (3) targets for regression; a single target
        otherwise. Ignored for classification.
    :return: DataFrame with predictors (columns ``X_1``..``X_30``) and
        DataFrame with response variables (a single ``Y`` column, or
        ``Y_1``..``Y_k`` for a multi-target response).
    """
    if regression:
        x, y = make_regression(
            n_samples=100,
            n_features=30,
            n_informative=5,
            # A single target unless multiple targets are explicitly requested.
            n_targets=3 if multitarget else 1,
        )
    else:
        x, y = make_classification(
            n_samples=100,
            n_features=30,
            n_informative=5,
            n_repeated=2,
            n_redundant=3,
            n_classes=3 if multiclass else 2,
        )
    # A 1-D response becomes a single "Y" column; a 2-D one gets a column per target.
    if y.ndim == 1:
        y_colnames = ["Y"]
    else:
        y_colnames = [f"Y_{i}" for i in range(1, y.shape[1] + 1)]
    x_colnames = [f"X_{i}" for i in range(1, x.shape[1] + 1)]
    df_x = pd.DataFrame(x, columns=x_colnames)
    df_y = pd.DataFrame(y, columns=y_colnames)
    return df_x, df_y



if __name__ == "__main__":
    # Executing this module as a script raises immediately.
    raise RuntimeError