Skip to content

clinops.preprocess

clinops.preprocess.outliers.ClinicalOutlierClipper

ClinicalOutlierClipper(
    bounds=None,
    action="clip",
    extra_bounds=None,
    strict=False,
)

Detect and clip physiologically impossible values in clinical DataFrames.

Uses published physiological bounds to identify values that are impossible regardless of patient state. Values outside bounds are clipped to the boundary (default), replaced with NaN, or flagged in a companion column, depending on action.

Parameters:

Name Type Description Default
bounds dict[str, BoundSpec] | None

Dict mapping column name to BoundSpec. Defaults to combined VITAL_BOUNDS + LAB_BOUNDS. Pass a custom dict to override.

None
action str

What to do with out-of-range values: - "clip" : replace with the boundary value (default) - "null" : replace with NaN - "flag" : add an integer (0/1) {col}_outlier column, do not modify value

'clip'
extra_bounds dict[str, BoundSpec] | None

Additional BoundSpec entries merged on top of the active bounds (the defaults, or bounds when it is given). Useful for site-specific or assay-specific ranges.

None
strict bool

If True, raise ValueError when a column in bounds is not found in the DataFrame. If False (default), silently skip missing cols.

False

Examples:

>>> clipper = ClinicalOutlierClipper()
>>> clean_df = clipper.fit_transform(vitals_df)
>>> print(clipper.report())
Source code in clinops/preprocess/outliers.py
def __init__(
    self,
    bounds: dict[str, BoundSpec] | None = None,
    action: str = "clip",
    extra_bounds: dict[str, BoundSpec] | None = None,
    strict: bool = False,
) -> None:
    """
    Configure the clipper.

    Parameters
    ----------
    bounds:
        Column name → BoundSpec mapping. When given it replaces the
        default ``VITAL_BOUNDS`` + ``LAB_BOUNDS`` combination.
    action:
        One of ``"clip"``, ``"null"`` or ``"flag"``.
    extra_bounds:
        Entries merged on top of the active bounds (the defaults, or
        ``bounds`` when it is given).
    strict:
        Stored as ``self.strict``; controls whether a bounded column
        missing from the DataFrame is an error.

    Raises
    ------
    ValueError
        If ``action`` is not one of the supported values.
    """
    if action not in ("clip", "null", "flag"):
        raise ValueError(f"action must be 'clip', 'null', or 'flag' — got {action!r}")

    # Always keep a private dict: extra_bounds (below) and add_bounds()
    # mutate self._bounds, and that must never leak into the caller's dict.
    if bounds is not None:
        active_bounds = dict(bounds)
    else:
        active_bounds = {**VITAL_BOUNDS, **LAB_BOUNDS}
    if extra_bounds:
        active_bounds.update(extra_bounds)

    self._bounds = active_bounds
    self.action = action
    self.strict = strict
    self._report: list[dict[str, Any]] = []

fit_transform

fit_transform(df)

Clip or flag outliers in df using the configured bounds.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame. Only columns present in bounds are processed.

required

Returns:

Type Description
DataFrame

DataFrame with outliers handled according to action.

Source code in clinops/preprocess/outliers.py
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Clip, null or flag outliers in df using the configured bounds.

    Works on a copy of ``df``. Bounded columns that are absent are
    skipped (or rejected when ``strict`` is set) and non-numeric columns
    are ignored. The per-column summary available through ``report()``
    is rebuilt on every call.

    Parameters
    ----------
    df:
        Input DataFrame. Only columns present in bounds are processed.

    Returns
    -------
    pd.DataFrame
        DataFrame with outliers handled according to ``action``. With
        ``action="flag"`` every checked numeric column gets a 0/1
        ``{col}_outlier`` companion column, whether or not it contains
        outliers, so the output schema does not depend on the data.

    Raises
    ------
    ValueError
        If ``strict`` is set and a bounded column is missing from ``df``.
    """
    df = df.copy()
    self._report = []

    for col, spec in self._bounds.items():
        if col not in df.columns:
            if self.strict:
                raise ValueError(f"Column '{col}' not found in DataFrame")
            continue

        series = df[col]
        if not pd.api.types.is_numeric_dtype(series):
            continue

        low_mask = series < spec.low
        high_mask = series > spec.high
        outlier_mask = low_mask | high_mask
        n_low = int(low_mask.sum())
        n_high = int(high_mask.sum())

        # Written before the "no outliers" shortcut so flag mode yields
        # the same columns for clean and dirty data alike.
        if self.action == "flag":
            df[f"{col}_outlier"] = outlier_mask.astype(int)

        if n_low == 0 and n_high == 0:
            continue

        self._report.append(
            {
                "column": col,
                "low_outliers": n_low,
                "high_outliers": n_high,
                "total_outliers": n_low + n_high,
                "pct_outliers": round(100 * (n_low + n_high) / len(series), 3),
                "bound_low": spec.low,
                "bound_high": spec.high,
                "unit": spec.unit,
            }
        )

        logger.debug(
            f"{col}: {n_low} below {spec.low}{spec.unit}, "
            f"{n_high} above {spec.high}{spec.unit} → action={self.action}"
        )

        if self.action == "clip":
            df[col] = series.clip(lower=spec.low, upper=spec.high)
        elif self.action == "null":
            # Series.mask widens integer columns to a NaN-capable dtype;
            # writing NaN through .loc into an int column is an
            # incompatible-dtype assignment that pandas is phasing out.
            df[col] = series.mask(outlier_mask)

    n_affected = sum(r["total_outliers"] for r in self._report)
    if n_affected:
        logger.info(
            f"ClinicalOutlierClipper: {n_affected:,} outlier values across "
            f"{len(self._report)} columns (action={self.action})"
        )
    else:
        logger.info("ClinicalOutlierClipper: no outliers detected")

    return df

report

report()

Return a summary DataFrame of all detected outliers.

Returns an empty DataFrame if fit_transform has not been called or no outliers were detected.

Source code in clinops/preprocess/outliers.py
def report(self) -> pd.DataFrame:
    """
    Summarise the outliers recorded by the last ``fit_transform`` call.

    Yields one row per affected column, ordered from the most to the
    fewest outliers. When nothing has been recorded (no call yet, or no
    outliers found) the result is an empty frame that still carries the
    summary columns.
    """
    records = self._report
    if records:
        summary = pd.DataFrame(records)
        return summary.sort_values("total_outliers", ascending=False)

    summary_columns = [
        "column",
        "low_outliers",
        "high_outliers",
        "total_outliers",
        "pct_outliers",
        "bound_low",
        "bound_high",
        "unit",
    ]
    return pd.DataFrame(columns=summary_columns)

add_bounds

add_bounds(col, low, high, unit='')

Add or replace a bound for a specific column.

Source code in clinops/preprocess/outliers.py
def add_bounds(self, col: str, low: float, high: float, unit: str = "") -> None:
    """
    Add or replace a bound for a specific column.

    Parameters
    ----------
    col:
        Column name the bound applies to.
    low:
        Lowest accepted value.
    high:
        Highest accepted value.
    unit:
        Unit label stored alongside the bound.

    Raises
    ------
    ValueError
        If ``low`` is greater than ``high``.
    """
    # With an inverted range every value is either below `low` or above
    # `high`, so the whole column would be treated as outliers.
    if low > high:
        raise ValueError(
            f"low must not exceed high for column '{col}' — got low={low!r}, high={high!r}"
        )
    self._bounds[col] = BoundSpec(col, low, high, unit)

clinops.preprocess.units.UnitNormalizer

UnitNormalizer(
    column_unit_map=None,
    explicit_conversions=None,
    target_units=None,
)

Normalize clinical measurements to canonical units.

Detects non-standard units via a companion unit column or explicit mapping and returns a converted copy; the input DataFrame is not modified.

Parameters:

Name Type Description Default
column_unit_map dict[str, str] | None

Dict mapping value column name → unit column name. e.g. {"glucose": "glucose_unit"} tells the normalizer to read units from the glucose_unit column.

None
explicit_conversions dict[str, ConversionSpec] | None

Dict mapping value column name → ConversionSpec to apply unconditionally (ignores unit columns). e.g. {"temperature": UNIT_CONVERSIONS["temperature__f__c"]}

None
target_units dict[str, str] | None

Dict mapping column name → target unit string. Defaults to _CANONICAL_UNITS for known columns.

None

Examples:

Normalize a glucose column that is mixed mg/dL and mmol/L:

>>> normalizer = UnitNormalizer(column_unit_map={"glucose": "glucose_unit"})
>>> df = normalizer.transform(df)

Convert all temperatures from °F to °C unconditionally:

>>> normalizer = UnitNormalizer(
...     explicit_conversions={"temperature": UNIT_CONVERSIONS["temperature__f__c"]}
... )
>>> df = normalizer.transform(df)
Source code in clinops/preprocess/units.py
def __init__(
    self,
    column_unit_map: dict[str, str] | None = None,
    explicit_conversions: dict[str, ConversionSpec] | None = None,
    target_units: dict[str, str] | None = None,
) -> None:
    """
    Store the conversion configuration.

    Missing (or empty) arguments become empty dicts. Target units start
    from ``_CANONICAL_UNITS`` and are overridden entry by entry with
    ``target_units``.
    """
    resolved_targets = dict(_CANONICAL_UNITS)
    resolved_targets.update(target_units if target_units else {})

    self._column_unit_map = column_unit_map if column_unit_map else {}
    self._explicit = explicit_conversions if explicit_conversions else {}
    self._target_units = resolved_targets
    # Log of conversions performed by the most recent transform()
    self._converted: list[dict[str, Any]] = []

transform

transform(df)

Apply unit normalization to df.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame. Modified copy is returned.

required

Returns:

Type Description
DataFrame
Source code in clinops/preprocess/units.py
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply unit normalization to df.

    Explicit conversions run first and are applied to whole columns.
    Unit-column conversions follow: for each value column, rows whose
    companion unit differs from the target unit are converted and their
    unit entry is rewritten to the target. Columns or conversions that
    cannot be resolved are skipped with a warning. The conversion log
    behind ``report()`` is rebuilt on every call.

    Parameters
    ----------
    df:
        Input DataFrame. It is left untouched; a modified copy is returned.

    Returns
    -------
    pd.DataFrame
    """
    df = df.copy()
    self._converted = []

    # Explicit unconditional conversions
    for col, spec in self._explicit.items():
        if col not in df.columns:
            logger.warning(f"UnitNormalizer: column '{col}' not found — skipping")
            continue
        n_non_null = df[col].notna().sum()
        df[col] = spec.convert(df[col])
        self._converted.append(
            {
                "column": col,
                "from_unit": spec.from_unit,
                "to_unit": spec.to_unit,
                "n_converted": int(n_non_null),
                "method": "explicit",
            }
        )
        logger.info(f"UnitNormalizer: converted {col} from {spec.from_unit} → {spec.to_unit}")

    # Unit-column-aware conversions
    for value_col, unit_col in self._column_unit_map.items():
        if value_col not in df.columns:
            logger.warning(f"UnitNormalizer: value column '{value_col}' not found — skipping")
            continue
        if unit_col not in df.columns:
            logger.warning(f"UnitNormalizer: unit column '{unit_col}' not found — skipping")
            continue

        target_unit = self._target_units.get(value_col)
        if target_unit is None:
            logger.warning(
                f"UnitNormalizer: no target unit configured for '{value_col}' — skipping"
            )
            continue

        # Snapshot the units before converting: rows rewritten to the
        # target unit below must not be picked up again.
        unique_units = df[unit_col].dropna().unique()
        for from_unit in unique_units:
            if from_unit == target_unit:
                continue

            key = self._make_key(value_col, from_unit, target_unit)
            if key not in UNIT_CONVERSIONS:
                logger.warning(
                    f"UnitNormalizer: no conversion registered for "
                    f"'{value_col}' {from_unit} → {target_unit} (key={key!r})"
                )
                continue

            spec = UNIT_CONVERSIONS[key]
            mask = df[unit_col] == from_unit
            n = int(mask.sum())
            # Converted values are generally fractional. Widen integer
            # columns first so the row-wise write below is not an
            # incompatible-dtype assignment.
            if pd.api.types.is_integer_dtype(df[value_col]):
                df[value_col] = df[value_col].astype("float64")
            df.loc[mask, value_col] = spec.convert(df.loc[mask, value_col])
            df.loc[mask, unit_col] = target_unit

            self._converted.append(
                {
                    "column": value_col,
                    "from_unit": from_unit,
                    "to_unit": target_unit,
                    "n_converted": n,
                    "method": "unit_column",
                }
            )
            logger.info(
                f"UnitNormalizer: converted {n:,} rows of '{value_col}' "
                f"from {from_unit} → {target_unit}"
            )

    return df

report

report()

Return a summary of all conversions applied.

Source code in clinops/preprocess/units.py
def report(self) -> pd.DataFrame:
    """
    Summarise the conversions recorded by the last ``transform`` call.

    One row per conversion applied. An empty frame with the same columns
    is returned when nothing has been converted.
    """
    entries = self._converted
    if entries:
        return pd.DataFrame(entries)

    header = ["column", "from_unit", "to_unit", "n_converted", "method"]
    return pd.DataFrame(columns=header)

available_conversions staticmethod

available_conversions()

Return all registered conversion keys.

Source code in clinops/preprocess/units.py
@staticmethod
def available_conversions() -> list[str]:
    """List every key of the ``UNIT_CONVERSIONS`` registry in sorted order."""
    conversion_keys = list(UNIT_CONVERSIONS)
    conversion_keys.sort()
    return conversion_keys

clinops.preprocess.icd.ICDMapper

ICDMapper(mappings=None, default_value=None)

Map ICD-9-CM diagnosis codes to ICD-10-CM equivalents.

Parameters:

Name Type Description Default
mappings list[tuple[str, str, str]] | None

Custom list of (icd9, icd10, description) tuples. If None, uses the built-in curated mapping table.

None
default_value str | None

Value to use when no mapping is found. Default None (NaN in DataFrame).

None

Examples:

Map a column of ICD-9 codes to ICD-10:

>>> mapper = ICDMapper()
>>> df["icd10"] = mapper.map_series(df["icd9_code"])

Map in-place with version detection:

>>> df = mapper.harmonize(df, code_col="icd_code", version_col="icd_version")

Get the ICD-10 chapter for a code:

>>> mapper.chapter("I2510")
'Diseases of the circulatory system'
Source code in clinops/preprocess/icd.py
def __init__(
    self,
    mappings: list[tuple[str, str, str]] | None = None,
    default_value: str | None = None,
) -> None:
    """
    Build the lookup tables from ``(icd9, icd10, description)`` tuples.

    Parameters
    ----------
    mappings:
        Custom mapping tuples. Only ``None`` selects the built-in table;
        an explicitly empty list produces an empty mapper.
    default_value:
        Value returned for codes without a mapping.
    """
    # Fall back only on None (as documented): `mappings or ...` silently
    # swapped an empty custom table for the built-in one. Materialise the
    # input once so a one-shot iterable is not exhausted part-way through.
    source = list(_BUILTIN_MAPPINGS if mappings is None else mappings)

    self._icd9_to_10: dict[str, str] = {}
    self._icd10_to_9: dict[str, list[str]] = {}
    self._descriptions: dict[str, str] = {}
    for icd9, icd10, description in source:
        # A repeated ICD-9 code keeps its last entry in the forward tables,
        # while the reverse table collects every ICD-9 source per ICD-10 code.
        self._icd9_to_10[icd9] = icd10
        self._icd10_to_9.setdefault(icd10, []).append(icd9)
        self._descriptions[icd9] = description

    self.default_value = default_value
    logger.debug(f"ICDMapper loaded {len(self._icd9_to_10)} ICD-9→10 mappings")

n_mappings property

n_mappings

Number of ICD-9 → ICD-10 mappings loaded.

from_gem_file classmethod

from_gem_file(path)

Load from a CMS General Equivalence Mapping (GEM) file.

The official CMS GEM files are available at: https://www.cms.gov/medicare/coding-billing/icd-10-codes

The file should be a fixed-width or tab-delimited text file with columns: icd9_code, icd10_code, flags.

Parameters:

Name Type Description Default
path str | Path

Path to the CMS GEM forward mapping file (2018 format).

required
Source code in clinops/preprocess/icd.py
@classmethod
def from_gem_file(cls, path: str | Path) -> ICDMapper:
    """
    Load from a CMS General Equivalence Mapping (GEM) file.

    The official CMS GEM files are available at:
    https://www.cms.gov/medicare/coding-billing/icd-10-codes

    The file should be a fixed-width or tab-delimited text file with
    columns: icd9_code, icd10_code, flags. Lines with fewer than two
    whitespace-separated fields are ignored, as are entries whose target
    is the ``NoDx`` placeholder. Descriptions are left empty because the
    GEM file does not carry them.

    Parameters
    ----------
    path:
        Path to the CMS GEM forward mapping file (2018 format).
    """
    path = Path(path)
    mappings: list[tuple[str, str, str]] = []
    # Pin the encoding so parsing does not depend on the platform's
    # locale default (GEM files are expected to be plain ASCII).
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            parts = line.split()
            if len(parts) < 2:
                continue
            icd9, icd10 = parts[0], parts[1]
            # "NoDx" is the GEM placeholder for "no equivalent code"; it
            # must not be stored as if it were a real ICD-10 code.
            if icd10 == "NoDx":
                continue
            mappings.append((icd9, icd10, ""))

    if not mappings:
        logger.warning(f"No usable mappings found in {path.name}")
    logger.info(f"Loaded {len(mappings):,} mappings from {path.name}")
    return cls(mappings=mappings)

map_code

map_code(icd9_code)

Map a single ICD-9 code to its ICD-10 equivalent.

Parameters:

Name Type Description Default
icd9_code str

ICD-9-CM code string (with or without decimal point).

required

Returns:

Type Description
str or None

ICD-10-CM code, or default_value if not found.

Source code in clinops/preprocess/icd.py
def map_code(self, icd9_code: str) -> str | None:
    """
    Translate one ICD-9 code into its ICD-10 counterpart.

    Parameters
    ----------
    icd9_code:
        ICD-9-CM code string (with or without decimal point).

    Returns
    -------
    str or None
        The mapped ICD-10-CM code; ``default_value`` when the normalized
        code has no entry in the mapping table.
    """
    lookup_key = self._normalize_code(icd9_code)
    try:
        return self._icd9_to_10[lookup_key]
    except KeyError:
        return self.default_value

map_series

map_series(series)

Map a Series of ICD-9 codes to ICD-10.

Parameters:

Name Type Description Default
series Series

String Series of ICD-9-CM codes.

required

Returns:

Type Description
Series

ICD-10-CM codes. Unmapped codes become default_value.

Source code in clinops/preprocess/icd.py
def map_series(self, series: pd.Series) -> pd.Series:
    """
    Map a Series of ICD-9 codes to ICD-10.

    Codes are normalized by casting to string, removing decimal points
    and trimming surrounding whitespace before the lookup.

    Parameters
    ----------
    series:
        String Series of ICD-9-CM codes.

    Returns
    -------
    pd.Series
        ICD-10-CM codes on the index of ``series``. Unmapped codes become
        ``default_value`` (NaN when ``default_value`` is None).
    """
    normalized = series.astype(str).str.replace(".", "", regex=False).str.strip()
    mapped = normalized.map(self._icd9_to_10)

    unmapped = mapped.isna()
    n_unmapped = int(unmapped.sum())
    if n_unmapped:
        logger.debug(f"ICDMapper: {n_unmapped:,} codes had no ICD-10 mapping")
        if self.default_value is not None:
            # Apply the same fallback map_code() uses. Cast to object
            # first so a string default can be stored even when every
            # lookup missed and the result holds only NaN.
            mapped = mapped.astype(object).mask(unmapped, self.default_value)
    return mapped

harmonize

harmonize(
    df,
    code_col,
    version_col,
    icd9_value="9",
    icd10_value="10",
    output_col=None,
)

Harmonize a mixed ICD-9/ICD-10 column to ICD-10. A modified copy is returned; the input DataFrame is left unchanged.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame.

required
code_col str

Column containing ICD codes.

required
version_col str

Column indicating ICD version for each row.

required
icd9_value str

Value in version_col indicating ICD-9 (default "9").

'9'
icd10_value str

Value in version_col indicating ICD-10 (default "10").

'10'
output_col str | None

Column to write harmonized codes to. Defaults to code_col.

None

Returns:

Type Description
DataFrame
Source code in clinops/preprocess/icd.py
def harmonize(
    self,
    df: pd.DataFrame,
    code_col: str,
    version_col: str,
    icd9_value: str = "9",
    icd10_value: str = "10",
    output_col: str | None = None,
) -> pd.DataFrame:
    """
    Harmonize a mixed ICD-9/ICD-10 column to ICD-10.

    Rows marked as ICD-9 are translated with ``map_series``; every other
    row keeps its original code. The input DataFrame is not modified.

    Parameters
    ----------
    df:
        Input DataFrame.
    code_col:
        Column containing ICD codes.
    version_col:
        Column indicating ICD version for each row. Values are compared
        as strings, so ``9`` and ``"9"`` are treated alike.
    icd9_value:
        Value in ``version_col`` indicating ICD-9 (default ``"9"``).
    icd10_value:
        Value in ``version_col`` indicating ICD-10 (default ``"10"``).
    output_col:
        Column to write harmonized codes to. Defaults to ``code_col``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the harmonized codes in the output column.
    """
    df = df.copy()
    out_col = output_col or code_col

    versions = df[version_col].astype(str)
    icd9_mask = versions == str(icd9_value)
    n_icd9 = int(icd9_mask.sum())
    n_icd10 = int((versions == str(icd10_value)).sum())

    logger.info(
        f"ICDMapper.harmonize: {n_icd9:,} ICD-9 rows, {n_icd10:,} ICD-10 rows "
        f"in column '{code_col}'"
    )

    # A separate output column must start from the original codes;
    # otherwise rows that are already ICD-10 would come out as NaN, and
    # the column would not exist at all when there are no ICD-9 rows.
    if out_col != code_col:
        df[out_col] = df[code_col]

    if n_icd9 > 0:
        # .values drops the index so assignment is positional within the mask
        df.loc[icd9_mask, out_col] = self.map_series(df.loc[icd9_mask, code_col]).values

    return df

chapter

chapter(icd10_code)

Return the ICD-10 chapter description for a code.

Parameters:

Name Type Description Default
icd10_code str

ICD-10-CM code string (e.g. "I2510").

required

Returns:

Type Description
str

Chapter description, or "Unknown" if code is out of range.

Source code in clinops/preprocess/icd.py
def chapter(self, icd10_code: str) -> str:
    """
    Return the ICD-10 chapter description for a code.

    Only the three-character category at the start of the code is used,
    compared against the ``(start, end, description)`` ranges in
    ``_ICD10_CHAPTERS``.

    Parameters
    ----------
    icd10_code:
        ICD-10-CM code string (e.g. ``"I2510"``).

    Returns
    -------
    str
        Chapter description, or ``"Unknown"`` if the value is not a
        string, does not start with a valid category, or is out of range.
    """
    # Missing values (None / NaN) reach this method through
    # chapter_series(); treat them as unknown instead of raising.
    if not isinstance(icd10_code, str):
        return "Unknown"

    code = icd10_code.strip().upper()
    # A category is letter + digit + digit-or-letter. The third character
    # may be a letter (e.g. C7A, D3A, M1A, Z3A).
    prefix = re.match(r"[A-Z]\d[0-9A-Z]", code)
    if not prefix:
        return "Unknown"
    code3 = prefix.group(0)

    for start, end, description in _ICD10_CHAPTERS:
        # Plain string comparison works because all three values share
        # the same fixed-width category layout.
        if start <= code3 <= end:
            return description
    return "Unknown"

chapter_series

chapter_series(series)

Map a Series of ICD-10 codes to their chapter descriptions.

Source code in clinops/preprocess/icd.py
def chapter_series(self, series: pd.Series) -> pd.Series:
    """Return the chapter description for every ICD-10 code in ``series``."""
    lookup = self.chapter
    chapters = series.apply(lookup)
    return chapters

describe

describe(icd9_code)

Return the description for an ICD-9 code.

Source code in clinops/preprocess/icd.py
def describe(self, icd9_code: str) -> str:
    """
    Look up the stored description of an ICD-9 code.

    The code is normalized before the lookup. Codes without an entry
    yield the fixed text ``"No description available"``.
    """
    key = self._normalize_code(icd9_code)
    if key in self._descriptions:
        return self._descriptions[key]
    return "No description available"