clinops.preprocess¶

clinops.preprocess.outliers.ClinicalOutlierClipper ¶

ClinicalOutlierClipper(
    bounds=None,
    action="clip",
    extra_bounds=None,
    strict=False,
)

Detect and clip physiologically impossible values in clinical DataFrames.

Uses published physiological bounds to identify values that are impossible regardless of patient state. Depending on `action`, out-of-bounds values are clipped to the boundary (default), replaced with NaN, or left unchanged and marked in a companion `{col}_outlier` column.

Parameters:

Name	Type	Description	Default
`bounds`	`dict[str, BoundSpec] \| None`	Dict mapping column name to BoundSpec. Defaults to combined VITAL_BOUNDS + LAB_BOUNDS. Pass a custom dict to override.	`None`
`action`	`str`	What to do with out-of-range values: - `"clip"` : replace with the boundary value (default) - `"null"` : replace with NaN - `"flag"` : add a 0/1 integer `{col}_outlier` column (created only for columns that contain at least one outlier), do not modify value	`'clip'`
`extra_bounds`	`dict[str, BoundSpec] \| None`	Additional BoundSpec entries to merge with the default bounds. Useful for site-specific or assay-specific ranges.	`None`
`strict`	`bool`	If True, raise ValueError when a column in bounds is not found in the DataFrame. If False (default), silently skip missing cols.	`False`

Examples:

>>> clipper = ClinicalOutlierClipper()
>>> clean_df = clipper.fit_transform(vitals_df)
>>> print(clipper.report())

Source code in clinops/preprocess/outliers.py

def __init__(
    self,
    bounds: dict[str, BoundSpec] | None = None,
    action: str = "clip",
    extra_bounds: dict[str, BoundSpec] | None = None,
    strict: bool = False,
) -> None:
    """
    Configure the bounds table and the action taken on out-of-range values.

    Parameters
    ----------
    bounds:
        Dict mapping column name to BoundSpec. When None, the merged
        ``VITAL_BOUNDS`` + ``LAB_BOUNDS`` table is used.
    action:
        One of ``"clip"``, ``"null"`` or ``"flag"``.
    extra_bounds:
        Entries merged on top of the base table; on a key clash the
        entry from ``extra_bounds`` wins.
    strict:
        Stored on the instance; ``fit_transform`` raises for a bounded
        column missing from the DataFrame when this is True.

    Raises
    ------
    ValueError
        If ``action`` is not one of the three supported values.
    """
    if action not in ("clip", "null", "flag"):
        raise ValueError(f"action must be 'clip', 'null', or 'flag' — got {action!r}")

    base = {**VITAL_BOUNDS, **LAB_BOUNDS} if bounds is None else bounds
    # Always build a fresh dict: the instance mutates its table (here via
    # extra_bounds, later via add_bounds) and must never write through to a
    # dict the caller still owns.
    self._bounds = {**base, **(extra_bounds or {})}

    self.action = action
    self.strict = strict
    self._report: list[dict[str, Any]] = []

fit_transform ¶

fit_transform(df)

Clip or flag outliers in df using the configured bounds.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame. Only columns present in bounds are processed.	required

Returns:

Type	Description
`DataFrame`	DataFrame with outliers handled according to `action`.

Source code in clinops/preprocess/outliers.py

def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Clip, null or flag outliers in df using the configured bounds.

    Parameters
    ----------
    df:
        Input DataFrame. Only columns present in bounds are processed;
        non-numeric columns are skipped. The input is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with outliers handled according to ``action``.

    Raises
    ------
    ValueError
        If ``strict`` is True and a bounded column is missing from ``df``.
    """
    df = df.copy()
    self._report = []

    for col, spec in self._bounds.items():
        if col not in df.columns:
            if self.strict:
                raise ValueError(f"Column '{col}' not found in DataFrame")
            continue

        series = df[col]
        if not pd.api.types.is_numeric_dtype(series):
            continue

        low_mask = series < spec.low
        high_mask = series > spec.high
        n_low = int(low_mask.sum())
        n_high = int(high_mask.sum())
        n_total = n_low + n_high

        # NOTE(review): columns without outliers are skipped entirely, so in
        # "flag" mode they get no `{col}_outlier` column either.
        if n_total == 0:
            continue

        self._report.append(
            {
                "column": col,
                "low_outliers": n_low,
                "high_outliers": n_high,
                "total_outliers": n_total,
                "pct_outliers": round(100 * n_total / len(series), 3),
                "bound_low": spec.low,
                "bound_high": spec.high,
                "unit": spec.unit,
            }
        )

        logger.debug(
            f"{col}: {n_low} below {spec.low}{spec.unit}, "
            f"{n_high} above {spec.high}{spec.unit} → action={self.action}"
        )

        outlier_mask = low_mask | high_mask
        if self.action == "clip":
            df[col] = series.clip(lower=spec.low, upper=spec.high)
        elif self.action == "null":
            # Series.mask returns a new Series and upcasts integer columns
            # to float as needed. Writing NaN through df.loc into an integer
            # column is an incompatible-dtype setitem, which recent pandas
            # warns about and is scheduled to reject.
            df[col] = series.mask(outlier_mask)
        elif self.action == "flag":
            df[f"{col}_outlier"] = outlier_mask.astype(int)

    n_affected = sum(r["total_outliers"] for r in self._report)
    if n_affected:
        logger.info(
            f"ClinicalOutlierClipper: {n_affected:,} outlier values across "
            f"{len(self._report)} columns (action={self.action})"
        )
    else:
        logger.info("ClinicalOutlierClipper: no outliers detected")

    return df

report ¶

report()

Return a summary DataFrame of all detected outliers.

Returns an empty DataFrame if fit_transform has not been called or no outliers were detected.

Source code in clinops/preprocess/outliers.py

def report(self) -> pd.DataFrame:
    """
    Summarise the outliers recorded by the latest ``fit_transform`` call.

    One row is produced per recorded column, ordered by
    ``total_outliers`` descending. When nothing has been recorded (no
    call yet, or no outliers found) an empty DataFrame carrying the
    report columns is returned instead.
    """
    if self._report:
        summary = pd.DataFrame(self._report)
        return summary.sort_values("total_outliers", ascending=False)

    # Nothing recorded: hand back an empty frame with the full schema.
    schema = [
        "column",
        "low_outliers",
        "high_outliers",
        "total_outliers",
        "pct_outliers",
        "bound_low",
        "bound_high",
        "unit",
    ]
    return pd.DataFrame(columns=schema)

add_bounds ¶

add_bounds(col, low, high, unit='')

Add or replace a bound for a specific column.

Source code in clinops/preprocess/outliers.py

def add_bounds(self, col: str, low: float, high: float, unit: str = "") -> None:
    """Register a BoundSpec for ``col``, overwriting any existing entry."""
    new_spec = BoundSpec(col, low, high, unit)
    self._bounds[col] = new_spec

clinops.preprocess.units.UnitNormalizer ¶

UnitNormalizer(
    column_unit_map=None,
    explicit_conversions=None,
    target_units=None,
)

Normalize clinical measurements to canonical units.

Detects non-standard units via a companion unit column or explicit mapping and converts the affected values. `transform` works on a copy, so the input DataFrame is left unmodified.

Parameters:

Name	Type	Description	Default
`column_unit_map`	`dict[str, str] \| None`	Dict mapping value column name → unit column name. e.g. `{"glucose": "glucose_unit"}` tells the normalizer to read units from the `glucose_unit` column.	`None`
`explicit_conversions`	`dict[str, ConversionSpec] \| None`	Dict mapping value column name → ConversionSpec to apply unconditionally (ignores unit columns). e.g. `{"temperature": UNIT_CONVERSIONS["temperature__f__c"]}`	`None`
`target_units`	`dict[str, str] \| None`	Dict mapping column name → target unit string. Defaults to `_CANONICAL_UNITS` for known columns.	`None`

Examples:

Normalize a glucose column that is mixed mg/dL and mmol/L:

>>> normalizer = UnitNormalizer(column_unit_map={"glucose": "glucose_unit"})
>>> df = normalizer.transform(df)

Convert all temperatures from °F to °C unconditionally:

>>> normalizer = UnitNormalizer(
...     explicit_conversions={"temperature": UNIT_CONVERSIONS["temperature__f__c"]}
... )
>>> df = normalizer.transform(df)

Source code in clinops/preprocess/units.py

def __init__(
    self,
    column_unit_map: dict[str, str] | None = None,
    explicit_conversions: dict[str, ConversionSpec] | None = None,
    target_units: dict[str, str] | None = None,
) -> None:
    """
    Store the conversion configuration.

    Missing (or empty) arguments fall back to empty dicts, and
    ``target_units`` is layered on top of ``_CANONICAL_UNITS`` so that
    caller-supplied targets take precedence.
    """
    self._column_unit_map = column_unit_map if column_unit_map else {}
    self._explicit = explicit_conversions if explicit_conversions else {}

    overrides = target_units if target_units else {}
    self._target_units = {**_CANONICAL_UNITS, **overrides}

    # Log of conversions; reset at the start of every transform() call.
    self._converted: list[dict[str, Any]] = []

transform ¶

transform(df)

Apply unit normalization to df.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame. Modified copy is returned.	required

Returns:

Type	Description
`DataFrame`

Source code in clinops/preprocess/units.py

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a unit-normalized copy of df.

    Two passes run over the copy: first every configured explicit
    conversion, then the conversions driven by companion unit columns.
    Each applied conversion is recorded for ``report``; anything that
    cannot be processed is logged as a warning and skipped.

    Parameters
    ----------
    df:
        Input DataFrame. It is copied; the copy is modified and returned.

    Returns
    -------
    pd.DataFrame
    """
    df = df.copy()
    self._converted = []

    # Pass 1: conversions applied to the whole column regardless of units.
    for col, spec in self._explicit.items():
        if col in df.columns:
            # Count before converting so the figure reflects the input data.
            non_null_count = df[col].notna().sum()
            df[col] = spec.convert(df[col])
            entry = {
                "column": col,
                "from_unit": spec.from_unit,
                "to_unit": spec.to_unit,
                "n_converted": int(non_null_count),
                "method": "explicit",
            }
            self._converted.append(entry)
            logger.info(f"UnitNormalizer: converted {col} from {spec.from_unit} → {spec.to_unit}")
        else:
            logger.warning(f"UnitNormalizer: column '{col}' not found — skipping")

    # Pass 2: per-row conversions keyed on the companion unit column.
    for value_col, unit_col in self._column_unit_map.items():
        if value_col not in df.columns:
            logger.warning(f"UnitNormalizer: value column '{value_col}' not found — skipping")
            continue
        if unit_col not in df.columns:
            logger.warning(f"UnitNormalizer: unit column '{unit_col}' not found — skipping")
            continue

        target_unit = self._target_units.get(value_col)
        if target_unit is None:
            logger.warning(
                f"UnitNormalizer: no target unit configured for '{value_col}' — skipping"
            )
            continue

        # Snapshot the distinct units up front; rows already in the target
        # unit need no work.
        observed_units = df[unit_col].dropna().unique()
        pending_units = [unit for unit in observed_units if not (unit == target_unit)]

        for from_unit in pending_units:
            key = self._make_key(value_col, from_unit, target_unit)
            if key in UNIT_CONVERSIONS:
                spec = UNIT_CONVERSIONS[key]
                mask = df[unit_col] == from_unit
                n = int(mask.sum())
                df.loc[mask, value_col] = spec.convert(df.loc[mask, value_col])
                # Relabel converted rows so the unit column stays truthful.
                df.loc[mask, unit_col] = target_unit

                entry = {
                    "column": value_col,
                    "from_unit": from_unit,
                    "to_unit": target_unit,
                    "n_converted": n,
                    "method": "unit_column",
                }
                self._converted.append(entry)
                logger.info(
                    f"UnitNormalizer: converted {n:,} rows of '{value_col}' "
                    f"from {from_unit} → {target_unit}"
                )
            else:
                logger.warning(
                    f"UnitNormalizer: no conversion registered for "
                    f"'{value_col}' {from_unit} → {target_unit} (key={key!r})"
                )

    return df

report ¶

report()

Return a summary of all conversions applied.

Source code in clinops/preprocess/units.py

def report(self) -> pd.DataFrame:
    """
    Summarise the conversions recorded by the latest ``transform`` call.

    Returns one row per recorded conversion, or an empty DataFrame with
    the report columns when nothing has been recorded.
    """
    if self._converted:
        return pd.DataFrame(self._converted)

    schema = ["column", "from_unit", "to_unit", "n_converted", "method"]
    return pd.DataFrame(columns=schema)

available_conversions `staticmethod` ¶

available_conversions()

Return all registered conversion keys.

Source code in clinops/preprocess/units.py

@staticmethod
def available_conversions() -> list[str]:
    """List the keys of ``UNIT_CONVERSIONS`` in sorted order."""
    keys = list(UNIT_CONVERSIONS.keys())
    keys.sort()
    return keys

clinops.preprocess.icd.ICDMapper ¶

ICDMapper(mappings=None, default_value=None)

Map ICD-9-CM diagnosis codes to ICD-10-CM equivalents.

Parameters:

Name	Type	Description	Default
`mappings`	`list[tuple[str, str, str]] \| None`	Custom list of (icd9, icd10, description) tuples. If None, uses the built-in curated mapping table.	`None`
`default_value`	`str \| None`	Value to use when no mapping is found. Default `None` (NaN in DataFrame).	`None`

Examples:

Map a column of ICD-9 codes to ICD-10:

>>> mapper = ICDMapper()
>>> df["icd10"] = mapper.map_series(df["icd9_code"])

Map in-place with version detection:

>>> df = mapper.harmonize(df, code_col="icd_code", version_col="icd_version")

Get the ICD-10 chapter for a code:

>>> mapper.chapter("I2510")
'Diseases of the circulatory system'

Source code in clinops/preprocess/icd.py

def __init__(
    self,
    mappings: list[tuple[str, str, str]] | None = None,
    default_value: str | None = None,
) -> None:
    """
    Build the ICD-9 → ICD-10 lookup tables.

    Parameters
    ----------
    mappings:
        Custom list of (icd9, icd10, description) tuples. Only ``None``
        selects the built-in table; an explicitly empty list yields an
        empty mapper rather than silently falling back to the built-ins.
    default_value:
        Value returned for codes without a mapping.
    """
    source = _BUILTIN_MAPPINGS if mappings is None else mappings

    self._icd9_to_10: dict[str, str] = {}
    self._icd10_to_9: dict[str, list[str]] = {}
    self._descriptions: dict[str, str] = {}
    # Single pass over the source. When an ICD-9 code appears more than
    # once, the last row wins in the forward and description tables, while
    # the reverse table keeps every ICD-9 code seen for an ICD-10 code.
    for icd9, icd10, description in source:
        self._icd9_to_10[icd9] = icd10
        self._icd10_to_9.setdefault(icd10, []).append(icd9)
        self._descriptions[icd9] = description

    self.default_value = default_value
    logger.debug(f"ICDMapper loaded {len(self._icd9_to_10)} ICD-9→10 mappings")

n_mappings `property` ¶

n_mappings

Number of ICD-9 → ICD-10 mappings loaded.

from_gem_file `classmethod` ¶

from_gem_file(path)

Load from a CMS General Equivalence Mapping (GEM) file.

The official CMS GEM files are available at: https://www.cms.gov/medicare/coding-billing/icd-10-codes

The file should be a fixed-width or tab-delimited text file with columns: icd9_code, icd10_code, flags.

Parameters:

Name	Type	Description	Default
`path`	`str \| Path`	Path to the CMS GEM forward mapping file (2018 format).	required

Source code in clinops/preprocess/icd.py

@classmethod
def from_gem_file(cls, path: str | Path) -> ICDMapper:
    """
    Load from a CMS General Equivalence Mapping (GEM) file.

    The official CMS GEM files are available at:
    https://www.cms.gov/medicare/coding-billing/icd-10-codes

    The file should be a fixed-width or tab-delimited text file with
    columns: icd9_code, icd10_code, flags. Lines with fewer than two
    whitespace-separated fields are ignored. Rows that GEM marks as
    having no equivalent (target ``NoDx``, or a five-digit flags field
    whose second digit is ``1``) are skipped, so those codes stay
    unmapped instead of mapping to the placeholder.

    Parameters
    ----------
    path:
        Path to the CMS GEM forward mapping file (2018 format).

    Returns
    -------
    ICDMapper
        Mapper built from the parsed rows, with empty descriptions.
    """
    path = Path(path)
    mappings = []
    # Explicit encoding keeps parsing independent of the platform locale.
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            parts = line.split()
            if len(parts) < 2:
                continue
            source_code, target_code = parts[0], parts[1]
            flags = parts[2] if len(parts) > 2 else ""
            flagged_no_map = len(flags) == 5 and flags.isdigit() and flags[1] == "1"
            if flagged_no_map or target_code.upper() == "NODX":
                continue
            mappings.append((source_code, target_code, ""))
    logger.info(f"Loaded {len(mappings):,} mappings from {path.name}")
    return cls(mappings=mappings)

map_code ¶

map_code(icd9_code)

Map a single ICD-9 code to its ICD-10 equivalent.

Parameters:

Name	Type	Description	Default
`icd9_code`	`str`	ICD-9-CM code string (with or without decimal point).	required

Returns:

Type	Description
`str or None`	ICD-10-CM code, or `default_value` if not found.

Source code in clinops/preprocess/icd.py

def map_code(self, icd9_code: str) -> str | None:
    """
    Map a single ICD-9 code to its ICD-10 equivalent.

    Parameters
    ----------
    icd9_code:
        ICD-9-CM code string (with or without decimal point).

    Returns
    -------
    str or None
        ICD-10-CM code, or ``default_value`` if not found.
    """
    normalized = self._normalize_code(icd9_code)
    return self._icd9_to_10.get(normalized, self.default_value)

map_series ¶

map_series(series)

Map a Series of ICD-9 codes to ICD-10.

Parameters:

Name	Type	Description	Default
`series`	`Series`	String Series of ICD-9-CM codes.	required

Returns:

Type	Description
`Series`	ICD-10-CM codes. Unmapped codes become `default_value`.

Source code in clinops/preprocess/icd.py

def map_series(self, series: pd.Series) -> pd.Series:
    """
    Map a Series of ICD-9 codes to ICD-10.

    Codes are cast to ``str``, stripped of dots and surrounding
    whitespace, then looked up in the ICD-9 → ICD-10 table. Missing
    inputs become the string ``"nan"`` after the cast and therefore
    count as unmapped.

    Parameters
    ----------
    series:
        String Series of ICD-9-CM codes.

    Returns
    -------
    pd.Series
        ICD-10-CM codes. Unmapped codes become ``default_value``
        (NaN when ``default_value`` is None).
    """
    normalized = series.astype(str).str.replace(".", "", regex=False).str.strip()
    mapped = normalized.map(self._icd9_to_10)

    # Count before filling so the log reflects the real number of misses.
    n_unmapped = mapped.isna().sum()
    if n_unmapped:
        logger.debug(f"ICDMapper: {n_unmapped:,} codes had no ICD-10 mapping")
        if self.default_value is not None:
            # Honour the configured fallback, matching map_code().
            mapped = mapped.fillna(self.default_value)
    return mapped

harmonize ¶

harmonize(
    df,
    code_col,
    version_col,
    icd9_value="9",
    icd10_value="10",
    output_col=None,
)

Harmonize a mixed ICD-9/ICD-10 column to ICD-10, returning a modified copy of `df`.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame.	required
`code_col`	`str`	Column containing ICD codes.	required
`version_col`	`str`	Column indicating ICD version for each row.	required
`icd9_value`	`str`	Value in `version_col` indicating ICD-9 (default `"9"`).	`'9'`
`icd10_value`	`str`	Value in `version_col` indicating ICD-10 (default `"10"`).	`'10'`
`output_col`	`str \| None`	Column to write harmonized codes to. Defaults to `code_col`.	`None`

Returns:

Type	Description
`DataFrame`

Source code in clinops/preprocess/icd.py

def harmonize(
    self,
    df: pd.DataFrame,
    code_col: str,
    version_col: str,
    icd9_value: str = "9",
    icd10_value: str = "10",
    output_col: str | None = None,
) -> pd.DataFrame:
    """
    Harmonize a mixed ICD-9/ICD-10 column to ICD-10 in-place.

    Parameters
    ----------
    df:
        Input DataFrame.
    code_col:
        Column containing ICD codes.
    version_col:
        Column indicating ICD version for each row.
    icd9_value:
        Value in ``version_col`` indicating ICD-9 (default ``"9"``).
    icd10_value:
        Value in ``version_col`` indicating ICD-10 (default ``"10"``).
    output_col:
        Column to write harmonized codes to. Defaults to ``code_col``.

    Returns
    -------
    pd.DataFrame
    """
    df = df.copy()
    out_col = output_col or code_col

    icd9_mask = df[version_col].astype(str) == str(icd9_value)
    n_icd9 = int(icd9_mask.sum())
    n_icd10 = int((df[version_col].astype(str) == str(icd10_value)).sum())

    logger.info(
        f"ICDMapper.harmonize: {n_icd9:,} ICD-9 rows, {n_icd10:,} ICD-10 rows "
        f"in column '{code_col}'"
    )

    if n_icd9 > 0:
        df.loc[icd9_mask, out_col] = self.map_series(df.loc[icd9_mask, code_col]).values

    return df

chapter ¶

chapter(icd10_code)

Return the ICD-10 chapter description for a code.

Parameters:

Name	Type	Description	Default
`icd10_code`	`str`	ICD-10-CM code string (e.g. `"I2510"`).	required

Returns:

Type	Description
`str`	Chapter description, or `"Unknown"` if code is out of range.

Source code in clinops/preprocess/icd.py

def chapter(self, icd10_code: str) -> str:
    """
    Return the ICD-10 chapter description for a code.

    The first three characters of the upper-cased, stripped code are
    compared against the ``(start, end, description)`` ranges in
    ``_ICD10_CHAPTERS``.

    Parameters
    ----------
    icd10_code:
        ICD-10-CM code string (e.g. ``"I2510"``). Non-string input such
        as ``None`` or NaN (e.g. from an unmapped code) yields
        ``"Unknown"`` instead of raising.

    Returns
    -------
    str
        Chapter description, or ``"Unknown"`` if code is out of range.
    """
    if not isinstance(icd10_code, str):
        return "Unknown"

    code = icd10_code.strip().upper()
    # An ICD-10-CM category is a letter, a digit, then a digit OR a letter
    # (e.g. "C4A", "Z3A"), so the third character may not be numeric.
    prefix = re.match(r"[A-Z]\d[0-9A-Z]", code)
    if not prefix:
        return "Unknown"
    code3 = prefix.group(0)

    # Plain string comparison against each chapter's category range.
    for start, end, description in _ICD10_CHAPTERS:
        if start <= code3 <= end:
            return description
    return "Unknown"

chapter_series ¶

chapter_series(series)

Map a Series of ICD-10 codes to their chapter descriptions.

Source code in clinops/preprocess/icd.py

def chapter_series(self, series: pd.Series) -> pd.Series:
    """Apply ``chapter`` to every ICD-10 code in the Series."""
    to_chapter = self.chapter
    return series.apply(to_chapter)

describe ¶

describe(icd9_code)

Return the description for an ICD-9 code.

Source code in clinops/preprocess/icd.py

def describe(self, icd9_code: str) -> str:
    """Look up the stored description of an ICD-9 code, with a fixed fallback text."""
    lookup_key = self._normalize_code(icd9_code)
    if lookup_key in self._descriptions:
        return self._descriptions[lookup_key]
    return "No description available"

clinops.preprocess¶

clinops.preprocess.outliers.ClinicalOutlierClipper ¶

fit_transform ¶

report ¶

add_bounds ¶

clinops.preprocess.units.UnitNormalizer ¶

transform ¶

report ¶

available_conversions staticmethod ¶

clinops.preprocess.icd.ICDMapper ¶

n_mappings property ¶

from_gem_file classmethod ¶

map_code ¶

map_series ¶

harmonize ¶

chapter ¶

chapter_series ¶

describe ¶

available_conversions `staticmethod` ¶

n_mappings `property` ¶

from_gem_file `classmethod` ¶