Skip to content

clinops.ingest

clinops.ingest.mimic_tables.MimicTableLoader

MimicTableLoader(
    mimic_path,
    version="auto",
    strict_validation=True,
    chunk_size=None,
)

Pre-built loader for the MIMIC-IV tables researchers use most.

Wraps clinops.ingest.MimicLoader and adds:

  • Pre-validated schemas for chartevents, labevents, admissions, diagnoses_icd, and icustays.
  • with_ref_range flag on labevents to retain or drop the reference range columns (noisy on many MIMIC exports).
  • primary_only flag on diagnoses_icd to keep only seq_num == 1 (the principal diagnosis).
  • with_los_band flag on icustays to add a categorical los_band column (<1d, 1-3d, 3-7d, >7d) useful as a stratification variable.
  • A summary() method that prints sampled row counts, column counts, and estimated null rates for all five tables, reading at most the first 10,000 rows of each.

Parameters:

Name Type Description Default
mimic_path str | Path

Root directory of the MIMIC-IV dataset.

required
version str

MIMIC-IV version string or "auto" (default).

'auto'
strict_validation bool

Raise on missing required columns when True (default).

True
chunk_size int | None

Pass through to underlying :class:MimicLoader for large tables.

None

Examples:

>>> tbl = MimicTableLoader("/data/mimic-iv-2.2")
>>> charts = tbl.chartevents(subject_ids=[10000032, 10000980])
>>> dx      = tbl.diagnoses_icd(subject_ids=[10000032], primary_only=True)
>>> stays   = tbl.icustays(subject_ids=[10000032], with_los_band=True)
Source code in clinops/ingest/mimic_tables.py
def __init__(
    self,
    mimic_path: str | Path,
    version: str = "auto",
    strict_validation: bool = True,
    chunk_size: int | None = None,
) -> None:
    """
    Set up the table loader on top of a :class:`MimicLoader`.

    Parameters
    ----------
    mimic_path:
        Root directory of the MIMIC-IV dataset.  Stored on the instance
        as a ``Path`` and forwarded as given to the wrapped loader.
    version, strict_validation, chunk_size:
        Forwarded unchanged to :class:`MimicLoader`.
    """
    self._path = Path(mimic_path)
    # Every constructor argument is handed to the wrapped loader as-is.
    loader_options = {
        "mimic_path": mimic_path,
        "version": version,
        "strict_validation": strict_validation,
        "chunk_size": chunk_size,
    }
    self._loader = MimicLoader(**loader_options)
    logger.info(f"MimicTableLoader ready — {self._path}")

chartevents

chartevents(
    subject_ids=None,
    hadm_ids=None,
    stay_ids=None,
    item_ids=None,
    start_time=None,
    end_time=None,
)

Load ICU charted observations with schema validation.

Returns a DataFrame with columns: subject_id, hadm_id, stay_id, itemid, charttime, valuenum, valueuom.

Parameters:

Name Type Description Default
subject_ids Sequence[int] | None

Restrict to these patients.

None
hadm_ids Sequence[int] | None

Restrict to these hospital admissions.

None
stay_ids Sequence[int] | None

Restrict to these ICU stays.

None
item_ids Sequence[int] | None

Restrict to these MIMIC itemids (see d_items).

None
start_time str | None

ISO datetime strings for time range filtering.

None
end_time str | None

ISO datetime strings for time range filtering.

None
Source code in clinops/ingest/mimic_tables.py
def chartevents(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    stay_ids: Sequence[int] | None = None,
    item_ids: Sequence[int] | None = None,
    start_time: str | None = None,
    end_time: str | None = None,
) -> pd.DataFrame:
    """
    Load ICU charted observations and validate them against the schema.

    The result is documented to carry the columns
    ``subject_id``, ``hadm_id``, ``stay_id``, ``itemid``,
    ``charttime``, ``valuenum``, ``valueuom``.  ``charttime`` is
    converted with ``pd.to_datetime(..., errors="coerce")``, so values
    that cannot be parsed become ``NaT``.

    Parameters
    ----------
    subject_ids:
        Restrict to these patients.
    hadm_ids:
        Restrict to these hospital admissions.
    stay_ids:
        Restrict to these ICU stays.
    item_ids:
        Restrict to these MIMIC itemids (see ``d_items``).
    start_time, end_time:
        ISO datetime strings for time range filtering.
    """
    filters = {
        "subject_ids": subject_ids,
        "hadm_ids": hadm_ids,
        "stay_ids": stay_ids,
        "item_ids": item_ids,
        "start_time": start_time,
        "end_time": end_time,
    }
    # Copy first so the column assignment below lands on a separate frame.
    frame = self._loader.chartevents(**filters).copy()
    frame["charttime"] = pd.to_datetime(frame["charttime"], errors="coerce")
    _SCHEMAS["chartevents"].validate(frame)
    logger.info(f"chartevents: {len(frame):,} rows")
    return frame

labevents

labevents(
    subject_ids=None,
    hadm_ids=None,
    item_ids=None,
    start_time=None,
    end_time=None,
    with_ref_range=False,
)

Load hospital laboratory results.

Parameters:

Name Type Description Default
with_ref_range bool

If False (default), drop ref_range_lower / ref_range_upper columns — they are sparsely populated in most MIMIC exports and add noise to downstream pipelines. Set True to retain them.

False
Source code in clinops/ingest/mimic_tables.py
def labevents(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    item_ids: Sequence[int] | None = None,
    start_time: str | None = None,
    end_time: str | None = None,
    with_ref_range: bool = False,
) -> pd.DataFrame:
    """
    Load hospital laboratory results and validate them against the schema.

    All filter arguments are forwarded to the wrapped loader's
    ``labevents``.

    Parameters
    ----------
    with_ref_range:
        If ``False`` (default), drop ``ref_range_lower`` /
        ``ref_range_upper`` when present — they are sparsely populated
        in most MIMIC exports and add noise to downstream pipelines.
        Set ``True`` to retain them.
    """
    filters = {
        "subject_ids": subject_ids,
        "hadm_ids": hadm_ids,
        "item_ids": item_ids,
        "start_time": start_time,
        "end_time": end_time,
    }
    frame = self._loader.labevents(**filters)
    # Validation runs on the full frame, before any columns are removed.
    _SCHEMAS["labevents"].validate(frame)
    if not with_ref_range:
        present = [
            name
            for name in ("ref_range_lower", "ref_range_upper")
            if name in frame.columns
        ]
        frame = frame.drop(columns=present)
    logger.info(f"labevents: {len(frame):,} rows (with_ref_range={with_ref_range})")
    return frame

admissions

admissions(subject_ids=None, hadm_ids=None)

Load hospital admission records.

Returns a DataFrame with columns: subject_id, hadm_id, admittime, dischtime, deathtime, admission_type, admission_location, discharge_location, insurance, hospital_expire_flag.

Source code in clinops/ingest/mimic_tables.py
def admissions(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
) -> pd.DataFrame:
    """
    Load hospital admission records and validate them against the schema.

    The result is documented to carry the columns
    ``subject_id``, ``hadm_id``, ``admittime``, ``dischtime``,
    ``deathtime``, ``admission_type``, ``admission_location``,
    ``discharge_location``, ``insurance``, ``hospital_expire_flag``.

    Parameters
    ----------
    subject_ids, hadm_ids:
        Forwarded to the wrapped loader's ``admissions``.
    """
    frame = self._loader.admissions(
        subject_ids=subject_ids,
        hadm_ids=hadm_ids,
    )
    _SCHEMAS["admissions"].validate(frame)
    logger.info(f"admissions: {len(frame):,} rows")
    return frame

diagnoses_icd

diagnoses_icd(
    subject_ids=None, hadm_ids=None, primary_only=False
)

Load ICD-9/ICD-10 diagnosis codes per hospital admission.

MIMIC-IV mixes ICD-9-CM and ICD-10-CM codes. The icd_version column contains 9 or 10 — use :class:~clinops.preprocess.ICDMapper to harmonize to a single version before modelling.

Parameters:

Name Type Description Default
primary_only bool

If True, keep only rows where seq_num == 1 (the principal/primary diagnosis per admission). Default False returns all coded diagnoses.

False
Source code in clinops/ingest/mimic_tables.py
def diagnoses_icd(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    primary_only: bool = False,
) -> pd.DataFrame:
    """
    Load ICD-9/ICD-10 diagnosis codes per hospital admission.

    MIMIC-IV mixes ICD-9-CM and ICD-10-CM codes.  The ``icd_version``
    column contains ``9`` or ``10`` — use
    :class:`~clinops.preprocess.ICDMapper` to harmonize to a single
    version before modelling.

    The filters are applied before schema validation, and the returned
    frame has a fresh 0..n-1 index.

    Parameters
    ----------
    subject_ids:
        Keep only rows whose ``subject_id`` is in this sequence.
    hadm_ids:
        Keep only rows whose ``hadm_id`` is in this sequence.
    primary_only:
        If ``True``, keep only rows where ``seq_num == 1`` (the
        principal/primary diagnosis per admission).  Default ``False``
        returns all coded diagnoses.
    """
    dx = self._load_extra_table("diagnoses_icd")
    # Each restriction narrows the result of the previous one.
    if subject_ids is not None:
        dx = dx.loc[dx["subject_id"].isin(subject_ids)]
    if hadm_ids is not None:
        dx = dx.loc[dx["hadm_id"].isin(hadm_ids)]
    if primary_only:
        dx = dx.loc[dx["seq_num"] == 1]
    _SCHEMAS["diagnoses_icd"].validate(dx)
    logger.info(f"diagnoses_icd: {len(dx):,} rows (primary_only={primary_only})")
    return dx.reset_index(drop=True)

icustays

icustays(
    subject_ids=None,
    hadm_ids=None,
    stay_ids=None,
    with_los_band=False,
)

Load ICU stay metadata.

Parameters:

Name Type Description Default
with_los_band bool

If True, add a los_band categorical column bucketing length-of-stay into <1d, 1-3d, 3-7d, >7d. Useful as a stratification variable for cohort splits.

False
Source code in clinops/ingest/mimic_tables.py
def icustays(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    stay_ids: Sequence[int] | None = None,
    with_los_band: bool = False,
) -> pd.DataFrame:
    """
    Load ICU stay metadata and validate it against the schema.

    Parameters
    ----------
    subject_ids, hadm_ids, stay_ids:
        Forwarded to the wrapped loader's ``icustays``.
    with_los_band:
        If ``True`` and the frame has a ``los`` column, add a
        ``los_band`` categorical column bucketing length-of-stay into
        ``<1d``, ``1-3d``, ``3-7d``, ``>7d``.  Useful as a
        stratification variable for cohort splits.

        Bands are right-closed: ``<1d`` covers ``0 <= los <= 1``,
        ``1-3d`` covers ``1 < los <= 3``, ``3-7d`` covers
        ``3 < los <= 7`` and ``>7d`` covers ``los > 7``.  Missing or
        negative ``los`` values get a missing band.  If ``los`` is
        absent the column is not added.
    """
    df = self._loader.icustays(
        subject_ids=subject_ids,
        hadm_ids=hadm_ids,
        stay_ids=stay_ids,
    )
    _SCHEMAS["icustays"].validate(df)
    if with_los_band and "los" in df.columns:
        # Copy so the new column is added to a separate frame.
        df = df.copy()
        df["los_band"] = pd.cut(
            df["los"],
            bins=[0, 1, 3, 7, float("inf")],
            labels=["<1d", "1-3d", "3-7d", ">7d"],
            right=True,
            # Without this the first bin is (0, 1], so a stay with
            # los == 0 would silently get a NaN band.
            include_lowest=True,
        )
    logger.info(f"icustays: {len(df):,} rows (with_los_band={with_los_band})")
    return df

summary

summary()

Print a quick-look table of row counts and null rates for all five tables without loading the full data into memory.

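Reads at most the first 10,000 rows of each table with pd.read_csv and estimates null rates from that sample, so rows_sampled is a sample size rather than a full row count. Tables whose file cannot be found are reported with zero rows and columns and no null rate.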

Returns:

Type Description
DataFrame

Columns: table, rows_sampled, columns, null_rate_pct

Source code in clinops/ingest/mimic_tables.py
def summary(self) -> pd.DataFrame:
    """
    Print and return a quick-look overview of the five core tables.

    Each table is located with ``self._resolve_extra_path`` and at most
    its first 10,000 rows are read with ``pd.read_csv``.  The null rate
    is the mean of the per-column null fractions over that sample,
    expressed as a percentage rounded to two decimals.  A table whose
    lookup or read raises ``FileNotFoundError`` is reported with zero
    rows, zero columns and no null rate.

    Returns
    -------
    pd.DataFrame
        Columns: ``table``, ``rows_sampled``, ``columns``, ``null_rate_pct``
    """
    sample_rows = 10_000
    tables = ("chartevents", "labevents", "admissions", "diagnoses_icd", "icustays")

    def _describe(name):
        """Build the overview row for one table."""
        try:
            head = pd.read_csv(
                self._resolve_extra_path(name),
                nrows=sample_rows,
                low_memory=False,
            )
        except FileNotFoundError:
            return {
                "table": name,
                "rows_sampled": 0,
                "columns": 0,
                "null_rate_pct": None,
            }
        pct_null = head.isnull().mean().mean() * 100
        return {
            "table": name,
            "rows_sampled": len(head),
            "columns": len(head.columns),
            "null_rate_pct": round(pct_null, 2),
        }

    overview = pd.DataFrame([_describe(name) for name in tables])
    print(overview.to_string(index=False))
    return overview

clinops.ingest.mimic.MimicLoader

MimicLoader(
    mimic_path,
    version="auto",
    strict_validation=True,
    chunk_size=None,
)

Loader for MIMIC-IV clinical database tables.

Parameters:

Name Type Description Default
mimic_path str | Path

Root directory of the MIMIC-IV dataset. Should contain hosp/ and icu/ subdirectories.

required
version str

MIMIC-IV version string (e.g. "2.2"). Use "auto" to detect from the directory structure.

'auto'
strict_validation bool

If True (default), raise SchemaValidationError when required columns are missing. If False, log a warning and continue.

True
chunk_size int | None

If set, return a pd.io.parsers.TextFileReader for large tables instead of loading the full table into memory.

None

Examples:

>>> loader = MimicLoader("/data/mimic-iv-2.2")
>>> charts = loader.chartevents(subject_ids=[10000032])
>>> labs   = loader.labevents(hadm_ids=[20000019])
Source code in clinops/ingest/mimic.py
def __init__(
    self,
    mimic_path: str | Path,
    version: str = "auto",
    strict_validation: bool = True,
    chunk_size: int | None = None,
) -> None:
    """
    Store the loader configuration and settle on a dataset version.

    Parameters
    ----------
    mimic_path:
        Root directory of the MIMIC-IV dataset; converted to ``Path``.
    version:
        Version string, or ``"auto"`` to use ``self._detect_version()``.
    strict_validation, chunk_size:
        Stored on the configuration object unchanged.
    """
    settings = {
        "mimic_path": Path(mimic_path),
        "version": version,
        "strict_validation": strict_validation,
        "chunk_size": chunk_size,
    }
    self._cfg = MimicLoaderConfig(**settings)
    # Detection runs after the config is stored, and only for "auto".
    if version == "auto":
        self._version = self._detect_version()
    else:
        self._version = version
    logger.info(
        f"MimicLoader initialised — path={self._cfg.mimic_path}, version={self._version}"
    )

chartevents

chartevents(
    subject_ids=None,
    hadm_ids=None,
    stay_ids=None,
    item_ids=None,
    start_time=None,
    end_time=None,
)

Load ICU charted observations (vitals, GCS, ventilator settings, etc.).

Parameters:

Name Type Description Default
subject_ids Sequence[int] | None

Filter to these patients.

None
hadm_ids Sequence[int] | None

Filter to these hospital admissions.

None
stay_ids Sequence[int] | None

Filter to these ICU stays.

None
item_ids Sequence[int] | None

Filter to these MIMIC-IV itemids (see d_items).

None
start_time str | None

ISO datetime string — exclude rows before this time.

None
end_time str | None

ISO datetime string — exclude rows after this time.

None
Source code in clinops/ingest/mimic.py
def chartevents(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    stay_ids: Sequence[int] | None = None,
    item_ids: Sequence[int] | None = None,
    start_time: str | None = None,
    end_time: str | None = None,
) -> pd.DataFrame:
    """
    Load ICU charted observations (vitals, GCS, ventilator settings, etc.).

    Parameters
    ----------
    subject_ids:
        Filter to these patients.
    hadm_ids:
        Filter to these hospital admissions.
    stay_ids:
        Filter to these ICU stays.
    item_ids:
        Filter to these MIMIC-IV itemids (see d_items).
    start_time:
        ISO datetime string — exclude rows before this time.
    end_time:
        ISO datetime string — exclude rows after this time.
    """
    df = self._load_table("chartevents")
    return self._filter(df, subject_ids, hadm_ids, stay_ids, item_ids, start_time, end_time)

labevents

labevents(
    subject_ids=None,
    hadm_ids=None,
    item_ids=None,
    start_time=None,
    end_time=None,
)

Load hospital laboratory results.

Source code in clinops/ingest/mimic.py
def labevents(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    item_ids: Sequence[int] | None = None,
    start_time: str | None = None,
    end_time: str | None = None,
) -> pd.DataFrame:
    """Load hospital laboratory results."""
    df = self._load_table("labevents")
    return self._filter(df, subject_ids, hadm_ids, None, item_ids, start_time, end_time)

admissions

admissions(subject_ids=None, hadm_ids=None)

Load hospital admission records.

Source code in clinops/ingest/mimic.py
def admissions(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
) -> pd.DataFrame:
    """Load hospital admission records."""
    df = self._load_table("admissions")
    return self._filter(df, subject_ids, hadm_ids)

patients

patients(subject_ids=None)

Load patient demographics.

Source code in clinops/ingest/mimic.py
def patients(
    self,
    subject_ids: Sequence[int] | None = None,
) -> pd.DataFrame:
    """Load patient demographics."""
    df = self._load_table("patients")
    return self._filter(df, subject_ids)

icustays

icustays(subject_ids=None, hadm_ids=None, stay_ids=None)

Load ICU stay metadata including LOS.

Source code in clinops/ingest/mimic.py
def icustays(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    stay_ids: Sequence[int] | None = None,
) -> pd.DataFrame:
    """Load ICU stay metadata including LOS."""
    df = self._load_table("icustays")
    return self._filter(df, subject_ids, hadm_ids, stay_ids)

prescriptions

prescriptions(subject_ids=None, hadm_ids=None, drugs=None)

Load medication prescriptions.

Source code in clinops/ingest/mimic.py
def prescriptions(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    drugs: Sequence[str] | None = None,
) -> pd.DataFrame:
    """Load medication prescriptions."""
    df = self._load_table("prescriptions")
    df = self._filter(df, subject_ids, hadm_ids)
    if drugs:
        df = df[df["drug"].str.lower().isin([d.lower() for d in drugs])]
    return df

inputevents

inputevents(subject_ids=None, stay_ids=None)

Load ICU fluid input events.

Source code in clinops/ingest/mimic.py
def inputevents(
    self,
    subject_ids: Sequence[int] | None = None,
    stay_ids: Sequence[int] | None = None,
) -> pd.DataFrame:
    """Load ICU fluid input events."""
    df = self._load_table("inputevents")
    return self._filter(df, subject_ids, None, stay_ids)

d_items

d_items()

Load ICU item dictionary (maps itemid → label).

Source code in clinops/ingest/mimic.py
def d_items(self) -> pd.DataFrame:
    """
    Load ICU item dictionary (maps itemid → label).

    Returns the ``d_items`` table exactly as ``self._load_table`` yields it.
    """
    item_dictionary = self._load_table("d_items")
    return item_dictionary

d_labitems

d_labitems()

Load lab item dictionary (maps itemid → label).

Source code in clinops/ingest/mimic.py
def d_labitems(self) -> pd.DataFrame:
    """
    Load lab item dictionary (maps itemid → label).

    Returns the ``d_labitems`` table exactly as ``self._load_table`` yields it.
    """
    lab_dictionary = self._load_table("d_labitems")
    return lab_dictionary

clinops.ingest.mimic_iii.MimicIIILoader

MimicIIILoader(
    mimic_path, strict_validation=True, chunk_size=None
)

Loader for the MIMIC-III Clinical Database.

Provides the same filtering interface as MimicLoader (MIMIC-IV) so that analysis code can be reused across both datasets with minimal changes.

Key differences from MIMIC-IV:

  • Flat directory — no hosp/ / icu/ split.
  • Uppercase column names in source files — normalised to lowercase on load.
  • ICU stay key is icustay_id (not stay_id).
  • ICD-9-CM only — diagnoses_icd has icd9_code, not icd_code.
  • inputevents is split into inputevents_mv (MetaVision) and inputevents_cv (CareVue); call inputevents(source="both") to get both merged (the default, source="mv", loads MetaVision only).

Parameters:

Name Type Description Default
mimic_path str | Path

Root directory of the MIMIC-III dataset. Should contain files like CHARTEVENTS.csv.gz, ADMISSIONS.csv.gz, etc.

required
strict_validation bool

If True (default), raise :exc:SchemaValidationError when required columns are missing. If False, log a warning and continue.

True
chunk_size int | None

If set, large tables (chartevents, labevents) return a chunked reader instead of loading fully into memory.

None

Examples:

>>> loader = MimicIIILoader("/data/mimic-iii-clinical-database-1.4")
>>> charts = loader.chartevents(subject_ids=[40124])
>>> labs   = loader.labevents(hadm_ids=[198765])
>>> dx     = loader.diagnoses_icd(subject_ids=[40124], primary_only=True)
Source code in clinops/ingest/mimic_iii.py
def __init__(
    self,
    mimic_path: str | Path,
    strict_validation: bool = True,
    chunk_size: int | None = None,
) -> None:
    """
    Store the loader configuration for a MIMIC-III dataset.

    Parameters
    ----------
    mimic_path:
        Root directory of the MIMIC-III dataset; converted to ``Path``.
    strict_validation, chunk_size:
        Stored on the configuration object unchanged.
    """
    settings = {
        "mimic_path": Path(mimic_path),
        "strict_validation": strict_validation,
        "chunk_size": chunk_size,
    }
    self._cfg = MimicIIIConfig(**settings)
    logger.info(f"MimicIIILoader initialised — path={self._cfg.mimic_path}")

chartevents

chartevents(
    subject_ids=None,
    hadm_ids=None,
    icustay_ids=None,
    item_ids=None,
    start_time=None,
    end_time=None,
)

Load ICU charted observations.

Parameters:

Name Type Description Default
subject_ids Sequence[int] | None

Restrict to these patient IDs.

None
hadm_ids Sequence[int] | None

Restrict to these hospital admission IDs.

None
icustay_ids Sequence[int] | None

Restrict to these ICU stay IDs (icustay_id in MIMIC-III, equivalent to stay_id in MIMIC-IV).

None
item_ids Sequence[int] | None

Restrict to these itemid values. See :meth:d_items.

None
start_time str | None

Exclude rows with charttime before this ISO datetime string.

None
end_time str | None

Exclude rows with charttime after this ISO datetime string.

None

Returns:

Type Description
DataFrame

Columns (lowercase): subject_id, hadm_id, icustay_id, itemid, charttime (datetime), valuenum, valueuom.

Source code in clinops/ingest/mimic_iii.py
def chartevents(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    icustay_ids: Sequence[int] | None = None,
    item_ids: Sequence[int] | None = None,
    start_time: str | None = None,
    end_time: str | None = None,
) -> pd.DataFrame:
    """
    Load ICU charted observations.

    Parameters
    ----------
    subject_ids:
        Restrict to these patient IDs.
    hadm_ids:
        Restrict to these hospital admission IDs.
    icustay_ids:
        Restrict to these ICU stay IDs (``icustay_id`` in MIMIC-III,
        equivalent to ``stay_id`` in MIMIC-IV).
    item_ids:
        Restrict to these ``itemid`` values. See :meth:`d_items`.
    start_time:
        Exclude rows with ``charttime`` before this ISO datetime string.
    end_time:
        Exclude rows with ``charttime`` after this ISO datetime string.

    Returns
    -------
    pd.DataFrame
        Columns (lowercase): ``subject_id``, ``hadm_id``,
        ``icustay_id``, ``itemid``, ``charttime`` (datetime),
        ``valuenum``, ``valueuom``.
    """
    df = self._load("chartevents")
    return self._filter(
        df,
        subject_ids,
        hadm_ids,
        icustay_ids=icustay_ids,
        item_ids=item_ids,
        start_time=start_time,
        end_time=end_time,
        time_col="charttime",
    )

labevents

labevents(
    subject_ids=None,
    hadm_ids=None,
    item_ids=None,
    start_time=None,
    end_time=None,
    with_ref_range=False,
)

Load hospital laboratory results.

Parameters:

Name Type Description Default
subject_ids Sequence[int] | None

Standard filters — see :meth:chartevents.

None
hadm_ids Sequence[int] | None

Standard filters — see :meth:chartevents.

None
item_ids Sequence[int] | None

Standard filters — see :meth:chartevents.

None
start_time str | None

Standard filters — see :meth:chartevents.

None
end_time str | None

Standard filters — see :meth:chartevents.

None
with_ref_range bool

If True, retain ref_range_lower and ref_range_upper. Dropped by default to reduce memory footprint.

False

Returns:

Type Description
DataFrame

Columns: subject_id, hadm_id, itemid, charttime (datetime), valuenum, valueuom.

Source code in clinops/ingest/mimic_iii.py
def labevents(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    item_ids: Sequence[int] | None = None,
    start_time: str | None = None,
    end_time: str | None = None,
    with_ref_range: bool = False,
) -> pd.DataFrame:
    """
    Load hospital laboratory results.

    Parameters
    ----------
    subject_ids, hadm_ids, item_ids, start_time, end_time:
        Standard filters — see :meth:`chartevents`.
    with_ref_range:
        If ``True``, retain ``ref_range_lower`` and ``ref_range_upper``.
        Dropped by default to reduce memory footprint.

    Returns
    -------
    pd.DataFrame
        Columns: ``subject_id``, ``hadm_id``, ``itemid``,
        ``charttime`` (datetime), ``valuenum``, ``valueuom``.
    """
    df = self._load("labevents")
    df = self._filter(
        df,
        subject_ids,
        hadm_ids,
        item_ids=item_ids,
        start_time=start_time,
        end_time=end_time,
        time_col="charttime",
    )
    if not with_ref_range:
        drop = [c for c in ["ref_range_lower", "ref_range_upper"] if c in df.columns]
        df = df.drop(columns=drop)
    return df

admissions

admissions(subject_ids=None, hadm_ids=None)

Load hospital admission and discharge records.

Returns:

Type Description
DataFrame

Columns: subject_id, hadm_id, admittime, dischtime, deathtime, admission_type, and others.

Source code in clinops/ingest/mimic_iii.py
def admissions(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
) -> pd.DataFrame:
    """
    Load hospital admission and discharge records.

    Returns
    -------
    pd.DataFrame
        Columns: ``subject_id``, ``hadm_id``, ``admittime``,
        ``dischtime``, ``deathtime``, ``admission_type``, and others.
    """
    df = self._load("admissions")
    return self._filter(df, subject_ids, hadm_ids)

diagnoses_icd

diagnoses_icd(
    subject_ids=None,
    hadm_ids=None,
    icd9_codes=None,
    primary_only=False,
)

Load ICD-9-CM diagnosis codes.

MIMIC-III uses ICD-9-CM exclusively. The column name is icd9_code (not icd_code as in MIMIC-IV). For cross-dataset compatibility, a synthetic icd_version column (always 9) is added on load.

Parameters:

Name Type Description Default
subject_ids Sequence[int] | None

Standard filters.

None
hadm_ids Sequence[int] | None

Standard filters.

None
icd9_codes Sequence[str] | None

Restrict to these ICD-9-CM codes (exact match, case-insensitive).

None
primary_only bool

If True, return only the primary diagnosis (seq_num == 1) per admission.

False

Returns:

Type Description
DataFrame

Columns: subject_id, hadm_id, seq_num, icd9_code, icd_version (always 9).

Source code in clinops/ingest/mimic_iii.py
def diagnoses_icd(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    icd9_codes: Sequence[str] | None = None,
    primary_only: bool = False,
) -> pd.DataFrame:
    """
    Load ICD-9-CM diagnosis codes.

    MIMIC-III uses ICD-9-CM exclusively. The column name is ``icd9_code``
    (not ``icd_code`` as in MIMIC-IV). For cross-dataset compatibility,
    a synthetic ``icd_version`` column (always 9) is added on load.

    Parameters
    ----------
    subject_ids, hadm_ids:
        Standard filters.
    icd9_codes:
        Restrict to these ICD-9-CM codes (exact match,
        case-insensitive).
    primary_only:
        If ``True``, return only the primary diagnosis (``seq_num == 1``)
        per admission.

    Returns
    -------
    pd.DataFrame
        Columns: ``subject_id``, ``hadm_id``, ``seq_num``,
        ``icd9_code``, ``icd_version`` (always 9).
    """
    df = self._load("diagnoses_icd")
    df = self._filter(df, subject_ids, hadm_ids)

    if icd9_codes is not None:
        if "icd9_code" in df.columns:
            df = df[df["icd9_code"].str.upper().isin([c.upper() for c in icd9_codes])]
        else:
            logger.warning(
                "Column 'icd9_code' missing from diagnoses_icd; skipping icd9_codes filter."
            )

    if primary_only:
        if "seq_num" in df.columns:
            df = df[df["seq_num"] == 1]
        else:
            logger.warning(
                "Column 'seq_num' missing from diagnoses_icd; "
                "cannot restrict to primary diagnoses."
            )

    # Add synthetic icd_version for cross-dataset compatibility
    if "icd_version" not in df.columns:
        df = df.assign(icd_version=9)

    return df.reset_index(drop=True)

icustays

icustays(subject_ids=None, hadm_ids=None, icustay_ids=None)

Load ICU stay metadata including length of stay.

Note: The ICU stay key in MIMIC-III is icustay_id, not stay_id as in MIMIC-IV.

Returns:

Type Description
DataFrame

Columns: subject_id, hadm_id, icustay_id, first_careunit, last_careunit, intime (datetime), outtime (datetime), los (days, float).

Source code in clinops/ingest/mimic_iii.py
def icustays(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    icustay_ids: Sequence[int] | None = None,
) -> pd.DataFrame:
    """
    Load ICU stay metadata including length of stay.

    Note: The ICU stay key in MIMIC-III is ``icustay_id``, not
    ``stay_id`` as in MIMIC-IV.

    Returns
    -------
    pd.DataFrame
        Columns: ``subject_id``, ``hadm_id``, ``icustay_id``,
        ``first_careunit``, ``last_careunit``, ``intime`` (datetime),
        ``outtime`` (datetime), ``los`` (days, float).
    """
    df = self._load("icustays")
    return self._filter(df, subject_ids, hadm_ids, icustay_ids=icustay_ids)

prescriptions

prescriptions(subject_ids=None, hadm_ids=None, drugs=None)

Load medication prescriptions.

Parameters:

Name Type Description Default
drugs Sequence[str] | None

Restrict to these drug names (case-insensitive substring match).

None

Returns:

Type Description
DataFrame

Columns: subject_id, hadm_id, startdate, enddate, drug, dose_val_rx, dose_unit_rx.

Source code in clinops/ingest/mimic_iii.py
def prescriptions(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    drugs: Sequence[str] | None = None,
) -> pd.DataFrame:
    """
    Load medication prescriptions.

    Parameters
    ----------
    drugs:
        Restrict to these drug names (case-insensitive substring match).

    Returns
    -------
    pd.DataFrame
        Columns: ``subject_id``, ``hadm_id``, ``startdate``,
        ``enddate``, ``drug``, ``dose_val_rx``, ``dose_unit_rx``.
    """
    df = self._load("prescriptions")
    df = self._filter(df, subject_ids, hadm_ids)
    if drugs is not None:
        pattern = "|".join(drugs)
        df = df[df["drug"].str.contains(pattern, case=False, na=False)]
    return df

inputevents

inputevents(
    subject_ids=None,
    hadm_ids=None,
    icustay_ids=None,
    source="mv",
)

Load ICU fluid input events.

MIMIC-III stores MetaVision (INPUTEVENTS_MV) and CareVue (INPUTEVENTS_CV) inputs in separate tables.

Parameters:

Name Type Description Default
source str

"mv" (MetaVision, default), "cv" (CareVue), or "both" to load and concatenate both tables.

'mv'

Returns:

Type Description
DataFrame

For MetaVision: subject_id, hadm_id, icustay_id, itemid, starttime, amount, amountuom. When source="both", also includes event_time (unified timestamp) and source ("mv" or "cv").

Source code in clinops/ingest/mimic_iii.py
def inputevents(
    self,
    subject_ids: Sequence[int] | None = None,
    hadm_ids: Sequence[int] | None = None,
    icustay_ids: Sequence[int] | None = None,
    source: str = "mv",
) -> pd.DataFrame:
    """
    Load ICU fluid input events.

    MIMIC-III stores MetaVision (``INPUTEVENTS_MV``) and CareVue
    (``INPUTEVENTS_CV``) inputs in separate tables.

    Parameters
    ----------
    source:
        ``"mv"`` (MetaVision, default), ``"cv"`` (CareVue), or
        ``"both"`` to load and concatenate both tables.

    Returns
    -------
    pd.DataFrame
        For MetaVision: ``subject_id``, ``hadm_id``, ``icustay_id``,
        ``itemid``, ``starttime``, ``amount``, ``amountuom``.
        When ``source="both"``, also includes ``event_time`` (unified
        timestamp) and ``source`` (``"mv"`` or ``"cv"``).
    """
    if source not in {"mv", "cv", "both"}:
        raise ValueError(f"source must be 'mv', 'cv', or 'both' — got {source!r}")

    dfs: list[pd.DataFrame] = []
    if source in {"mv", "both"}:
        df_mv = self._load("inputevents_mv")
        dfs.append(
            self._filter(
                df_mv,
                subject_ids,
                hadm_ids,
                icustay_ids=icustay_ids,
                time_col="starttime",
            )
        )
    if source in {"cv", "both"}:
        df_cv = self._load("inputevents_cv")
        dfs.append(
            self._filter(
                df_cv,
                subject_ids,
                hadm_ids,
                icustay_ids=icustay_ids,
                time_col="charttime",
            )
        )

    # When combining MetaVision and CareVue input events, normalise to a
    # common timestamp column and record the source system so callers have
    # a consistent schema regardless of the underlying table.
    if len(dfs) > 1:
        normalised: list[pd.DataFrame] = []
        for df in dfs:
            df_norm = df.copy()
            if "starttime" in df_norm.columns:
                df_norm["event_time"] = df_norm["starttime"]
                df_norm["source"] = "mv"
            elif "charttime" in df_norm.columns:
                df_norm["event_time"] = df_norm["charttime"]
                df_norm["source"] = "cv"
            else:
                # Fallback: no recognised time column; keep schema
                # consistent but leave event_time/source missing.
                df_norm["event_time"] = pd.NaT
                df_norm["source"] = pd.NA
            normalised.append(df_norm)
        result = pd.concat(normalised, ignore_index=True)
    else:
        result = dfs[0]
    return result

patients

patients(subject_ids=None)

Load patient demographics.

Returns:

Type Description
DataFrame

Columns: subject_id, gender, dob, dod, dod_hosp, dod_ssn, expire_flag.

Source code in clinops/ingest/mimic_iii.py
def patients(
    self,
    subject_ids: Sequence[int] | None = None,
) -> pd.DataFrame:
    """
    Load patient demographics.

    Returns
    -------
    pd.DataFrame
        Columns: ``subject_id``, ``gender``, ``dob``, ``dod``,
        ``dod_hosp``, ``dod_ssn``, ``expire_flag``.
    """
    df = self._load("patients")
    return self._filter(df, subject_ids)

d_items

d_items()

Load the item dictionary (itemid → label).

Use this to resolve itemid values in :meth:chartevents.

Returns:

Type Description
DataFrame

Columns: itemid, label, abbreviation, dbsource, linksto, category, unitname.

Source code in clinops/ingest/mimic_iii.py
def d_items(self) -> pd.DataFrame:
    """
    Return the ``d_items`` dictionary table (``itemid`` → label).

    Handy for translating the ``itemid`` values that appear in
    :meth:`chartevents` into readable labels.

    Returns
    -------
    pd.DataFrame
        Columns: ``itemid``, ``label``, ``abbreviation``,
        ``dbsource``, ``linksto``, ``category``, ``unitname``.
    """
    item_dictionary = self._load("d_items")
    return item_dictionary

d_labitems

d_labitems()

Load the lab item dictionary (itemid → label).

Use this to resolve itemid values in :meth:labevents.

Returns:

Type Description
DataFrame

Columns: itemid, label, fluid, category, loinc_code.

Source code in clinops/ingest/mimic_iii.py
def d_labitems(self) -> pd.DataFrame:
    """
    Return the ``d_labitems`` dictionary table (``itemid`` → label).

    Handy for translating the ``itemid`` values that appear in
    :meth:`labevents` into readable labels.

    Returns
    -------
    pd.DataFrame
        Columns: ``itemid``, ``label``, ``fluid``, ``category``,
        ``loinc_code``.
    """
    lab_dictionary = self._load("d_labitems")
    return lab_dictionary

clinops.ingest.fhir.FHIRLoader

FHIRLoader(source)

Load FHIR R4 resources from JSON bundles or NDJSON exports.

Parameters:

Name Type Description Default
source str | Path

Path to a FHIR JSON Bundle file, an NDJSON file, or a directory of JSON resource files.

required

Examples:

>>> loader = FHIRLoader("/data/fhir_export")
>>> observations = loader.observations()
>>> patients     = loader.patients()
Source code in clinops/ingest/fhir.py
def __init__(self, source: str | Path) -> None:
    """
    Remember where the FHIR export lives and fail fast if it is absent.

    Parameters
    ----------
    source:
        Location of the FHIR data; converted to a ``Path`` and stored on
        the instance as ``_source``.

    Raises
    ------
    FileNotFoundError
        If ``source`` does not exist on disk.
    """
    resolved = Path(source)
    self._source = resolved
    # Existence is checked eagerly so a bad path surfaces at construction
    # time rather than on the first load call.
    if not resolved.exists():
        raise FileNotFoundError(f"FHIR source not found: {resolved}")
    logger.info(f"FHIRLoader initialised — source={resolved}")

patients

patients()

Load Patient resources → DataFrame with demographics.

Source code in clinops/ingest/fhir.py
def patients(self) -> pd.DataFrame:
    """
    Load Patient resources → DataFrame with demographics.

    Returns
    -------
    pd.DataFrame
        One row per Patient resource with columns ``patient_id``,
        ``gender``, ``birth_date`` and ``deceased``.  The columns are
        present even when no Patient resources are found, so callers can
        index them without special-casing empty exports.
    """
    columns = ["patient_id", "gender", "birth_date", "deceased"]
    rows = []
    for resource in self._load_resources("Patient"):
        # FHIR models death as the choice element ``deceased[x]``: a
        # resource carries either ``deceasedBoolean`` or
        # ``deceasedDateTime``.  A recorded death date therefore also
        # means the patient is deceased; only fall back to False when
        # neither form is present.
        if "deceasedBoolean" in resource:
            deceased = resource["deceasedBoolean"]
        else:
            deceased = resource.get("deceasedDateTime") is not None
        rows.append(
            {
                "patient_id": resource.get("id"),
                "gender": resource.get("gender"),
                "birth_date": resource.get("birthDate"),
                "deceased": deceased,
            }
        )
    # Passing ``columns`` keeps the schema stable for an empty row list.
    return pd.DataFrame(rows, columns=columns)

observations

observations(category=None, loinc_codes=None)

Load Observation resources → long-format DataFrame.

Parameters:

Name Type Description Default
category str | None

Filter to a FHIR observation category (e.g. "vital-signs", "laboratory").

None
loinc_codes list[str] | None

Filter to specific LOINC codes.

None
Source code in clinops/ingest/fhir.py
def observations(
    self,
    category: str | None = None,
    loinc_codes: list[str] | None = None,
) -> pd.DataFrame:
    """
    Load Observation resources → long-format DataFrame.

    Parameters
    ----------
    category:
        Filter to a FHIR observation category (e.g. "vital-signs", "laboratory").
    loinc_codes:
        Filter to specific LOINC codes.
    """
    records = self._load_resources("Observation")
    rows = []
    for r in records:
        code_obj = r.get("code", {})
        codings = code_obj.get("coding", [])
        loinc = next(
            (c["code"] for c in codings if "loinc" in c.get("system", "").lower()), None
        )
        value = r.get("valueQuantity", {})
        rows.append(
            {
                "observation_id": r.get("id"),
                "patient_id": r.get("subject", {}).get("reference", "").split("/")[-1],
                "loinc_code": loinc,
                "display": code_obj.get("text"),
                "value": value.get("value"),
                "unit": value.get("unit"),
                "effective_time": r.get("effectiveDateTime"),
                "status": r.get("status"),
            }
        )
    df = pd.DataFrame(rows)
    if loinc_codes:
        df = df[df["loinc_code"].isin(loinc_codes)]
    return df

conditions

conditions()

Load Condition resources → DataFrame with ICD/SNOMED codes.

Source code in clinops/ingest/fhir.py
def conditions(self) -> pd.DataFrame:
    """
    Load Condition resources → DataFrame with ICD/SNOMED codes.

    Returns
    -------
    pd.DataFrame
        Columns: ``condition_id``, ``patient_id``, ``code``, ``system``,
        ``display``, ``clinical_status``, ``onset``.  Only the first coding
        of each resource's ``code`` and ``clinicalStatus`` is used.  The
        columns are present even when no Condition resources are found.
    """
    columns = [
        "condition_id",
        "patient_id",
        "code",
        "system",
        "display",
        "clinical_status",
        "onset",
    ]
    rows = []
    for resource in self._load_resources("Condition"):
        codings = (resource.get("code") or {}).get("coding") or []
        # An empty dict stands in for a missing coding so the ``.get``
        # lookups below simply yield None.
        primary = codings[0] if codings else {}
        # ``or [{}]`` also covers an explicitly empty ``coding`` list,
        # which would otherwise raise IndexError on ``[0]``.
        status_codings = (resource.get("clinicalStatus") or {}).get("coding") or [{}]
        reference = (resource.get("subject") or {}).get("reference") or ""
        rows.append(
            {
                "condition_id": resource.get("id"),
                # References look like "Patient/<id>"; keep the trailing id.
                "patient_id": reference.split("/")[-1],
                "code": primary.get("code"),
                "system": primary.get("system"),
                "display": primary.get("display"),
                "clinical_status": status_codings[0].get("code"),
                "onset": resource.get("onsetDateTime"),
            }
        )
    # Passing ``columns`` keeps the schema stable for an empty row list.
    return pd.DataFrame(rows, columns=columns)

clinops.ingest.flat.FlatFileLoader

FlatFileLoader(
    path,
    schema=None,
    id_col=None,
    datetime_cols=None,
    strict=True,
)

Load clinical data from CSV or Parquet flat files with validation.

Parameters:

Name Type Description Default
path str | Path

Path to a CSV (.csv, .csv.gz) or Parquet (.parquet, .pq) file.

required
schema ClinicalSchema | None

Optional ClinicalSchema for validation after loading.

None
id_col str | None

Name of the patient/subject identifier column. Used for deduplication reporting.

None
datetime_cols list[str] | None

Column names to parse as datetimes. If None, auto-detection is attempted for columns with "time", "date", or "dt" in the name.

None
strict bool

If True, raise on schema violations. If False, warn and continue.

True

Examples:

>>> loader = FlatFileLoader("vitals_export.csv", id_col="patient_id")
>>> df = loader.load()
>>> print(loader.summary())
Source code in clinops/ingest/flat.py
def __init__(
    self,
    path: str | Path,
    schema: ClinicalSchema | None = None,
    id_col: str | None = None,
    datetime_cols: list[str] | None = None,
    strict: bool = True,
) -> None:
    """
    Store the loader configuration and verify the target file exists.

    Every argument is kept on the instance under a ``_``-prefixed
    attribute; ``path`` is converted to a ``Path`` first.  No data is
    read here — ``_loaded_df`` starts out as ``None``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist on disk.
    """
    file_path = Path(path)

    # Configuration supplied by the caller.
    self._path = file_path
    self._schema = schema
    self._id_col = id_col
    self._datetime_cols = datetime_cols
    self._strict = strict

    # Nothing has been loaded yet.
    self._loaded_df: pd.DataFrame | None = None

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

load

load()

Load the file, apply cleaning and validation, return DataFrame.

Returns:

Type Description
DataFrame
Source code in clinops/ingest/flat.py
def load(self) -> pd.DataFrame:
    """
    Read the file, run the cleaning steps, validate, and return the frame.

    The frame passes through ``_read_file``, ``_clean`` and
    ``_parse_datetimes`` in that order.  When a schema was supplied it is
    then validated with the loader's ``strict`` setting; in non-strict
    mode each reported violation is logged as a warning.  The result is
    also stored on the instance as ``_loaded_df``.

    Returns
    -------
    pd.DataFrame
    """
    logger.info(f"Loading flat file: {self._path}")

    # Read → clean → parse datetimes, each step feeding the next.
    frame = self._parse_datetimes(self._clean(self._read_file()))

    if self._schema:
        problems = self._schema.validate(frame, strict=self._strict)
        if not self._strict and problems:
            for problem in problems:
                logger.warning(problem)

    self._loaded_df = frame
    n_rows = len(frame)
    n_cols = len(frame.columns)
    logger.info(f"Loaded {n_rows:,} rows × {n_cols} cols from {self._path.name}")
    return frame

summary

summary()

Return a human-readable summary of the loaded DataFrame.

Source code in clinops/ingest/flat.py
def summary(self) -> str:
    """
    Describe the most recently loaded DataFrame as multi-line text.

    Reports the file name, row and column counts, a dtype tally, and the
    columns that contain nulls.  A unique-ID count is appended when an ID
    column was configured and is present in the frame.  If nothing has
    been loaded yet, a short hint to call ``.load()`` is returned instead.
    """
    frame = self._loaded_df
    if frame is None:
        return "No data loaded yet. Call .load() first."

    # Keep only the columns that actually contain at least one null.
    nulls_per_column = frame.isna().sum()
    columns_with_nulls = nulls_per_column[nulls_per_column > 0].to_dict()

    report = [
        f"File:        {self._path.name}",
        f"Rows:        {len(frame):,}",
        f"Columns:     {len(frame.columns)}",
        f"Dtypes:      {frame.dtypes.value_counts().to_dict()}",
        f"Null cols:   {columns_with_nulls or 'None'}",
    ]

    id_column = self._id_col
    if id_column and id_column in frame.columns:
        report.append(f"Unique IDs:  {frame[id_column].nunique():,}")

    return "\n".join(report)

clinops.ingest.schema.ClinicalSchema dataclass

ClinicalSchema(
    name, columns=list(), allow_extra_columns=True
)

Declarative schema for a clinical data table.

Parameters:

Name Type Description Default
name str

Human-readable name for this schema (used in error messages).

required
columns list[ColumnSpec]

List of ColumnSpec objects describing required and optional columns.

list()
allow_extra_columns bool

If True (default), columns not in the spec are silently allowed.

True
Example

>>> schema = ClinicalSchema(
...     name="vitals",
...     columns=[
...         ColumnSpec("subject_id", dtype="int64", nullable=False),
...         ColumnSpec("heart_rate", dtype="float64", min_value=0, max_value=300),
...     ]
... )
>>> schema.validate(df)

validate

validate(df, strict=True)

Validate a DataFrame against this schema.

Parameters:

Name Type Description Default
df DataFrame

DataFrame to validate.

required
strict bool

If True, raise SchemaValidationError on the first violation. If False, collect all violations and return them as a list.

True

Returns:

Type Description
list[str]

Empty list if valid; list of violation messages otherwise.

Source code in clinops/ingest/schema.py
def validate(self, df: pd.DataFrame, strict: bool = True) -> list[str]:
    """
    Validate a DataFrame against this schema.

    Each declared column is checked for presence, nullability, numeric
    range and allowed values.  Null entries are judged only by the
    ``nullable`` flag: they are never counted as "not in allowed_values".
    When ``allow_extra_columns`` is False, columns that are not declared
    in the schema are reported as well.

    Parameters
    ----------
    df:
        DataFrame to validate.
    strict:
        If True, raise SchemaValidationError on the first violation.
        If False, collect all violations and return them as a list.

    Returns
    -------
    list[str]
        Empty list if valid; list of violation messages otherwise.

    Raises
    ------
    SchemaValidationError
        On the first violation when ``strict`` is True.
    """
    violations: list[str] = []

    def report(detail: str) -> None:
        # Single place that records a violation and, in strict mode,
        # aborts immediately with the same message.
        msg = f"[{self.name}] {detail}"
        violations.append(msg)
        if strict:
            raise SchemaValidationError(msg)

    for spec in self.columns:
        if spec.name not in df.columns:
            report(f"Missing required column: '{spec.name}'")
            continue

        col = df[spec.name]

        # Nullability check
        if not spec.nullable:
            null_count = int(col.isna().sum())
            if null_count:
                report(
                    f"Column '{spec.name}' has "
                    f"{null_count} null values (nullable=False)"
                )

        # Range checks (numeric only)
        if pd.api.types.is_numeric_dtype(col):
            if spec.min_value is not None:
                below = int((col < spec.min_value).sum())
                if below:
                    report(
                        f"Column '{spec.name}' has {below} values "
                        f"below min_value={spec.min_value}"
                    )
            if spec.max_value is not None:
                above = int((col > spec.max_value).sum())
                if above:
                    report(
                        f"Column '{spec.name}' has {above} values "
                        f"above max_value={spec.max_value}"
                    )

        # Allowed values check.  ``isin`` is False for nulls, so without
        # the ``notna`` mask every missing entry in a nullable column
        # would be miscounted as a disallowed value.
        if spec.allowed_values:
            invalid = col.notna() & ~col.isin(spec.allowed_values)
            invalid_count = int(invalid.sum())
            if invalid_count:
                report(
                    f"Column '{spec.name}' has {invalid_count} values "
                    f"not in allowed_values={spec.allowed_values}"
                )

    # Honour the schema-level switch for undeclared columns.
    if not self.allow_extra_columns:
        declared = {spec.name for spec in self.columns}
        extras = [name for name in df.columns if name not in declared]
        if extras:
            report(f"Unexpected columns not declared in schema: {extras}")

    return violations

clinops.ingest.schema.ColumnSpec dataclass

ColumnSpec(
    name,
    dtype=None,
    nullable=True,
    min_value=None,
    max_value=None,
    allowed_values=list(),
)

Specification for a single column in a clinical table.

clinops.ingest.schema.SchemaValidationError

Bases: ValueError

Raised when a loaded table does not match the expected schema.