# Source code for hpotk.algorithm.similarity._ic

import math
import typing
from collections import Counter

from hpotk.model import TermId
from hpotk.annotations import AnnotatedItemContainer
from hpotk.ontology import MinimalOntology
from hpotk.util import validate_instance
from ._model import AnnotationIcContainer



# [docs]
def calculate_ic_for_annotated_items(
    items: AnnotatedItemContainer,
    ontology: MinimalOntology,
    base: typing.Optional[float] = None,
    module_root: typing.Optional[TermId] = None,
    use_pseudocount: bool = False,
) -> AnnotationIcContainer:
    """
    Calculate information content (IC) for each :class:`TermId` based on a collection of annotated `items`.

    Every *present* annotation of an item adds one to the count of the annotated term and to the count
    of each of its ancestors. The IC of a term is then `log(population_count / term_count)`,
    where `population_count` is the count of the ontology root, or the count of `module_root` if it was provided.
    Annotations that are not present (`is_present` is false) are ignored.

    The calculation can be done for an ontology module - only the `module_root` and its descendants
    will be included in the analysis. If `use_pseudocount` is `True`, then the count of all ontology/module terms
    is set to at least 1, even for those terms that do not annotate the `items`.

    :param items: a collection of world items (e.g. diseases).
    :param ontology: ontology with concepts used to annotate the `items` (e.g. Human Phenotype Ontology for diseases).
    :param base: information content base or `None` for *e*
                 (produces IC in `nats <https://en.wikipedia.org/wiki/Nat_(unit)>`_).
                 The value is passed to :func:`math.log` as the logarithm base.
    :param module_root: the root of the ontology module to calculate the IC for
                        or `None` to use the entire ontology.
    :param use_pseudocount: assume that each ontology term annotates at least one item.

    :return: a container with mappings from :class:`TermId` to information content in nats, bits, or else,
             depending on the `base` value. Terms that received no count are not included in the container.
             The container metadata includes `annotated_items_version` and `ontology_version`
             if the respective versions are available.
    """
    ontology = validate_instance(ontology, MinimalOntology, "ontology")

    graph = ontology.graph
    # `None` means "no module restriction", otherwise the set of term IDs that belong to the module.
    module_term_ids: typing.Optional[typing.Set[TermId]] = (
        None if module_root is None else set(graph.get_descendants(module_root, include_source=True))
    )

    # Step 1: tally how many times each term is used as a present annotation.
    # Aggregating first lets us traverse the ancestors of each distinct term only once in step 2,
    # instead of once per annotation occurrence.
    annotation_count: Counter[TermId] = Counter()
    for item in items:
        for annotation in item.annotations:
            if not annotation.is_present:
                continue
            if module_term_ids is not None and annotation.identifier not in module_term_ids:
                # The annotation is not from the target module.
                continue
            annotation_count[annotation.identifier] += 1

    # Step 2: propagate the tallies to the ancestors (the annotated term included).
    term_id_count: Counter[TermId] = Counter()
    for term_id, n_annotations in annotation_count.items():
        for ancestor in graph.get_ancestors(term_id, include_source=True):
            # When doing a module, ancestors outside of the module are not counted.
            if module_term_ids is None or ancestor in module_term_ids:
                term_id_count[ancestor] += n_annotations

    if use_pseudocount:
        # Set the count of all primary term IDs to at least one but DO NOT increment the count of the terms
        # that already have count >= 1.
        # Note, in the HPO case, this will set count of non-phenotypic abnormalities (e.g. Clinical modifier)
        # to 1 as well.
        if module_term_ids is None:
            corpus: typing.Iterable[TermId] = (term.identifier for term in ontology.terms)
        else:
            corpus = module_term_ids

        for term_id in corpus:
            term_id_count.setdefault(term_id, 1)

    log_func = math.log if base is None else lambda c: math.log(c, base)

    # The root (of the ontology or of the module) is an ancestor of every counted term,
    # hence its count serves as the population size.
    population_count = term_id_count[graph.root if module_root is None else module_root]
    data = {term_id: log_func(population_count / count) for term_id, count in term_id_count.items()}

    metadata = {}
    if items.version is not None:
        metadata["annotated_items_version"] = items.version
    if ontology.version is not None:
        metadata["ontology_version"] = ontology.version

    return AnnotationIcContainer.from_mapping(data, metadata=metadata)