Source code for hpotk.ontology.load.obographs._load

import re
import json
import typing
import logging

from hpotk.model import TermId, MinimalTerm, Term
from hpotk.graph import OntologyGraph
from hpotk.graph import GraphFactory, CsrIndexedGraphFactory, OWL_THING
from hpotk.ontology import (
    MinimalOntology,
    Ontology,
    create_ontology,
    create_minimal_ontology,
)
from hpotk.util import open_text_io_handle_for_reading

from ._model import create_node, create_edge
from ._model import Node, Edge, NodeType
from ._factory import (
    MinimalTermFactory,
    TermFactory,
    ObographsTermFactory,
    MINIMAL_TERM,
)

logger = logging.getLogger(__name__)

# TODO: verify PURL works for other ontologies than HPO
# TODO: thoroughly test the PURL pattern
# A pattern to match an obolibrary PURL. The PURL is expected to have 3 parts: `prefix`, `id`, and `curie`.
# The `curie` is `prefix` + '_' + `id` (e.g. `HP_0002813` in `http://purl.obolibrary.org/obo/HP_0002813`).
PURL_PATTERN = re.compile(r"http://purl\.obolibrary\.org/obo/(?P<curie>(?P<prefix>\w+)_(?P<id>\w+))")
# A pattern to find a `YYYY-MM-DD` date enclosed between two slashes; the date is exposed as the `date` group.
DATE_PATTERN = re.compile(r".*/(?P<date>\d{4}-\d{2}-\d{2})/.*")



[docs]
def load_minimal_ontology(
    file: typing.Union[typing.IO, str],
    term_factory: ObographsTermFactory[MinimalTerm] = MinimalTermFactory(),
    graph_factory: GraphFactory = CsrIndexedGraphFactory(),
    prefixes_of_interest: typing.Set[str] = {"HP"},
) -> MinimalOntology:
    """
    Load a `MinimalOntology` from an Obographs JSON document.

    This is a thin wrapper that delegates to `_load_impl` and assembles the result
    with `create_minimal_ontology`.

    :param file: an IO handle or a `str` handed over to `open_text_io_handle_for_reading`.
    :param term_factory: the factory used to create a term from each class node.
    :param graph_factory: the factory used to build the ontology graph from the `is_a` edge list.
    :param prefixes_of_interest: only the terms whose term ID prefix is in this set are loaded.
      The default set is shared between calls and it is only read here.
    :return: the loaded minimal ontology.
    """
    return _load_impl(
        file=file,
        term_factory=term_factory,
        graph_factory=graph_factory,
        prefixes_of_interest=prefixes_of_interest,
        ontology_creator=create_minimal_ontology,
    )




[docs]
def load_ontology(
    file: typing.Union[typing.IO, str],
    term_factory: ObographsTermFactory[Term] = TermFactory(),
    graph_factory: GraphFactory = CsrIndexedGraphFactory(),
    prefixes_of_interest: typing.Set[str] = {"HP"},
) -> Ontology:
    """
    Load an `Ontology` from an Obographs JSON document.

    This is a thin wrapper that delegates to `_load_impl` and assembles the result
    with `create_ontology`.

    :param file: an IO handle or a `str` handed over to `open_text_io_handle_for_reading`.
    :param term_factory: the factory used to create a term from each class node.
    :param graph_factory: the factory used to build the ontology graph from the `is_a` edge list.
    :param prefixes_of_interest: only the terms whose term ID prefix is in this set are loaded.
      The default set is shared between calls and it is only read here.
    :return: the loaded ontology.
    """
    return _load_impl(
        file=file,
        term_factory=term_factory,
        graph_factory=graph_factory,
        prefixes_of_interest=prefixes_of_interest,
        ontology_creator=create_ontology,
    )



def _load_impl(
    file: typing.Union[typing.IO, str],
    term_factory: ObographsTermFactory[MINIMAL_TERM],
    graph_factory: GraphFactory,
    prefixes_of_interest: typing.Set[str],
    ontology_creator,
):
    """
    Load an ontology from an Obographs JSON document.

    The steps are: read the single graph from the document, extract the terms from the graph nodes,
    turn the `is_a` edges into an edge list, build the ontology graph, extract the version,
    and assemble the ontology with `ontology_creator`.

    :param file: an IO handle or a `str` handed over to `open_text_io_handle_for_reading`.
    :param term_factory: the factory used to create a term from each class node.
    :param graph_factory: the factory used to build the ontology graph from the edge list.
    :param prefixes_of_interest: only the terms whose term ID prefix is in this set are loaded.
    :param ontology_creator: a callable invoked as `ontology_creator(ontology_graph, terms, version)`.
    :return: whatever `ontology_creator` returns.
    :raises ValueError: if the JSON document does not contain exactly one graph.
    :raises KeyError: if the graph lacks the `nodes` or `edges` attribute.
    """
    obograph = get_obographs_graph(file)
    logger.debug("Extracting ontology terms")
    id_to_term_id, terms = extract_terms(
        obograph["nodes"],
        term_factory,
        prefixes_of_interest=prefixes_of_interest,
    )
    logger.debug("Creating the edge list")
    edge_list = create_edge_list(obograph["edges"], id_to_term_id)
    logger.debug("Building ontology graph")
    ontology_graph: OntologyGraph = graph_factory.create_graph(edge_list)
    if ontology_graph.root == OWL_THING:
        # TODO: - consider adding Owl thing into terms list
        pass
    # The version is optional, so a graph with missing (or `null`) `meta` should not abort the loading.
    # We fall back to an empty mapping and let the version extraction report that no version was found.
    meta = obograph.get("meta") or {}
    version = extract_ontology_version(meta)
    logger.debug("Assembling the ontology")
    ontology = ontology_creator(ontology_graph, terms, version)
    logger.debug("Done")
    return ontology


def get_obographs_graph(file: typing.Union[typing.IO, str]):
    """
    Read the Obographs JSON document and return its only graph.

    :param file: an IO handle or a `str` handed over to `open_text_io_handle_for_reading`.
    :return: the first and only item of the `graphs` attribute of the JSON document.
    :raises ValueError: if the document is not a `dict`, if the `graphs` attribute is missing
      or is not a sequence, or if `graphs` does not contain exactly one graph.
    """
    with open_text_io_handle_for_reading(file) as handle:
        document = json.load(handle)

    # Validate the document structure, one guard at a time.
    if not isinstance(document, dict):
        raise ValueError(f"The JSON document should have been a dict but was {type(document)}")
    if "graphs" not in document:
        raise ValueError("Did not find the `graphs` attribute in the JSON document")

    graphs = document["graphs"]
    if not isinstance(graphs, typing.Sequence):
        raise ValueError("`graphs` JSON attribute is not a sequence")

    n_graphs = len(graphs)
    if n_graphs == 1:
        # The happy path
        return graphs[0]
    if n_graphs < 1:
        raise ValueError("`graphs` JSON attribute is empty")
    raise ValueError(f"We expect exactly 1 graph but there are {n_graphs} graphs in the JSON document")


def extract_terms(
    nodes: typing.Iterable[dict],
    term_factory: ObographsTermFactory[MINIMAL_TERM],
    prefixes_of_interest: typing.Set[str],
) -> typing.Tuple[typing.Mapping[str, TermId], typing.Sequence[MINIMAL_TERM]]:
    """
    Extract the terms of interest from the Obographs nodes.

    A node is skipped unless it maps to a class `Node`, its ID is a PURL from which a CURIE can be extracted,
    and the prefix of the resulting term ID is in `prefixes_of_interest`.

    :param nodes: an iterable with the node data.
    :param term_factory: the factory for creating a term from a term ID and a `Node`.
    :param prefixes_of_interest: the term ID prefixes to retain.
    :return: a tuple with a mapping from CURIE to term ID and a sequence of the created terms.
      Note that a term ID is put into the mapping even if the factory creates no term for it.
    """
    term_ids_by_curie: typing.Dict[str, TermId] = {}
    collected: typing.List[MINIMAL_TERM] = []

    for payload in nodes:
        node: typing.Optional[Node] = create_node(payload)

        # Only the class nodes are relevant.
        if not node:
            continue
        if node.type != NodeType.CLASS:
            continue

        # The node ID must be a well-formed PURL.
        curie = extract_curie_from_purl(node.id)
        if not curie:
            logger.debug("Unable to extract CURIE from PURL %s", node.id)
            continue

        # Keep only the terms with a prefix we care about.
        term_id = TermId.from_curie(curie)
        if term_id.prefix not in prefixes_of_interest:
            logger.debug("Skipping not a term of interest %s", term_id.value)
            continue

        # Register the term ID first, then try to create the term.
        term_ids_by_curie[curie] = term_id
        term = term_factory.create_term(term_id, node)
        if term:
            collected.append(term)

    return term_ids_by_curie, collected


def create_edge_list(
    edges: typing.Iterable[typing.Dict[str, str]],
    curie_to_termid: typing.Mapping[str, TermId],
) -> typing.List[typing.Tuple[TermId, TermId]]:
    """
    Turn the Obographs edges into a list of `(subject, object)` term ID pairs.

    Only the edges with the `is_a` predicate are used. An edge is skipped if a CURIE cannot be extracted
    from its subject or object PURL, or if the CURIE is not present in `curie_to_termid`.

    :param edges: an iterable with the edge data.
    :param curie_to_termid: a mapping from CURIE to the term ID of a known term.
    :return: a list of `(subject, object)` term ID tuples.
    """
    pairs: typing.List[typing.Tuple[TermId, TermId]] = []

    for payload in edges:
        edge: Edge = create_edge(payload)

        # Anything but `is_a` is ignored.
        if edge.pred != "is_a":
            logger.debug("Skipping edge with pred %s!='is_a'", edge.pred)
            continue

        # Resolve the subject first. The object is not inspected at all if the subject fails.
        sub_curie = extract_curie_from_purl(edge.sub)
        if sub_curie is None:
            logger.warning("Unable to extract CURIE from sub PURL %s", edge.sub)
            continue
        try:
            subject: TermId = curie_to_termid[sub_curie]
        except KeyError:
            logger.debug(
                "Skipping edge %s %s %s because subject %s was was not found in terms",
                edge.sub, edge.pred, edge.obj, edge.sub,
            )
            continue

        # Now resolve the object.
        obj_curie = extract_curie_from_purl(edge.obj)
        if obj_curie is None:
            logger.warning("Unable to extract CURIE from obj PURL %s", edge.obj)
            continue
        try:
            obj: TermId = curie_to_termid[obj_curie]
        except KeyError:
            logger.debug(
                "Skipping edge %s %s %s because object %s was was not found in terms",
                edge.sub, edge.pred, edge.obj, edge.obj,
            )
            continue

        pairs.append((subject, obj))

    return pairs


def extract_curie_from_purl(purl: str) -> typing.Optional[str]:
    """
    Extract the CURIE (e.g. `HP_0002813`) from an obolibrary PURL
    (e.g. `http://purl.obolibrary.org/obo/HP_0002813`).

    The PURL is matched with `PURL_PATTERN` from the start of the string.

    :param purl: the PURL to parse.
    :return: the CURIE `str` or `None` if the PURL does not match the pattern.
    """
    match = PURL_PATTERN.match(purl)
    if match is None:
        return None
    return match.group("curie")


def extract_ontology_version(meta: dict) -> typing.Optional[str]:
    """
    Extract the ontology version from the `meta` attribute of an Obographs graph.

    Two sources are tried in this order:

    1. the `version` attribute, which is searched for a date with `DATE_PATTERN`,
    2. the `basicPropertyValues` attribute, where the value of the first item
       with a predicate ending with `#versionInfo` is returned as is.

    The second source is consulted also if `version` is present but contains no date.

    :param meta: a `dict` with the graph metadata.
    :return: the version `str` or `None` if the version cannot be determined.
    """
    if "version" in meta:
        # A line like this:
        # 'http://purl.obolibrary.org/obo/hp/releases/2022-10-05/hp.json'
        match = DATE_PATTERN.search(meta["version"])
        if match:
            return match.group("date")
        # Do not give up yet, the version info may still be available as a basic property value.
        logger.debug("Could not find a date pattern in version %s", meta["version"])

    if "basicPropertyValues" in meta:
        for bpv in meta["basicPropertyValues"]:
            # An item like this:
            # {
            #   "pred": "http://www.w3.org/2002/07/owl#versionInfo",
            #   "val": "2022-10-05"
            # }
            if "pred" in bpv and "val" in bpv and bpv["pred"].endswith("#versionInfo"):
                return bpv["val"]

        logger.debug("Could not find basic property value with the version info")
        return None

    logger.debug("Could not determine the ontology version")
    return None