Source code for hpotk.ontology.load.obographs._load

import re
import json
import typing
import logging

from hpotk.model import TermId, MinimalTerm, Term
from hpotk.graph import OntologyGraph
from hpotk.graph import GraphFactory, CsrIndexedGraphFactory, OWL_THING
from hpotk.ontology import MinimalOntology, Ontology, create_ontology, create_minimal_ontology
from hpotk.util import open_text_io_handle_for_reading

from ._model import create_node, create_edge
from ._model import Node, Edge, NodeType
from ._factory import MinimalTermFactory, TermFactory, ObographsTermFactory, MINIMAL_TERM

logger = logging.getLogger(__name__)

# TODO: verify PURL works for other ontologies than HPO
# TODO: thoroughly test the PURL pattern
# A pattern to match an obolibrary PURL. The PURL is expected to have 3 parts: `prefix`, `id`, and `curie`.
# The `curie` is `prefix` + '_' + `id`.
PURL_PATTERN = re.compile(r'http://purl\.obolibrary\.org/obo/(?P<curie>(?P<prefix>\w+)_(?P<id>\w+))')
DATE_PATTERN = re.compile(r'.*/(?P<date>\d{4}-\d{2}-\d{2})/.*')


def load_minimal_ontology(
    file: typing.Union[typing.IO, str],
    term_factory: ObographsTermFactory[MinimalTerm] = MinimalTermFactory(),
    graph_factory: GraphFactory = CsrIndexedGraphFactory(),
    prefixes_of_interest: typing.Set[str] = {'HP'},
) -> MinimalOntology:
    """
    Load a minimal ontology from an Obographs JSON document.

    The heavy lifting is delegated to `_load_impl`; this function only selects
    `create_minimal_ontology` as the function that assembles the final ontology.

    Note that the default factories and the default prefix set are created once,
    when the function is defined, and are shared by all calls that rely on the defaults.

    :param file: an open text IO handle or a `str` that points to the Obographs JSON.
    :param term_factory: the factory for mapping Obographs nodes into minimal terms.
    :param graph_factory: the factory for building the ontology graph from the edge list.
    :param prefixes_of_interest: only the terms whose term ID prefix is in this set are loaded.
    :return: the minimal ontology.
    """
    return _load_impl(
        file=file,
        term_factory=term_factory,
        graph_factory=graph_factory,
        prefixes_of_interest=prefixes_of_interest,
        ontology_creator=create_minimal_ontology,
    )


def load_ontology(
    file: typing.Union[typing.IO, str],
    term_factory: ObographsTermFactory[Term] = TermFactory(),
    graph_factory: GraphFactory = CsrIndexedGraphFactory(),
    prefixes_of_interest: typing.Set[str] = {'HP'},
) -> Ontology:
    """
    Load an ontology from an Obographs JSON document.

    The heavy lifting is delegated to `_load_impl`; this function only selects
    `create_ontology` as the function that assembles the final ontology.

    Note that the default factories and the default prefix set are created once,
    when the function is defined, and are shared by all calls that rely on the defaults.

    :param file: an open text IO handle or a `str` that points to the Obographs JSON.
    :param term_factory: the factory for mapping Obographs nodes into terms.
    :param graph_factory: the factory for building the ontology graph from the edge list.
    :param prefixes_of_interest: only the terms whose term ID prefix is in this set are loaded.
    :return: the ontology.
    """
    return _load_impl(
        file=file,
        term_factory=term_factory,
        graph_factory=graph_factory,
        prefixes_of_interest=prefixes_of_interest,
        ontology_creator=create_ontology,
    )


def _load_impl(
    file: typing.Union[typing.IO, str],
    term_factory: ObographsTermFactory[MINIMAL_TERM],
    graph_factory: GraphFactory,
    prefixes_of_interest: typing.Set[str],
    ontology_creator,
):
    """
    Load an Obographs JSON document and assemble an ontology.

    The steps are: read the single graph from the JSON document, extract the terms
    from its `nodes`, build the `is_a` edge list from its `edges`, create the ontology graph,
    extract the ontology version, and hand everything over to `ontology_creator`.

    :param file: an open text IO handle or a `str` that points to the Obographs JSON.
    :param term_factory: the factory for mapping Obographs nodes into terms.
    :param graph_factory: the factory for building the ontology graph from the edge list.
    :param prefixes_of_interest: only the terms whose term ID prefix is in this set are loaded.
    :param ontology_creator: a callable invoked as `ontology_creator(ontology_graph, terms, version)`.
    :return: whatever the `ontology_creator` returns.
    :raises ValueError: if the JSON document does not contain exactly one graph.
    """
    obograph = get_obographs_graph(file)

    logger.debug("Extracting ontology terms")
    id_to_term_id, terms = extract_terms(
        obograph['nodes'],
        term_factory,
        prefixes_of_interest=prefixes_of_interest,
    )

    logger.debug("Creating the edge list")
    edge_list = create_edge_list(obograph['edges'], id_to_term_id)

    logger.debug("Building ontology graph")
    ontology_graph: OntologyGraph = graph_factory.create_graph(edge_list)
    if ontology_graph.root == OWL_THING:
        # TODO: - consider adding Owl thing into terms list
        pass

    # Tolerate a graph without a `meta` object (or with `meta: null`):
    # the version is then reported as unknown (`None`) instead of failing with a `KeyError`.
    version = extract_ontology_version(obograph.get('meta') or {})

    logger.debug("Assembling the ontology")
    ontology = ontology_creator(ontology_graph, terms, version)

    logger.debug("Done")
    return ontology


def get_obographs_graph(file: typing.Union[typing.IO, str]):
    """
    Read the Obographs JSON document and return its only graph.

    :param file: an open text IO handle or a `str` that points to the Obographs JSON.
    :return: the first (and only) item of the `graphs` attribute of the JSON document.
    :raises ValueError: if the document is not a JSON object, if the `graphs` attribute is missing,
      is not a sequence, or if it does not contain exactly one graph.
    """
    with open_text_io_handle_for_reading(file) as fh:
        document = json.load(fh)

    if not isinstance(document, dict):
        raise ValueError(f'The JSON document should have been a dict but was {type(document)}')
    if 'graphs' not in document:
        raise ValueError('Did not find the `graphs` attribute in the JSON document')

    graphs = document['graphs']
    # A `str` is a `Sequence` too, but a JSON string is certainly not a list of graphs.
    if isinstance(graphs, str) or not isinstance(graphs, typing.Sequence):
        raise ValueError('`graphs` JSON attribute is not a sequence')

    n_graphs = len(graphs)
    if n_graphs < 1:
        raise ValueError('`graphs` JSON attribute is empty')
    if n_graphs > 1:
        raise ValueError(f'We expect exactly 1 graph but there are {n_graphs} graphs in the JSON document')

    # The happy path
    return graphs[0]


def extract_terms(
    nodes: typing.Iterable[dict],
    term_factory: ObographsTermFactory[MINIMAL_TERM],
    prefixes_of_interest: typing.Set[str],
) -> typing.Tuple[typing.Mapping[str, TermId], typing.Sequence[MINIMAL_TERM]]:
    """
    Map the Obographs nodes into terms.

    Only the class nodes with a well-formed PURL and with a prefix of interest are considered.

    :param nodes: an iterable with the node data, as found in the `nodes` attribute of the graph.
    :param term_factory: the factory for mapping a node into a term.
    :param prefixes_of_interest: only the terms whose term ID prefix is in this set are kept.
    :return: a tuple with two items: a mapping from the CURIE (as extracted from the node PURL)
      to the corresponding term ID, and a sequence of the created terms.
    """
    curie_to_term: typing.Dict[str, TermId] = {}
    terms: typing.List[MINIMAL_TERM] = []
    for data in nodes:
        # 1) map data to `Node`
        node: typing.Optional[Node] = create_node(data)

        # 2) we only work with class Nodes
        if not node or node.type != NodeType.CLASS:
            continue

        # 3) check if PURL is OK
        curie = extract_curie_from_purl(node.id)
        if not curie:
            logger.debug('Unable to extract CURIE from PURL %s', node.id)
            continue
        term_id = TermId.from_curie(curie)
        if term_id.prefix not in prefixes_of_interest:
            logger.debug('Skipping not a term of interest %s', term_id.value)
            continue

        # The term ID is registered even if the factory fails to create the term below.
        curie_to_term[curie] = term_id

        # 4) create the `Term`
        term = term_factory.create_term(term_id, node)
        if term:
            terms.append(term)

    return curie_to_term, terms


def create_edge_list(
    edges: typing.Iterable[typing.Dict[str, str]],
    curie_to_termid: typing.Mapping[str, TermId],
) -> typing.List[typing.Tuple[TermId, TermId]]:
    """
    Create a list of `(subject, object)` term ID pairs for the `is_a` edges.

    An edge is skipped if its predicate is not `is_a`, if the CURIE cannot be extracted
    from the subject or object PURL, or if the subject or object is not present in `curie_to_termid`.

    :param edges: an iterable with the edge data, as found in the `edges` attribute of the graph.
    :param curie_to_termid: a mapping from CURIE to term ID, as returned by `extract_terms`.
    :return: a list of `(subject, object)` term ID pairs.
    """
    edge_list: typing.List[typing.Tuple[TermId, TermId]] = []
    for data in edges:
        edge: Edge = create_edge(data)

        # We only care about `is_a` relationships.
        if edge.pred != 'is_a':
            logger.debug('Skipping edge with pred %s!=\'is_a\'', edge.pred)
            continue

        # Get source and destination.
        src_curie = extract_curie_from_purl(edge.sub)
        if src_curie is None:
            logger.warning('Unable to extract CURIE from sub PURL %s', edge.sub)
            continue
        try:
            src: TermId = curie_to_termid[src_curie]
        except KeyError:
            logger.debug(
                'Skipping edge %s %s %s because subject %s was not found in terms',
                edge.sub, edge.pred, edge.obj, edge.sub,
            )
            continue

        dest_curie = extract_curie_from_purl(edge.obj)
        if dest_curie is None:
            logger.warning('Unable to extract CURIE from obj PURL %s', edge.obj)
            continue
        try:
            dest: TermId = curie_to_termid[dest_curie]
        except KeyError:
            logger.debug(
                'Skipping edge %s %s %s because object %s was not found in terms',
                edge.sub, edge.pred, edge.obj, edge.obj,
            )
            continue

        edge_list.append((src, dest))

    return edge_list


def extract_curie_from_purl(purl: str) -> typing.Optional[str]:
    """
    Parse HPO PURL (e.g. `http://purl.obolibrary.org/obo/HP_0002813`) into the CURIE (e.g. `HP_0002813`).

    The prefix and the ID of the returned value are delimited by an underscore, as in the PURL.
    The PURL is matched from its start using `PURL_PATTERN`.

    Returns the CURIE `str` or `None` if the PURL is mis-formatted.
    """
    matcher = PURL_PATTERN.match(purl)
    return matcher.group('curie') if matcher else None


def extract_ontology_version(meta: dict) -> typing.Optional[str]:
    """
    Extract the ontology version from the `meta` object of the Obographs graph.

    The `version` attribute is used if present: the version is the date (`YYYY-MM-DD`) found
    in the attribute value. Otherwise, the value of the first basic property value
    whose predicate ends with `#versionInfo` is used.

    :param meta: the `meta` object of the graph.
    :return: the version `str` or `None` if the version cannot be determined.
    """
    if 'version' in meta:
        # A line like this:
        # 'http://purl.obolibrary.org/obo/hp/releases/2022-10-05/hp.json'
        match = DATE_PATTERN.search(meta['version'])
        if match:
            return match.group('date')
        # `basicPropertyValues` are deliberately not consulted if `version` is present.
        logger.debug('Could not find a date pattern in version %s', meta["version"])
        return None

    if 'basicPropertyValues' in meta:
        for bpv in meta['basicPropertyValues']:
            if 'pred' in bpv and 'val' in bpv and bpv['pred'].endswith('#versionInfo'):
                # An item like this:
                # {
                #   "pred": "http://www.w3.org/2002/07/owl#versionInfo",
                #   "val": "2022-10-05"
                # }
                return bpv['val']
        logger.debug('Could not find basic property value with the version info')
        return None

    logger.debug('Could not determine the ontology version')
    return None