Source code for kgx.cli.cli_utils

import importlib

import os
from os.path import dirname, abspath

import sys
from multiprocessing import Pool
from typing import List, Tuple, Optional, Dict, Set, Any, Union
import yaml

from kgx.validator import Validator
from kgx.sink import Sink
from kgx.transformer import Transformer, SOURCE_MAP, SINK_MAP
from kgx.config import get_logger
from kgx.graph.base_graph import BaseGraph
from kgx.graph_operations.graph_merge import merge_all_graphs
from kgx.graph_operations import summarize_graph, meta_knowledge_graph
from kgx.utils.kgx_utils import apply_graph_operations, knowledge_provenance_properties


summary_report_types = {
    "kgx-map": summarize_graph.GraphSummary,
    "meta-knowledge-graph": meta_knowledge_graph.MetaKnowledgeGraph,
}

log = get_logger()


def get_input_file_types() -> Tuple:
    """
    Get all input file formats supported by KGX.

    Returns
    -------
    Tuple
        A tuple of supported file formats

    """
    return tuple(SOURCE_MAP.keys())


def get_output_file_types() -> Tuple:
    """
    Get all output file formats supported by KGX.

    Returns
    -------
    Tuple
        A tuple of supported file formats

    """
    return tuple(SINK_MAP.keys())


def get_report_format_types() -> Tuple:
    """
    Get all graph summary report formats supported by KGX.

    Returns
    -------
    Tuple
        A tuple of supported file formats

    """
    return "yaml", "json"


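# Illustrative sketch (not part of the module's public surface): the three helpers
# above are typically used to validate user-supplied format strings before any
# Transformer arguments are built. The exact tuple members depend on the
# SOURCE_MAP/SINK_MAP of the installed KGX version.
def _check_formats(input_format: str, output_format: str) -> None:
    if input_format not in get_input_file_types():
        raise ValueError(f"Unsupported input format: {input_format}")
    if output_format not in get_output_file_types():
        raise ValueError(f"Unsupported output format: {output_format}")

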
def graph_summary(
    inputs: List[str],
    input_format: str,
    input_compression: Optional[str],
    output: Optional[str],
    report_type: str,
    report_format: Optional[str] = None,
    stream: bool = False,
    graph_name: Optional[str] = None,
    node_facet_properties: Optional[List] = None,
    edge_facet_properties: Optional[List] = None,
    error_log: str = "",
) -> Dict:
    """
    Loads and summarizes a knowledge graph from a set of input files.

    Parameters
    ----------
    inputs: List[str]
        Input files
    input_format: str
        Input file format
    input_compression: Optional[str]
        The input compression type
    output: Optional[str]
        Where to write the output (stdout, by default)
    report_type: str
        The summary report type
    report_format: Optional[str]
        The summary report file format: 'yaml' or 'json'
    stream: bool
        Whether to parse input as a stream
    graph_name: str
        User specified name of graph being summarized
    node_facet_properties: Optional[List]
        A list of node properties from which to generate counts per value for those
        properties. For example, ``['provided_by']``
    edge_facet_properties: Optional[List]
        A list of edge properties (e.g. knowledge_source tags) to facet on.
        For example, ``['original_knowledge_source', 'aggregator_knowledge_source']``
    error_log: str
        Where to write any graph processing error message (stderr, by default)

    Returns
    -------
    Dict
        A dictionary with the graph stats

    """
    if not graph_name:
        graph_name = "Graph"

    if report_format and report_format not in get_report_format_types():
        raise ValueError(f"report_format must be one of {get_report_format_types()}")

    if report_type in summary_report_types:
        # New design pattern enabling 'stream' processing of statistics on a small memory footprint
        # by injecting an inspector in the Transformer.process() source-to-sink data flow.
        #
        # First, we instantiate the Inspector (generally, a Callable class)...
        #
        inspector = summary_report_types[report_type](
            # ...thus, there is no need to hand the Inspector the graph;
            # rather, the inspector will see the graph data after
            # being injected into the Transformer.transform() workflow
            # graph=transformer.store.graph,
            name=graph_name,
            node_facet_properties=node_facet_properties,
            edge_facet_properties=edge_facet_properties,
            error_log=error_log,
        )
    else:
        raise ValueError(f"report_type must be one of {summary_report_types.keys()}")

    if stream:
        output_args = {
            "format": "null"
        }  # streaming processing throws the graph data away
    else:
        output_args = None

    transformer = Transformer(stream=stream)

    transformer.transform(
        input_args={
            "filename": inputs,
            "format": input_format,
            "compression": input_compression,
        },
        output_args=output_args,
        # ... Second, we inject the Inspector into the transform() call,
        # for the underlying Transformer.process() to use...
        inspector=inspector,
    )

    if output:
        with open(output, "w") as gsr:
            inspector.save(gsr, file_format=report_format)
    else:
        inspector.save(sys.stdout, file_format=report_format)

    # ... Third, we directly return the graph statistics to the caller.
    return inspector.get_graph_summary()


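# Illustrative usage sketch (file names are hypothetical, not part of the KGX API):
# calling graph_summary() programmatically with stream=True computes the 'kgx-map'
# report while keeping the memory footprint small, since the graph itself is never retained.
def _example_graph_summary_usage() -> Dict:
    return graph_summary(
        inputs=["nodes.tsv", "edges.tsv"],  # hypothetical KGX TSV files
        input_format="tsv",
        input_compression=None,
        output="graph_stats.yaml",          # report destination; None would print to stdout
        report_type="kgx-map",
        report_format="yaml",
        stream=True,
    )

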
def validate(
    inputs: List[str],
    input_format: str,
    input_compression: Optional[str],
    output: Optional[str],
    stream: bool,
    biolink_release: Optional[str] = None,
) -> List:
    """
    Run KGX validator on an input file to check for Biolink Model compliance.

    Parameters
    ----------
    inputs: List[str]
        Input files
    input_format: str
        The input format
    input_compression: Optional[str]
        The input compression type
    output: Optional[str]
        Path to output file (stdout, by default)
    stream: bool
        Whether to parse input as a stream
    biolink_release: Optional[str]
        SemVer version of Biolink Model Release used for validation
        (default: latest Biolink Model Toolkit version)

    Returns
    -------
    List
        Returns a list of errors, if any

    """
    # New design pattern enabling 'stream' processing of statistics on a small memory footprint
    # by injecting an inspector in the Transformer.process() source-to-sink data flow.
    #
    # First, we instantiate a Validator() class (converted into a Callable class) as an Inspector...
    # In the new "Inspector" design pattern, we need to instantiate it before the Transformer.
    #
    Validator.set_biolink_model(biolink_release)

    # Validator assumes the currently set Biolink Release
    validator = Validator()

    if stream:
        transformer = Transformer(stream=stream)

        transformer.transform(
            input_args={
                "filename": inputs,
                "format": input_format,
                "compression": input_compression,
            },
            output_args={
                "format": "null"
            },  # streaming processing throws the graph data away
            # ... Second, we inject the Inspector into the transform() call,
            # for the underlying Transformer.process() to use...
            inspector=validator,
        )
    else:
        # "Classical" non-streaming mode, with click.progressbar
        # but an unfriendly large memory footprint for large graphs
        transformer = Transformer()

        transformer.transform(
            {
                "filename": inputs,
                "format": input_format,
                "compression": input_compression,
            },
        )

        # Slight tweak of the classical 'validate' function: the list of
        # errors is cached internally in the Validator object
        validator.validate(transformer.store.graph)

    if output:
        validator.write_report(open(output, "w"))
    else:
        validator.write_report(sys.stdout)

    # ... Third, we return directly any validation errors to the caller
    return validator.get_errors()


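# Illustrative usage sketch (hypothetical file names): validate() can be called
# programmatically; with stream=True the Validator inspects records as they flow
# through the Transformer instead of materializing the whole graph in memory.
def _example_validate_usage() -> List:
    return validate(
        inputs=["nodes.tsv", "edges.tsv"],
        input_format="tsv",
        input_compression=None,
        output="validation_errors.txt",
        stream=True,
        biolink_release=None,  # default: latest Biolink Model known to the toolkit
    )

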
def neo4j_download(
    uri: str,
    username: str,
    password: str,
    output: str,
    output_format: str,
    output_compression: Optional[str],
    stream: bool,
    node_filters: Optional[Tuple] = None,
    edge_filters: Optional[Tuple] = None,
) -> Transformer:
    """
    Download nodes and edges from a Neo4j database.

    Parameters
    ----------
    uri: str
        Neo4j URI. For example, https://localhost:7474
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)
    output_format: Optional[str]
        The output format (``tsv``, by default)
    output_compression: Optional[str]
        The output compression type
    stream: bool
        Whether to parse input as a stream
    node_filters: Optional[Tuple]
        Node filters
    edge_filters: Optional[Tuple]
        Edge filters

    Returns
    -------
    kgx.Transformer
        The Transformer

    """
    transformer = Transformer(stream=stream)
    transformer.transform(
        {
            "uri": uri,
            "username": username,
            "password": password,
            "format": "neo4j",
            "node_filters": node_filters,
            "edge_filters": edge_filters,
        }
    )
    if not output_format:
        output_format = "tsv"
    transformer.save(
        {"filename": output, "format": output_format, "compression": output_compression}
    )
    return transformer


def neo4j_upload(
    inputs: List[str],
    input_format: str,
    input_compression: Optional[str],
    uri: str,
    username: str,
    password: str,
    stream: bool,
    node_filters: Optional[Tuple] = None,
    edge_filters: Optional[Tuple] = None,
) -> Transformer:
    """
    Upload a set of nodes/edges to a Neo4j database.

    Parameters
    ----------
    inputs: List[str]
        A list of files that contain nodes/edges
    input_format: str
        The input format
    input_compression: Optional[str]
        The input compression type
    uri: str
        The full HTTP address for the Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    stream: bool
        Whether to parse input as a stream
    node_filters: Optional[Tuple]
        Node filters
    edge_filters: Optional[Tuple]
        Edge filters

    Returns
    -------
    kgx.Transformer
        The Transformer

    """
    transformer = Transformer(stream=stream)
    transformer.transform(
        {
            "filename": inputs,
            "format": input_format,
            "compression": input_compression,
            "node_filters": node_filters,
            "edge_filters": edge_filters,
        }
    )
    transformer.save(
        {"uri": uri, "username": username, "password": password, "format": "neo4j"}
    )
    return transformer


def _validate_files(cwd: str, file_paths: List[str], context: str = ""):
    """
    Utility method for resolving file paths.

    :param cwd: current working directory for resolving possible relative file path names
    :param file_paths: list of file path names to resolve
    :param context: optional source context of the file list
    :return: resolved list of file paths (as absolute paths)
    """
    resolved_files: List[str] = list()
    for f in file_paths:
        # check if the file exists as an absolute path
        if not os.path.exists(f):
            # otherwise, check if the file exists as a path
            # relative to the provided "current working directory"
            f = abspath(cwd + "/" + f)
            if not os.path.exists(f):
                raise FileNotFoundError(
                    f"Filename '{f}' for source '{context}' does not exist!"
                )
        if not os.path.isfile(f):
            raise FileNotFoundError(
                f"Filename '{f}' for source '{context}' is not a file!"
            )
        resolved_files.append(f)
    return resolved_files


def _process_knowledge_source(ksf: str, spec: str) -> Union[str, bool, Tuple]:
    if ksf not in knowledge_provenance_properties:
        log.warning("Unknown Knowledge Source Field: " + ksf + "... ignoring!")
        return False
    else:
        if spec.lower() == "true":
            return True
        elif spec.lower() == "false":
            return False
        else:
            # a Tuple specification is expected to be a comma-delimited string
            spec_parts = spec.split(",")
            if len(spec_parts) == 1:
                # assumed to be just a default string value for the knowledge source field
                return spec_parts[0]
            else:
                # assumed to be an InfoRes Tuple rewrite specification
                if len(spec_parts) > 3:
                    spec_parts = spec_parts[:2]
                return tuple(spec_parts)


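# Illustrative sketch of the three shapes _process_knowledge_source() can return,
# assuming 'aggregator_knowledge_source' is among the recognized provenance
# properties; the spec strings below are hypothetical examples.
def _example_knowledge_source_specs() -> None:
    # "true"/"false" toggle InfoRes inference for the field
    assert _process_knowledge_source("aggregator_knowledge_source", "True") is True
    # a plain string is taken as a fixed default value for the field
    assert (
        _process_knowledge_source("aggregator_knowledge_source", "infores:my-source")
        == "infores:my-source"
    )
    # a comma-delimited string becomes an InfoRes rewrite specification tuple
    assert _process_knowledge_source(
        "aggregator_knowledge_source", "Monarch,monarch"
    ) == ("Monarch", "monarch")

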
def transform(
    inputs: Optional[List[str]],
    input_format: Optional[str] = None,
    input_compression: Optional[str] = None,
    output: Optional[str] = None,
    output_format: Optional[str] = None,
    output_compression: Optional[str] = None,
    stream: bool = False,
    node_filters: Optional[List[Tuple[str, str]]] = None,
    edge_filters: Optional[List[Tuple[str, str]]] = None,
    transform_config: str = None,
    source: Optional[List] = None,
    knowledge_sources: Optional[List[Tuple[str, str]]] = None,
    # this parameter doesn't get used, but I leave it in
    # for now, in case it signifies an unimplemented concept
    # destination: Optional[List] = None,
    processes: int = 1,
    infores_catalog: Optional[str] = None,
) -> None:
    """
    Transform a Knowledge Graph from one serialization form to another.

    Parameters
    ----------
    inputs: Optional[List[str]]
        A list of files that contain nodes/edges
    input_format: Optional[str]
        The input format
    input_compression: Optional[str]
        The input compression type
    output: Optional[str]
        The output file
    output_format: Optional[str]
        The output format
    output_compression: Optional[str]
        The output compression type
    stream: bool
        Whether to parse input as a stream
    node_filters: Optional[List[Tuple[str, str]]]
        Node input filters
    edge_filters: Optional[List[Tuple[str, str]]]
        Edge input filters
    transform_config: Optional[str]
        The transform config YAML
    source: Optional[List]
        A list of sources to load from the YAML
    knowledge_sources: Optional[List[Tuple[str, str]]]
        A list of named knowledge sources with (string, boolean or tuple rewrite) specification
    processes: int
        Number of processes to use
    infores_catalog: Optional[str]
        Optional dump of a TSV file of InfoRes CURIE to Knowledge Source mappings
        (not yet available in transform_config calling mode)

    """
    if transform_config and inputs:
        raise ValueError("Can accept either --transform-config OR inputs, not both")

    output_directory = "output"

    if transform_config:
        # Use the directory within which the 'transform_config' file
        # exists as a 'current working directory' for
        # resolving relative filename paths in the configuration.
        cwd = dirname(transform_config)
        cfg = yaml.load(open(transform_config), Loader=yaml.FullLoader)
        top_level_args = {}
        if "configuration" in cfg:
            top_level_args = prepare_top_level_args(cfg["configuration"])
            if (
                "output_directory" in cfg["configuration"]
                and cfg["configuration"]["output_directory"]
            ):
                output_directory = cfg["configuration"]["output_directory"]
                if not output_directory.startswith(os.path.sep):
                    # relative path
                    output_directory = f"{os.path.abspath(os.path.dirname(transform_config))}{os.path.sep}{output_directory}"
        if not os.path.exists(output_directory):
            os.mkdir(output_directory)
        if not source:
            source = cfg["transform"]["source"].keys()
        for s in source:
            source_properties = cfg["transform"]["source"][s]
            if source_properties["input"]["format"] in get_input_file_types():
                source_properties["input"]["filename"] = _validate_files(
                    cwd=cwd,
                    file_paths=source_properties["input"]["filename"],
                    context=s,
                )
        source_to_parse = {}
        for key, val in cfg["transform"]["source"].items():
            if key in source:
                source_to_parse[key] = val

        results = []
        pool = Pool(processes=processes)
        for k, v in source_to_parse.items():
            log.info(f"Spawning process for '{k}'")
            result = pool.apply_async(
                transform_source,
                (
                    k,
                    v,
                    output_directory,
                    top_level_args["prefix_map"],
                    top_level_args["node_property_predicates"],
                    top_level_args["predicate_mappings"],
                    top_level_args["reverse_prefix_map"],
                    top_level_args["reverse_predicate_mappings"],
                    top_level_args["property_types"],
                    top_level_args["checkpoint"],
                    False,
                    stream,
                ),
            )
            results.append(result)
        pool.close()
        pool.join()
        graphs = [r.get() for r in results]
    else:
        source_dict: Dict = {
            "input": {
                "format": input_format,
                "compression": input_compression,
                "filename": inputs,
                "filters": {
                    "node_filters": node_filters,
                    "edge_filters": edge_filters,
                },
            },
            "output": {
                "format": output_format,
                "compression": output_compression,
                "filename": output,
            },
        }

        if knowledge_sources:
            for ksf, spec in knowledge_sources:
                ksf_spec = _process_knowledge_source(ksf, spec)
                if isinstance(ksf_spec, tuple):
                    if ksf not in source_dict["input"]:
                        source_dict["input"][ksf] = dict()
                    if isinstance(source_dict["input"][ksf], dict):
                        key = ksf_spec[0]
                        source_dict["input"][ksf][key] = ksf_spec
                    else:
                        # Unexpected condition - mixing static values with tuple specified rewrites?
                        raise RuntimeError(
                            "Inconsistent multivalued specifications: make sure that all the values "
                            + "of the knowledge source tag '"
                            + ksf
                            + "' are all rewrite specifications!"
                        )
                else:
                    source_dict["input"][ksf] = ksf_spec

        name = os.path.basename(inputs[0])
        transform_source(
            key=name,
            source=source_dict,
            output_directory=None,
            stream=stream,
            infores_catalog=infores_catalog,
        )


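# Illustrative usage sketch (hypothetical file names): a direct, non-YAML call to
# transform() that converts a TSV node/edge pair to JSON while tagging records with
# a fixed aggregator knowledge source value (assuming 'json' is an installed sink format).
def _example_transform_usage() -> None:
    transform(
        inputs=["nodes.tsv", "edges.tsv"],
        input_format="tsv",
        output="graph.json",
        output_format="json",
        stream=True,
        knowledge_sources=[("aggregator_knowledge_source", "infores:my-source")],
    )

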
def merge(
    merge_config: str,
    source: Optional[List] = None,
    destination: Optional[List] = None,
    processes: int = 1,
) -> BaseGraph:
    """
    Load nodes and edges from files and KGs, as defined in a config YAML, and merge them into a single graph.
    The merged graph can then be written to a local/remote Neo4j instance OR be serialized into a file.

    Parameters
    ----------
    merge_config: str
        Merge config YAML
    source: Optional[List]
        A list of sources to load from the YAML
    destination: Optional[List]
        A list of destinations to write to, as defined in the YAML
    processes: int
        Number of processes to use

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The merged graph

    """
    # Use the directory within which the 'merge_config' file
    # exists as a 'current working directory' for
    # resolving relative filename paths in the configuration.
    cwd = dirname(merge_config)

    with open(merge_config, "r") as YML:
        cfg = yaml.load(YML, Loader=yaml.FullLoader)

    output_directory = "output"
    top_level_args = {}
    if "configuration" in cfg:
        top_level_args = prepare_top_level_args(cfg["configuration"])
        if (
            "output_directory" in cfg["configuration"]
            and cfg["configuration"]["output_directory"]
        ):
            output_directory = cfg["configuration"]["output_directory"]
            if not output_directory.startswith(os.path.sep):
                # relative path
                output_directory = f"{os.path.abspath(os.path.dirname(merge_config))}{os.path.sep}{output_directory}"

    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    if not source:
        source = cfg["merged_graph"]["source"].keys()
    if not destination:
        destination = cfg["merged_graph"]["destination"].keys()

    for s in source:
        source_properties = cfg["merged_graph"]["source"][s]
        if source_properties["input"]["format"] in get_input_file_types():
            source_properties["input"]["filename"] = _validate_files(
                cwd=cwd, file_paths=source_properties["input"]["filename"], context=s
            )

    sources_to_parse = {}
    for key in cfg["merged_graph"]["source"]:
        if key in source:
            sources_to_parse[key] = cfg["merged_graph"]["source"][key]

    results = []
    pool = Pool(processes=processes)
    for k, v in sources_to_parse.items():
        log.info(f"Spawning process for '{k}'")
        result = pool.apply_async(
            parse_source,
            (
                k,
                v,
                output_directory,
                top_level_args["prefix_map"],
                top_level_args["node_property_predicates"],
                top_level_args["predicate_mappings"],
                top_level_args["checkpoint"],
            ),
        )
        results.append(result)
    pool.close()
    pool.join()
    stores = [r.get() for r in results]
    merged_graph = merge_all_graphs([x.graph for x in stores])
    log.info(
        f"Merged graph has {merged_graph.number_of_nodes()} nodes and {merged_graph.number_of_edges()} edges"
    )
    if "name" in cfg["merged_graph"]:
        merged_graph.name = cfg["merged_graph"]["name"]
    if "operations" in cfg["merged_graph"]:
        apply_graph_operations(merged_graph, cfg["merged_graph"]["operations"])

    destination_to_write: Dict[str, Dict] = {}
    for d in destination:
        if d in cfg["merged_graph"]["destination"]:
            destination_to_write[d] = cfg["merged_graph"]["destination"][d]
        else:
            raise KeyError(f"Cannot find destination '{d}' in YAML")

    # write the merged graph
    node_properties = set()
    edge_properties = set()
    for s in stores:
        node_properties.update(s.node_properties)
        edge_properties.update(s.edge_properties)

    input_args = {"graph": merged_graph, "format": "graph"}
    if destination_to_write:
        for key, destination_info in destination_to_write.items():
            log.info(f"Writing merged graph to {key}")
            output_args = {
                "format": destination_info["format"],
                "reverse_prefix_map": top_level_args["reverse_prefix_map"],
                "reverse_predicate_mappings": top_level_args[
                    "reverse_predicate_mappings"
                ],
            }
            if "reverse_prefix_map" in destination_info:
                output_args["reverse_prefix_map"].update(
                    destination_info["reverse_prefix_map"]
                )
            if "reverse_predicate_mappings" in destination_info:
                output_args["reverse_predicate_mappings"].update(
                    destination_info["reverse_predicate_mappings"]
                )
            if destination_info["format"] == "neo4j":
                output_args["uri"] = destination_info["uri"]
                output_args["username"] = destination_info["username"]
                output_args["password"] = destination_info["password"]
            elif destination_info["format"] in get_input_file_types():
                filename = destination_info["filename"]
                if isinstance(filename, list):
                    filename = filename[0]
                destination_filename = f"{output_directory}/{filename}"
                output_args["filename"] = destination_filename
                output_args["compression"] = (
                    destination_info["compression"]
                    if "compression" in destination_info
                    else None
                )
                if destination_info["format"] == "nt":
                    output_args["property_types"] = top_level_args["property_types"]
                    if (
                        "property_types" in top_level_args
                        and "property_types" in destination_info.keys()
                    ):
                        output_args["property_types"].update(
                            destination_info["property_types"]
                        )
                if destination_info["format"] in {"csv", "tsv"}:
                    output_args["node_properties"] = node_properties
                    output_args["edge_properties"] = edge_properties
            else:
                raise TypeError(
                    f"type {destination_info['format']} not yet supported for KGX merge operation."
                )
            transformer = Transformer()
            transformer.transform(input_args, output_args)
    else:
        log.warning(
            f"No destination provided in {merge_config}. The merged graph will not be persisted."
        )
    return merged_graph


def parse_source(
    key: str,
    source: dict,
    output_directory: str,
    prefix_map: Dict[str, str] = None,
    node_property_predicates: Set[str] = None,
    predicate_mappings: Dict[str, str] = None,
    checkpoint: bool = False,
) -> Sink:
    """
    Parse a source from a merge config YAML.

    Parameters
    ----------
    key: str
        Source key
    source: Dict
        Source configuration
    output_directory: str
        Location to write output to
    prefix_map: Dict[str, str]
        Non-canonical CURIE mappings
    node_property_predicates: Set[str]
        A set of predicates that ought to be treated as node properties (This is applicable for RDF)
    predicate_mappings: Dict[str, str]
        A mapping of predicate IRIs to property names (This is applicable for RDF)
    checkpoint: bool
        Whether to serialize each individual source to a TSV

    Returns
    -------
    kgx.sink.sink.Sink
        Returns an instance of Sink

    """
    log.info(f"Processing source '{key}'")
    if not key:
        key = os.path.basename(source["input"]["filename"][0])

    input_args = prepare_input_args(
        key,
        source,
        output_directory,
        prefix_map,
        node_property_predicates,
        predicate_mappings,
    )
    transformer = Transformer()
    transformer.transform(input_args)
    transformer.store.graph.name = key
    if checkpoint:
        log.info(f"Writing checkpoint for source '{key}'")
        checkpoint_output = f"{output_directory}/{key}" if output_directory else key
        transformer.save({"filename": checkpoint_output, "format": "tsv"})

    # Current "Callable" metadata is not needed at this point
    # but causes peculiar problems downstream, so we clear it.
    transformer.store.clear_graph_metadata()

    return transformer.store


def transform_source(
    key: str,
    source: Dict,
    output_directory: Optional[str],
    prefix_map: Dict[str, str] = None,
    node_property_predicates: Set[str] = None,
    predicate_mappings: Dict[str, str] = None,
    reverse_prefix_map: Dict = None,
    reverse_predicate_mappings: Dict = None,
    property_types: Dict = None,
    checkpoint: bool = False,
    preserve_graph: bool = True,
    stream: bool = False,
    infores_catalog: Optional[str] = None,
) -> Sink:
    """
    Transform a source from a transform config YAML.

    Parameters
    ----------
    key: str
        Source key
    source: Dict
        Source configuration
    output_directory: Optional[str]
        Location to write output to
    prefix_map: Dict[str, str]
        Non-canonical CURIE mappings
    node_property_predicates: Set[str]
        A set of predicates that ought to be treated as node properties (This is applicable for RDF)
    predicate_mappings: Dict[str, str]
        A mapping of predicate IRIs to property names (This is applicable for RDF)
    reverse_prefix_map: Dict[str, str]
        Non-canonical CURIE mappings for export
    reverse_predicate_mappings: Dict[str, str]
        A mapping of property names to predicate IRIs (This is applicable for RDF)
    property_types: Dict[str, str]
        The xml property type for properties that are other than ``xsd:string``.
        Relevant for RDF export.
    checkpoint: bool
        Whether to serialize each individual source to a TSV
    preserve_graph: bool
        Whether or not to preserve the graph corresponding to the source
    stream: bool
        Whether to parse input as a stream
    infores_catalog: Optional[str]
        Optional dump of a TSV file of InfoRes CURIE to Knowledge Source mappings

    Returns
    -------
    kgx.sink.sink.Sink
        Returns an instance of Sink

    """
    log.info(f"Processing source '{key}'")

    input_args = prepare_input_args(
        key,
        source,
        output_directory,
        prefix_map,
        node_property_predicates,
        predicate_mappings,
    )
    output_args = prepare_output_args(
        key,
        source,
        output_directory,
        reverse_prefix_map,
        reverse_predicate_mappings,
        property_types,
    )
    transformer = Transformer(stream=stream, infores_catalog=infores_catalog)
    transformer.transform(input_args, output_args)
    if not preserve_graph:
        transformer.store.graph.clear()

    if infores_catalog:
        with open(infores_catalog, "w") as irc:
            catalog: Dict[str, str] = transformer.get_infores_catalog()
            for source in catalog.keys():
                infores = catalog.setdefault(source, "unknown")
                print(f"{source}\t{infores}", file=irc)

    return transformer.store


def prepare_input_args(
    key: str,
    source: Dict,
    output_directory: Optional[str],
    prefix_map: Dict[str, str] = None,
    node_property_predicates: Set[str] = None,
    predicate_mappings: Dict[str, str] = None,
) -> Dict:
    """
    Prepare input arguments for the Transformer.

    Parameters
    ----------
    key: str
        Source key
    source: Dict
        Source configuration
    output_directory: str
        Location to write output to
    prefix_map: Dict[str, str]
        Non-canonical CURIE mappings
    node_property_predicates: Set[str]
        A set of predicates that ought to be treated as node properties (This is applicable for RDF)
    predicate_mappings: Dict[str, str]
        A mapping of predicate IRIs to property names (This is applicable for RDF)

    Returns
    -------
    Dict
        Input arguments as a dictionary

    """
    if not key:
        key = os.path.basename(source["input"]["filename"][0])
    input_format = source["input"]["format"]
    input_compression = (
        source["input"]["compression"] if "compression" in source["input"] else None
    )
    inputs = source["input"]["filename"]
    filters = (
        source["input"]["filters"]
        if "filters" in source["input"] and source["input"]["filters"] is not None
        else {}
    )
    node_filters = filters["node_filters"] if "node_filters" in filters else {}
    edge_filters = filters["edge_filters"] if "edge_filters" in filters else {}
    source_prefix_map = prefix_map.copy() if prefix_map else {}
    source_prefix_map.update(
        source["prefix_map"] if "prefix_map" in source and source["prefix_map"] else {}
    )
    source_predicate_mappings = predicate_mappings.copy() if predicate_mappings else {}
    source_predicate_mappings.update(
        source["predicate_mappings"]
        if "predicate_mappings" in source and source["predicate_mappings"] is not None
        else {}
    )
    source_node_property_predicates = (
        node_property_predicates if node_property_predicates else set()
    )
    source_node_property_predicates.update(
        source["node_property_predicates"]
        if "node_property_predicates" in source
        and source["node_property_predicates"] is not None
        else set()
    )

    if input_format in {"nt", "ttl"}:
        input_args = {
            "filename": inputs,
            "format": input_format,
            "compression": input_compression,
            "node_filters": node_filters,
            "edge_filters": edge_filters,
            "prefix_map": source_prefix_map,
            "predicate_mappings": source_predicate_mappings,
            "node_property_predicates": source_node_property_predicates,
        }
    elif input_format in get_input_file_types():
        input_args = {
            "filename": inputs,
            "format": input_format,
            "compression": input_compression,
            "node_filters": node_filters,
            "edge_filters": edge_filters,
            "prefix_map": source_prefix_map,
        }
    elif input_format == "neo4j":
        input_args = {
            "uri": source["uri"],
            "username": source["username"],
            "password": source["password"],
            "format": input_format,
            "node_filters": node_filters,
            "edge_filters": edge_filters,
            "prefix_map": prefix_map,
        }
    else:
        raise TypeError(f"Type {input_format} not yet supported")

    for ksf in knowledge_provenance_properties:
        if ksf in source["input"]:
            input_args[ksf] = source["input"][ksf]

    input_args["operations"] = source["input"].get("operations", [])
    for o in input_args["operations"]:
        args = o["args"]
        if "filename" in args:
            filename = args["filename"]
            if not filename.startswith(output_directory):
                o["args"] = os.path.join(output_directory, filename)
    return input_args


def prepare_output_args(
    key: str,
    source: Dict,
    output_directory: Optional[str],
    reverse_prefix_map: Dict = None,
    reverse_predicate_mappings: Dict = None,
    property_types: Dict = None,
) -> Dict:
    """
    Prepare output arguments for the Transformer.

    Parameters
    ----------
    key: str
        Source key
    source: Dict
        Source configuration
    output_directory: str
        Location to write output to
    reverse_prefix_map: Dict[str, str]
        Non-canonical CURIE mappings for export
    reverse_predicate_mappings: Dict[str, str]
        A mapping of property names to predicate IRIs (This is applicable for RDF)
    property_types: Dict[str, str]
        The xml property type for properties that are other than ``xsd:string``.
        Relevant for RDF export.

    Returns
    -------
    Dict
        Output arguments as a dictionary

    """
    output_format = source["output"]["format"]
    output_compression = (
        source["output"]["compression"] if "compression" in source["output"] else None
    )
    output_filename = (
        source["output"]["filename"] if "filename" in source["output"] else key
    )
    source_reverse_prefix_map = reverse_prefix_map.copy() if reverse_prefix_map else {}
    source_reverse_prefix_map.update(
        source["reverse_prefix_map"]
        if "reverse_prefix_map" in source and source["reverse_prefix_map"]
        else {}
    )
    source_reverse_predicate_mappings = (
        reverse_predicate_mappings.copy() if reverse_predicate_mappings else {}
    )
    source_reverse_predicate_mappings.update(
        source["reverse_predicate_mappings"]
        if "reverse_predicate_mappings" in source
        and source["reverse_predicate_mappings"] is not None
        else {}
    )
    source_property_types = property_types.copy() if property_types else {}
    source_property_types.update(
        source["property_types"]
        if "property_types" in source and source["property_types"] is not None
        else {}
    )

    if isinstance(output_filename, list):
        output = output_filename[0]
    else:
        output = output_filename
    if output_directory and not output.startswith(output_directory):
        output = os.path.join(output_directory, output)
    output_args = {"format": output_format}
    if output_format == "neo4j":
        output_args["uri"] = source["output"]["uri"]
        output_args["username"] = source["output"]["username"]
        output_args["password"] = source["output"]["password"]
    elif output_format in get_input_file_types():
        output_args["filename"] = output
        output_args["compression"] = output_compression
        if output_format == "nt":
            output_args["reify_all_edges"] = (
                source["output"]["reify_all_edges"]
                if "reify_all_edges" in source["output"]
                else False
            )
            output_args["reverse_prefix_map"] = source_reverse_prefix_map
            output_args[
                "reverse_predicate_mappings"
            ] = source_reverse_predicate_mappings
            output_args["property_types"] = source_property_types
    else:
        raise ValueError(f"type {output_format} not yet supported for output")
    return output_args


def apply_operations(source: dict, graph: BaseGraph) -> BaseGraph:
    """
    Apply operations as defined in the YAML.

    Parameters
    ----------
    source: dict
        The source from the YAML
    graph: kgx.graph.base_graph.BaseGraph
        The graph corresponding to the source

    Returns
    -------
    kgx.graph.base_graph.BaseGraph
        The graph corresponding to the source

    """
    operations = source["operations"]
    for operation in operations:
        op_name = operation["name"]
        op_args = operation["args"]
        module_name = ".".join(op_name.split(".")[0:-1])
        function_name = op_name.split(".")[-1]
        f = getattr(importlib.import_module(module_name), function_name)
        log.info(f"Applying operation {op_name} with args: {op_args}")
        f(graph, **op_args)
    return graph


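# Illustrative sketch: apply_operations() resolves each operation 'name' to a module-level
# callable via importlib and invokes it as f(graph, **args). The operation name and its
# args below are hypothetical examples of that pattern, not a prescribed configuration.
def _example_apply_operations(graph: BaseGraph) -> BaseGraph:
    source = {
        "operations": [
            {
                "name": "kgx.graph_operations.summarize_graph.generate_graph_stats",
                "args": {"graph_name": "my-graph", "filename": "graph_stats.yaml"},
            }
        ]
    }
    return apply_operations(source, graph)

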
def prepare_top_level_args(d: Dict) -> Dict:
    """
    Parse top-level configuration.

    Parameters
    ----------
    d: Dict
        The configuration section from the transform/merge YAML

    Returns
    -------
    Dict
        A parsed dictionary with parameters from configuration

    """
    args = {}
    if "checkpoint" in d and d["checkpoint"] is not None:
        args["checkpoint"] = d["checkpoint"]
    else:
        args["checkpoint"] = False
    if "node_property_predicates" in d and d["node_property_predicates"]:
        args["node_property_predicates"] = set(d["node_property_predicates"])
    else:
        args["node_property_predicates"] = set()
    if "predicate_mappings" in d and d["predicate_mappings"]:
        args["predicate_mappings"] = d["predicate_mappings"]
    else:
        args["predicate_mappings"] = {}
    if "prefix_map" in d and d["prefix_map"]:
        args["prefix_map"] = d["prefix_map"]
    else:
        args["prefix_map"] = {}
    if "reverse_prefix_map" in d and d["reverse_prefix_map"] is not None:
        args["reverse_prefix_map"] = d["reverse_prefix_map"]
    else:
        args["reverse_prefix_map"] = {}
    if (
        "reverse_predicate_mappings" in d
        and d["reverse_predicate_mappings"] is not None
    ):
        args["reverse_predicate_mappings"] = d["reverse_predicate_mappings"]
    else:
        args["reverse_predicate_mappings"] = {}
    if "property_types" in d and d["property_types"]:
        args["property_types"] = d["property_types"]
    else:
        args["property_types"] = {}
    return args


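# Illustrative sketch: prepare_top_level_args() fills in defaults for any key missing
# from the YAML 'configuration' section, so downstream code can index the returned
# dictionary without further key checks. The prefix map below is a hypothetical example.
def _example_top_level_defaults() -> Dict:
    args = prepare_top_level_args(
        {"checkpoint": True, "prefix_map": {"OMIM": "https://omim.org/entry/"}}
    )
    assert args["checkpoint"] is True
    assert args["node_property_predicates"] == set()
    assert args["reverse_prefix_map"] == {}
    return args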