KG-Microbe: converting the TSV files into a single RDF (Turtle) file

import csv
import json
import re
import urllib.parse

Relative paths to the input and output files

# Knowledge-graph inputs: tab-separated node and edge tables.
nodesPath = "input/merged-kg_nodes.tsv"
edgesPath = "input/merged-kg_edges.tsv"
# JSON registry that is read when the prefix lookup is built.
bioregistryPath = "input/registry.json"
# Destination of the generated Turtle document.
outputPath = "output/kg-microbe.ttl"

All used prefixes are defined here

# Namespace table: maps every prefix label that is written to the "@prefix"
# header of the output to its namespace IRI.
# NOTE(review): every namespace IRI is an empty string here - presumably the
# values were lost from this copy; fill them in before running the conversion.
prefixes = {
    "biolink": "",
    "rdfs": "",
    "rdf": "",
    "owl": "",
    "dc": "",
    "obo": "",
    "oio": "",
    "wd": "",
    "bioregistry": "",
    "medi": "",
    "meds": "",
    "medm": "",
}

Load prefixes from registry.json, which should be downloaded from Bioregistry beforehand

# Build a lookup from registry prefix to its display name and URI template.
# The context manager closes the registry file even if parsing fails, and the
# encoding is explicit because JSON documents are UTF-8.
with open(bioregistryPath, encoding="utf-8") as f:
    data = json.load(f)
bioregistry_prefixes = {}
for entity in data.values():
    bioregistry_prefixes[entity["prefix"]] = {
        "name": entity["name"],
        "uri_format": entity["uri_format"],
    }
# The parsed registry is not needed any more once the lookup is built.
del data

Here we start to write prefixes into the output file

# Open the Turtle output. The stream is left open on purpose because the
# triple helpers write to it later on.
outputStream = open(outputPath, "w")

# Emit one "@prefix" declaration per entry of the namespace table.
outputStream.writelines(
    f"@prefix {label}: <{namespace}> .\n" for label, namespace in prefixes.items()
)


Helpers for saving triples and for URI extraction. Each URI is resolved from the iri column first and, if the iri is unknown, from the id column. All unresolved URIs are replaced with the following URN format: <urn:unknown:id>.

def add_triple(s: str, p: str, o: str):
    """Write one Turtle statement ``s p o .`` as a line of the output stream.

    The three terms are written verbatim, so callers pass them already
    serialized (prefixed name, ``<iri>`` or quoted literal).
    """
    statement = "{} {} {} .\n".format(s, p, o)
    outputStream.write(statement)

def add_label(s: str, label: str):
    """Attach ``label`` to subject ``s`` as an ``rdfs:label`` string literal."""
    # json.dumps supplies the surrounding double quotes and escapes the text.
    literal = json.dumps(label)
    add_triple(s, "rdfs:label", literal)

def add_type(s: str, t: str):
    """State via ``rdf:type`` that subject ``s`` has the type ``t``.

    ``t`` is written as given, without quoting or escaping.
    """
    predicate = "rdf:type"
    add_triple(s, predicate, t)

def add_synonym(s: str, syn: str):
    """Attach ``syn`` to subject ``s`` as a ``biolink:synonym`` string literal."""
    # json.dumps supplies the surrounding double quotes and escapes the text.
    literal = json.dumps(syn)
    add_triple(s, "biolink:synonym", literal)
def add_reference(s: str, ref: str):
    """Attach ``ref`` to subject ``s`` as a ``dc:identifier`` string literal."""
    # json.dumps supplies the surrounding double quotes and escapes the text.
    literal = json.dumps(ref)
    add_triple(s, "dc:identifier", literal)

def extract_uri(id: str) -> str:
    """Resolve a ``prefix:local`` identifier to a URI via the prefix registry.

    Args:
        id: Identifier to resolve. Only the first ``:`` separates the prefix
            from the local part; the prefix is looked up in lower case.

    Returns:
        * ``urn:unknown:<percent-encoded id>`` when ``id`` has no ``:`` or its
          prefix is not in ``bioregistry_prefixes``;
        * ``<lower-cased prefix>:<local part>`` when the registry entry has no
          usable ``uri_format`` (``None`` or lacking the ``$1`` placeholder);
        * otherwise the entry's ``uri_format`` with ``$1`` replaced by the
          local part.
    """
    prefix, separator, local_id = id.partition(":")
    prefix = prefix.lower()
    entry = bioregistry_prefixes.get(prefix) if separator else None
    if entry is None:
        # Encode every reserved character so the id stays a single URN segment.
        encoded_id = urllib.parse.quote(id, safe="")
        return f"urn:unknown:{encoded_id}"
    uri_format = entry["uri_format"]
    if uri_format is None or "$1" not in uri_format:
        return f"{prefix}:{local_id}"
    return uri_format.replace("$1", local_id)

First, we add user-defined triples.

# Human-readable labels for the two annotation predicates used for nodes.
for predicate, caption in (
    ("biolink:synonym", "\"Synonym\""),
    ("dc:identifier", "\"Reference\""),
):
    add_triple(predicate, "rdfs:label", caption)

Nodes extraction into the output file

It includes:

  • labels rdfs:label
  • types rdf:type as biolink:<type>
  • synonyms biolink:synonym
  • reference dc:identifier with the entity uri.
# Characters that receive a backslash escape in the local part of a prefixed
# name. Compiled once because the pattern is applied inside the row loop.
_LOCAL_NAME_ESCAPE = re.compile(r"""([~.!$&'"()*+,;=/?#@%])""")

i = 0  # number of node rows processed so far
id_to_uri = {}  # node id -> serialized subject term, reused for the edges
# The context manager closes the nodes file once every row has been handled.
with open(nodesPath, newline="") as f_in:
    rowsIt = csv.reader(f_in, delimiter="\t")
    # Map each column name of the header row to its column index.
    header = {name: idx for idx, name in enumerate(next(rowsIt))}
    for row in rowsIt:
        i += 1
        if i % 50000 == 0:
            print(f"processed lines: {i}")
        # Prefer the first IRI of the "iri" column; fall back to the node id.
        uri = row[header["iri"]].split("|")[0].strip()
        node_id = row[header["id"]].strip()
        if not uri:
            uri = extract_uri(node_id)
        if not uri:
            continue
        # Shorten the IRI to a prefixed name when it lies in a known namespace.
        puri = None
        for p, ns in prefixes.items():
            # An empty namespace would match every IRI, so it is ignored.
            if ns and uri.startswith(ns):
                # Slice the namespace off: str.lstrip() strips a *set of
                # characters* and could also eat the start of the local name.
                local_name = _LOCAL_NAME_ESCAPE.sub(r"\\\1", uri[len(ns):])
                puri = f"{p}:{local_name}"
        s = puri if puri else f"<{uri}>"
        id_to_uri[node_id] = s
        n = str(row[header["name"]]).strip()
        if n:
            add_label(s, n)
        # Placeholder URNs are not real references, so they are not recorded.
        if not uri.startswith("urn:unknown:"):
            add_reference(s, uri)
        for syn in row[header["synonym"]].split("|"):
            syn = syn.strip()
            # Skip empty entries and synonyms that merely repeat the label.
            if syn and syn != n:
                add_synonym(s, syn)
        for t in str(row[header["category"]]).split("|"):
            t = t.strip()
            if t:
                add_type(s, t)
print(f"(Done) processed lines: {i}")

Edges extraction into the output file

Triples whose subject or object URI is unknown, or whose predicate has an unknown prefix, are skipped.

i = 0  # number of edge rows processed so far
unknown_predicates = set()  # predicates skipped because their prefix is unknown
used_predicates = set()  # predicates of the triples that were written
# The context manager closes the edges file once every row has been handled.
with open(edgesPath, newline="") as f_in:
    rowsIt = csv.reader(f_in, delimiter="\t")
    # Map each column name of the header row to its column index.
    header = {name: idx for idx, name in enumerate(next(rowsIt))}
    for row in rowsIt:
        i += 1
        if i % 50000 == 0:
            print(f"processed lines: {i}")
        # Only nodes recorded in id_to_uri can take part in a triple.
        s = id_to_uri.get(row[header["subject"]].strip())
        o = id_to_uri.get(row[header["object"]].strip())
        p = row[header["predicate"]].strip()
        p_prefix, separator, _ = p.partition(":")
        if not separator or p_prefix not in prefixes:
            # Remember the predicate so that it shows up in the final report.
            unknown_predicates.add(p)
            continue
        if s and o:
            add_triple(s, p, o)
            # Track the predicate so that it can be labelled afterwards.
            used_predicates.add(p)
print(f"(Done) processed lines: {i}")
if unknown_predicates:
    print(f"Unknown predicates: {unknown_predicates}")

Add labels of all used predicates into the output file.

# Label every predicate that was used, turning its local name into words:
# both "x:subclass_of" and "x:subclassOf" are labelled "subclass of".
# Sorting keeps the order of the emitted labels stable between runs.
for p in sorted(used_predicates):
    local_name = p.split(":", 1)[1]
    # Put a separator in front of each capital letter, then turn every
    # separator into a space.
    words = re.sub("([A-Z])", r"_\1", local_name).replace("_", " ")
    add_label(p, words.lower().strip())

# Nothing is written after the labels; closing flushes the buffered triples
# to disk and releases the file handle.
outputStream.close()

Finish writing and close the output file.




