Module tf.search.semantics

Semantics of search templates

Expand source code Browse git
"""
# Semantics of search templates
"""

import types
import re

from .relations import add_K_Relations, add_F_Relations, add_V_Relations
from .syntax import reTp, kRe, deContext

# SEMANTIC ANALYSIS OF SEARCH TEMPLATE ###


def semantics(searchExe):
    """Run the semantic analysis of a search template and report problems.

    Does nothing when `searchExe.good` is already false.
    Otherwise it resets `searchExe.badSemantics`, runs the grammar stage and,
    if that leaves `searchExe.good` true, the validation stage.
    When either stage leaves `searchExe.good` false, the outer template,
    the numbered search lines and all collected semantic errors are sent
    to the error function of the TF API.

    Parameters
    ----------
    searchExe: object
        The search execution object. This function reads its attributes
        `good`, `api`, `_msgCache`, `offset`, `searchLines` and
        `badSemantics`, and calls its method `showOuterTemplate`.
    """
    if not searchExe.good:
        return

    reportError = searchExe.api.TF.error
    msgCache = searchExe._msgCache
    searchExe.badSemantics = []
    lineOffset = searchExe.offset

    def showProblems():
        # echo the template with line numbers, followed by the error list
        searchExe.showOuterTemplate(msgCache)
        for (lineNo, text) in enumerate(searchExe.searchLines):
            reportError(f"{lineNo + lineOffset:>2} {text}", tm=False, cache=msgCache)
        for (lineNo, problem) in searchExe.badSemantics:
            if lineNo is None:
                # errors without a line number are reported verbatim
                message = problem
            else:
                message = f"line {lineNo + lineOffset}: {problem}"
            reportError(message, tm=False, cache=msgCache)

    _grammar(searchExe)
    if searchExe.good:
        # validation is only meaningful on a grammatically sound template
        _validation(searchExe)
    if not searchExe.good:
        showProblems()


def _grammar(searchExe):
    """Turn the tokens of a search template into query nodes and raw edges.

    Walks over `searchExe.tokens` and builds:

    *   `qnames`: mapping from atom name to qnode index
        (unnamed atoms get the name `:{line}`);
    *   `qnodes`: list of tuples `(otype, features, src, quantifiers)`;
    *   `qedges`: list of tuples `(from qnode, operator, to qnode)`;
    *   `nodeLine`, `edgeLine`: the template line on which each
        qnode / qedge was introduced.

    Problems are appended to `searchExe.badSemantics` as
    `(line, message)` tuples.

    If no problem was found, the results are stored on `searchExe` as
    `qnames`, `qnodes`, `qedgesRaw`, `nodeLine` and `edgeLine`;
    otherwise `searchExe.good` is set to `False` and nothing is stored.

    Parameters
    ----------
    searchExe: object
        The search execution object; its `tokens` and `badSemantics`
        attributes are read, the attributes listed above are written.
    """
    prevKind = None
    good = True
    qnames = {}
    qnodes = []
    qedges = []
    edgeLine = {}
    nodeLine = {}
    nTokens = len(searchExe.tokens)

    def tokenSort(t):
        # relation tokens get their line number shifted by the token count,
        # the other tokens sort by their plain line number
        # NOTE(review): this puts relations last, presumably so that all
        # names are known by then; it relies on line numbers of the other
        # tokens being smaller than that shifted value - confirm
        return (nTokens + t["ln"]) if t["kind"] == "rel" else t["ln"]

    tokens = sorted(searchExe.tokens, key=tokenSort)

    # atomStack is a stack of qnodes with their indent levels
    # such that every next member is one level deeper
    # and every member is the last qnode encountered at that level
    # The stack is implemented as a dict,
    # keyed by the indent, and valued by the qnode
    atomStack = {}

    for token in tokens:
        i = token["ln"]
        kind = token["kind"]
        if kind == "atom":
            if "quantifiers" in token:
                token["quantifiers"] = [
                    deContext(q, token["name"]) for q in token["quantifiers"]
                ]
            indent = token["indent"]
            op = token["op"]
            # an atom token without a "name" key is a lonely operator line:
            # it adds no qnode, only (possibly) an edge
            if "name" in token:
                name = token["name"]
                otype = token["otype"]
                features = token["features"]
                src = token.get("src", "")
                quantifiers = token.get("quantifiers", [])
                qnodes.append((otype, features, src, quantifiers))
                q = len(qnodes) - 1
                nodeLine[q] = i
                # unnamed atoms get a name derived from their line number
                name = f":{i}" if name == "" else name
                qnames[name] = q
            if len(atomStack) == 0:
                # first atom: must be at indent 0 and without a relation
                if indent > 0:
                    searchExe.badSemantics.append(
                        (i, f"Unexpected indent: {indent}, expected 0")
                    )
                    good = False
                if op is not None:
                    searchExe.badSemantics.append(
                        (i, "Lonely relation: not allowed at outermost level")
                    )
                    good = False
                if "name" in token:
                    atomStack[0] = q
            else:
                # stack entries as (indent, qnode), shallowest first
                atomNest = sorted(atomStack.items(), key=lambda x: x[0])
                top = atomNest[-1]
                if indent == top[0]:
                    # sibling of previous atom
                    if len(atomNest) > 1:
                        if "name" in token:
                            # take the qnode of the subtop of the
                            # atomStack, if there is one
                            qedges.append((q, "]]", atomNest[-2][1]))
                            edgeLine[len(qedges) - 1] = i
                            if op is not None:
                                qedges.append((top[1], op, q))
                                edgeLine[len(qedges) - 1] = i
                        else:
                            # lonely operator:
                            # left is previous atom, right is parent atom
                            qedges.append((top[1], op, atomNest[-2][1]))
                            edgeLine[len(qedges) - 1] = i
                    else:
                        if op is not None:
                            qedges.append((top[1], op, q))
                            edgeLine[len(qedges) - 1] = i
                elif indent > top[0]:
                    if "name" in token:
                        # child of previous atom
                        qedges.append((q, "]]", top[1]))
                        edgeLine[len(qedges) - 1] = i
                        if op is not None:
                            qedges.append((top[1], op, q))
                            edgeLine[len(qedges) - 1] = i
                    else:
                        searchExe.badSemantics.append(
                            (i, "Lonely relation: not allowed as first child")
                        )
                        good = False
                else:
                    # outdent action:
                    # look up the proper parent in the stack
                    if indent in atomStack:
                        parents = [at[1] for at in atomNest if at[0] < indent]
                        if "name" in token:
                            if op is not None:
                                qedges.append((atomStack[indent], op, q))
                                edgeLine[len(qedges) - 1] = i
                        if len(parents) != 0:  # if not already at outermost level
                            if "name" in token:
                                qedges.append((q, "]]", parents[-1]))
                                edgeLine[len(qedges) - 1] = i
                            else:
                                # connect previous sibling to parent
                                qedges.append((atomStack[indent], op, parents[-1]))
                                edgeLine[len(qedges) - 1] = i
                        # forget all levels deeper than the current one
                        removeKeys = [at[0] for at in atomNest if at[0] > indent]
                        for rk in removeKeys:
                            del atomStack[rk]
                    else:
                        # parent cannot be found: indentation error
                        searchExe.badSemantics.append(
                            (
                                i,
                                "Unexpected indent: {}, expected one of {}".format(
                                    indent,
                                    ", ".join(
                                        str(at[0]) for at in atomNest if at[0] < indent
                                    ),
                                ),
                            )
                        )
                        good = False
                # NOTE(review): this also runs for tokens without a name
                # and after the error branches above; q is then still the
                # qnode of the most recently added atom - confirm intended
                atomStack[indent] = q
        elif kind == "feat":
            # a continuation line with extra features for the last qnode
            features = token["features"]
            if prevKind is not None and prevKind not in {"atom", "feat"}:
                searchExe.badSemantics.append(
                    (i, f'Features after {prevKind}: "{features}"')
                )
                good = False
            else:
                if len(qnodes):
                    qnodes[-1][1].update(features)
        elif kind == "rel":
            # an explicit relation between two named atoms
            fName = token["f"]
            tName = token["t"]
            op = token["op"]
            f = qnames.get(fName, None)
            t = qnames.get(tName, None)
            namesGood = True
            for (q, n) in ((f, fName), (t, tName)):
                if q is None:
                    searchExe.badSemantics.append(
                        (i, f'Relation with undefined name: "{n}"')
                    )
                    namesGood = False
            if not namesGood:
                good = False
            else:
                qedges.append((f, op, t))
                edgeLine[len(qedges) - 1] = i
        prevKind = kind

    # resolve names when used in atoms
    # i.e. when the otype of a qnode is the name of another qnode,
    # take over the otype of that qnode and tie both with an "=" edge
    # NOTE(review): no edgeLine entry is recorded for these "=" edges
    for (q, qdata) in enumerate(qnodes):
        otype = qdata[0]
        referQ = qnames.get(otype, None)
        if referQ is not None:
            referOtype = qnodes[referQ][0]
            qnodes[q] = (referOtype, *qdata[1:])
            qedges.append((q, "=", referQ))

    if good:
        searchExe.qnames = qnames
        searchExe.qnodes = qnodes
        searchExe.qedgesRaw = qedges
        searchExe.nodeLine = nodeLine
        searchExe.edgeLine = edgeLine
    else:
        searchExe.good = False


def _validateFeature(
    searchExe,
    q,
    fName,
    features,
    missingFeatures,
    wrongValues,
    hasValues=None,
    asEdge=False,
):
    """Check one feature condition of a qnode or qedge against the dataset.

    Problems are not raised but recorded in the collector dicts that the
    caller passes in. For integer valued features with a value set,
    the values are cast to `int` and written back into `features`.

    Parameters
    ----------
    searchExe: object
        The search execution object; only `searchExe.api.TF` is used
        (its `featureSets` and `features`).
    q: int
        Index of the qnode (or of the qedge when `asEdge`) that carries
        the condition; it is the value stored in the collectors.
    fName: string
        Name of the feature to validate.
    features: dict
        Feature conditions keyed by feature name. The entry for `fName`
        may be `True`, `None`, a function, a `reTp` instance, or otherwise
        (for `int` features) a pair `(ident, values)`. In the last case
        the entry is replaced by `(ident, frozenset(cast values))`
        unless `values` is a boolean.
    missingFeatures: dict
        Collector `{fName: [q, ...]}` for features that do not occur
        in the relevant feature set.
    wrongValues: dict
        Collector `{fName: {value: [q, ...]}}` for values that do not fit
        the data type of the feature.
    hasValues: dict, optional
        Collector `{fName: {value: [q, ...]}}` for edge features that got a
        value condition although the feature has no edge values.
        When omitted, a fresh dict is used for this call only, so that
        nothing is shared between calls.
    asEdge: boolean, optional `False`
        Whether `fName` must be looked up among the edge features
        instead of the node features.
    """
    if hasValues is None:
        # never use a shared mutable default: it would accumulate entries
        # across unrelated calls
        hasValues = {}

    values = features[fName]
    fSet = "edges" if asEdge else "nodes"
    if fName not in searchExe.api.TF.featureSets[fSet]:
        missingFeatures.setdefault(fName, []).append(q)
        return

    featureObj = searchExe.api.TF.features[fName]
    if asEdge and not featureObj.edgeValues and values is not True:
        # a value condition on an edge feature that carries no values
        hasValues.setdefault(fName, {}).setdefault(values, []).append(q)
        return

    requiredType = featureObj.dataType
    if values is True or values is None:
        # conditions without concrete values: nothing to type check
        return

    if isinstance(values, types.FunctionType):
        # function conditions are recorded as wrong on string features
        if requiredType == "str":
            wrongValues.setdefault(fName, {}).setdefault(values, []).append(q)
    elif isinstance(values, reTp):
        # reTp conditions are recorded as wrong on integer features
        if requiredType == "int":
            wrongValues.setdefault(fName, {}).setdefault(values, []).append(q)
    elif requiredType == "int":
        (ident, values) = values
        if type(values) is not bool:
            valuesCast = set()
            for val in values:
                try:
                    valCast = int(val)
                except (TypeError, ValueError):
                    # keep the offending value and report it
                    valCast = val
                    wrongValues.setdefault(fName, {}).setdefault(val, []).append(q)
                valuesCast.add(valCast)
            features[fName] = (ident, frozenset(valuesCast))


def _validation(searchExe):
    """Validate the parsed search template against the loaded dataset.

    Checks, in this order:

    *   the object types of the qnodes (known node type, custom set or `.`);
    *   the node features used in qnodes and the types of their values;
    *   the relational operators of the raw qedges, after registering the
        parametrized relations (k-nearness, feature comparison,
        edge features with value specs);
    *   the edge features used in qedges and their value specs.

    All problems are appended to `searchExe.badSemantics` as
    `(line or None, message)` tuples. Problems that belong to qnodes are
    reported with the template lines from `searchExe.nodeLine`, problems
    that belong to qedges with the lines from `searchExe.edgeLine`.

    `searchExe.qedges` is always set, to the raw edges with their operator
    replaced by the relation found in `searchExe.relationFromName`
    (`None` for unknown relations).
    If everything is fine, the features used are loaded through
    `searchExe.api.ensureLoaded`, otherwise `searchExe.good` is set
    to `False`.

    Parameters
    ----------
    searchExe: object
        The search execution object, after a successful grammar stage.
    """
    levels = searchExe.api.C.levels.data
    otypes = {x[0] for x in levels}
    qnodes = searchExe.qnodes
    nodeLine = searchExe.nodeLine
    edgeMap = searchExe.edgeMap
    nodeMap = searchExe.nodeMap

    edgeLine = searchExe.edgeLine
    relationFromName = searchExe.relationFromName

    offset = searchExe.offset

    def lineRefs(nodeQs=(), edgeEs=()):
        # Render the template lines of the given qnodes and qedges.
        # qnode indices are resolved via nodeLine, qedge indices via edgeLine
        lines = [nodeLine[q] + offset for q in nodeQs]
        lines.extend(edgeLine[e] + offset for e in edgeEs)
        return ", ".join(str(ln) for ln in lines)

    # check the object types of atoms

    good = True
    otypesGood = True
    sets = searchExe.sets
    for (q, qdata) in enumerate(qnodes):
        otype = qdata[0]
        if otype == ".":
            continue
        if sets is not None and otype in sets:
            continue
        if otype not in otypes:
            searchExe.badSemantics.append(
                (nodeLine[q], f'Unknown object type: "{otype}"')
            )
            otypesGood = False
    if not otypesGood:
        searchExe.badSemantics.append(
            (
                None,
                "Valid object types are: {}".format(", ".join(x[0] for x in levels)),
            )
        )
        if sets is not None:
            searchExe.badSemantics.append(
                (
                    None,
                    "Or choose a custom set from: {}".format(
                        ", ".join(x for x in sorted(sets)),
                    ),
                )
            )
        good = False

    # check the feature names of feature specs
    # and check the types of their values

    # problems keyed by qnode index
    missingFeatures = {}
    wrongValues = {}
    # problems keyed by qedge index; they are kept apart from the node
    # problems because their lines must be looked up in edgeLine
    missingEdgeFeatures = {}
    wrongEdgeValues = {}
    wrongTypes = {}
    hasValues = {}

    for (q, qdata) in enumerate(qnodes):
        features = qdata[1]
        for fName in sorted(features):
            _validateFeature(
                searchExe, q, fName, features, missingFeatures, wrongValues
            )

    # check the relational operator token in edges
    # and replace them by an index
    # in the relations list of known relations
    qedges = []
    edgesGood = True

    # relations may have a variable number k in them (k-nearness, etc.)
    # make an entry in the relation map for each value of k
    addRels = {}
    for (f, op, t) in searchExe.qedgesRaw:
        if (
            type(op) is tuple
            or (op[0] == "-" and op[-1] == ">")
            or (op[0] == "<" and op[-1] == "-")
            or (op[0] == "<" and op[-1] == ">")
            or (op[0] == "." and op[-1] == ".")
        ):
            continue
        match = kRe.findall(op)
        if len(match):
            (pre, k, post) = match[0]
            opNameK = f"{pre}k{post}"
            addRels.setdefault(opNameK, set()).add(int(k))
    if not missingFeatures and not wrongValues:
        add_K_Relations(searchExe, addRels)

    # relations may have one or two node features f,g in them (feature-comparison)
    # make an entry in the relation map for each value of (f, g)
    fPatOne = r"^\.([^=#<>]+)\.$"
    fPatBoth = r"^\.([^=#<>]+)([=#<>])(.*)\.$"
    fPatMatch = r"^\.([^~]+)~(.*?)~([^~]+)\.$"
    fOneRe = re.compile(fPatOne)
    fBothRe = re.compile(fPatBoth)
    fMatchRe = re.compile(fPatMatch)

    # NOTE(review): the dataType lookups below index TF.features directly
    # and so assume that the features named in the operator exist - confirm
    addRels = {}
    for (e, (f, op, t)) in enumerate(searchExe.qedgesRaw):
        if type(op) is tuple:
            continue
        match = fMatchRe.findall(op)
        if len(match):
            # regular expression comparison: both features must be strings
            (fF, r, gF) = match[0]
            opNameFG = ".f~r~g."
            addRels.setdefault(opNameFG, set()).add(((f, fF), r, (t, gF)))
            for fName in (fF, gF):
                fType = searchExe.api.TF.features[fName].dataType
                if fType != "str":
                    wrongTypes.setdefault(fName, {}).setdefault(fType, set()).add(e)
        else:
            match = fBothRe.findall(op)
            if len(match):
                (fF, r, gF) = match[0]
                opNameFG = f".f{r}g."
                addRels.setdefault(opNameFG, set()).add(((f, fF), (t, gF)))
                if r in {"<", ">"}:
                    # order comparison: both features must be integers
                    for fName in (fF, gF):
                        fType = searchExe.api.TF.features[fName].dataType
                        if fType != "int":
                            wrongTypes.setdefault(fName, {}).setdefault(
                                fType, set()
                            ).add(e)
            else:
                match = fOneRe.findall(op)
                if len(match):
                    opNameF = ".f."
                    fF = match[0]
                    addRels.setdefault(opNameF, set()).add(((f, fF), (t, fF)))
    if not missingFeatures and not wrongValues:
        add_F_Relations(searchExe, addRels)

    # edge relations may have a value spec in them
    # make an entry in the relation map for each value spec
    addRels = {}
    for (e, (f, op, t)) in enumerate(searchExe.qedgesRaw):
        if type(op) is not tuple:
            continue
        (opName, opFeatures) = op
        for eName in sorted(opFeatures):
            _validateFeature(
                searchExe,
                e,
                eName,
                opFeatures,
                missingEdgeFeatures,
                wrongEdgeValues,
                hasValues,
                asEdge=True,
            )
            addRels.setdefault(opName, set()).add((eName, opFeatures[eName]))
    if not (
        missingFeatures or wrongValues or missingEdgeFeatures or wrongEdgeValues
    ):
        add_V_Relations(searchExe, addRels)

    # now look up each particular relation in the relation map
    for (e, (f, op, t)) in enumerate(searchExe.qedgesRaw):
        theOp = op[0] if type(op) is tuple else op
        rela = relationFromName.get(theOp, None)
        if rela is None:
            searchExe.badSemantics.append((edgeLine[e], f'Unknown relation: "{theOp}"'))
            edgesGood = False
        qedges.append((f, rela, t))
    if not edgesGood:
        searchExe.badSemantics.append(
            (None, f"Allowed relations:\n{searchExe.relationLegend}")
        )
        good = False

    # report error found above
    # values are sorted by their string form, because they are not
    # necessarily orderable among themselves (functions, patterns)
    missingNames = sorted(set(missingFeatures) | set(missingEdgeFeatures))
    for fName in missingNames:
        searchExe.badSemantics.append(
            (
                None,
                'Missing feature "{}" in line(s) {}'.format(
                    fName,
                    lineRefs(
                        missingFeatures.get(fName, ()),
                        missingEdgeFeatures.get(fName, ()),
                    ),
                ),
            )
        )
    if missingNames:
        good = False

    if len(hasValues):
        for (fName, wrongs) in sorted(hasValues.items()):
            searchExe.badSemantics.append(
                (None, f'Feature "{fName}" cannot have values:')
            )
            for val in sorted(wrongs, key=str):
                searchExe.badSemantics.append(
                    (
                        None,
                        '    "{}" superfluous: line(s) {}'.format(
                            val, lineRefs(edgeEs=wrongs[val]),
                        ),
                    )
                )
        good = False

    wrongNames = sorted(set(wrongValues) | set(wrongEdgeValues))
    for fName in wrongNames:
        searchExe.badSemantics.append((None, f'Feature "{fName}" has wrong values:'))
        nodeWrongs = wrongValues.get(fName, {})
        edgeWrongs = wrongEdgeValues.get(fName, {})
        for val in sorted(set(nodeWrongs) | set(edgeWrongs), key=str):
            searchExe.badSemantics.append(
                (
                    None,
                    '    "{}" is not a number: line(s) {}'.format(
                        val,
                        lineRefs(nodeWrongs.get(val, ()), edgeWrongs.get(val, ())),
                    ),
                )
            )
    if wrongNames:
        good = False

    if len(wrongTypes):
        for (fName, wrongs) in sorted(wrongTypes.items()):
            searchExe.badSemantics.append((None, f'Feature "{fName}" has wrong type:'))
            for (val, es) in sorted(wrongs.items()):
                searchExe.badSemantics.append(
                    (
                        None,
                        '    "{}" is the wrong type: line(s) {}'.format(
                            val, lineRefs(edgeEs=sorted(es)),
                        ),
                    )
                )
        good = False

    searchExe.qedges = qedges

    # determine which node and edge features are not yet loaded,
    # and load them
    eFeatsUsed = set()
    for (f, rela, t) in qedges:
        efName = edgeMap.get(rela, (None,))[0]
        if efName is not None:
            eFeatsUsed.add(efName)
    nFeatsUsed = set()
    for (n, qdata) in enumerate(qnodes):
        features = qdata[1]
        for nfName in features:
            nFeatsUsed.add(nfName)
        if n in nodeMap:
            nFeatsUsed |= nodeMap[n]

    if good:
        searchExe.api.ensureLoaded(eFeatsUsed | nFeatsUsed)
    else:
        searchExe.good = False

Functions

def semantics(searchExe)