Module tf.convert.helpers
import re
from textwrap import dedent
from ..core.helpers import console
from ..core.generic import AttrDict
PRE = "pre"
ZWSP = "\u200b" # zero-width space
NODE = "node"
FOLDER = "folder"
FILE = "file"
PAGE = "page"
LINE = "line"
LN = "ln"
REGION = "region"
DOC = "doc"
CHAPTER = "chapter"
CHUNK = "chunk"
XNEST = "xnest"
TNEST = "tnest"
TSIB = "tsiblings"
SLOT = "slot"
WORD = "word"
CHAR = "char"
TOKEN = "token"
T = "t"
LINE_MODELS = dict(
I=dict(),
II=dict(
element=(str, "p"),
nodeType=(str, LN),
),
)
LINE_MODEL_DEFAULT = "I"
PAGE_MODELS = dict(
I=dict(),
II=dict(
keepPb=(bool, False),
element=(str, "div"),
attributes=(dict, {}),
pbAtTop=(bool, True),
nodeType=(str, PAGE),
),
)
PAGE_MODEL_DEFAULT = "I"
SECTION_MODELS = dict(
I=dict(
levels=(list, [FOLDER, FILE, CHUNK]),
drillDownDivs=(bool, True),
backMatter=(str, "backmatter"),
),
II=dict(
levels=(list, [CHAPTER, CHUNK]),
element=(str, "head"),
attributes=(dict, {}),
),
)
"""Models for sections.
A section is a part of the corpus that is defined by a set of files,
or by elements within a single TEI source file.
A model specifies which section levels there are and how they are derived:
model I takes them from the folder and file structure of the source,
model II from elements within a single TEI file.
"""
SECTION_MODEL_DEFAULT = "I"
"""Default model for sections.
"""
CM_LIT = "literal"
"""The value is taken literally from a TEI attribute.
Code `tei`, since there is a 1-1 correspondence with the TEI source.
"""
CM_LITP = "literal-processed"
"""The value results from straightforward processing of material in the TEI.
Code `tei`, since there is a direct correspondence with the TEI source.
*Straightforward* means: by taking into account the semantics of XML.
Examples:
* Generated white-space based on whether elements are pure or mixed;
* Edges between parent and child elements, or sibling elements.
"""
CM_LITC = "literal-composed"
"""The value is results from more intricate processing of material in the TEI.
*More intricate means*: we derive data that goes beyond pure XML syntax.
Examples:
* The values of the `rend` attributes are translated into `rend_`*value* features;
* Adding features `is_meta` (being inside the TEI-header) and `is_note`
(being inside a note);
* The feature that gives the content of a (character) slot;
* Decomposing strings into word material and after-word material.
Code `tf`, since this is for the benefit of the resulting TF dataset.
"""
CM_PROV = "provided"
"""The value is added by the conversion to TF w.r.t. the material in the TEI.
Examples:
* Slots in empty elements, in order to anchor the element to the text sequence;
* Section levels, based on the folder and file that the TEI source is in;
* A section level within the TEI, defined from several elements and the way they
are nested;
Code `tf`, since this is for the benefit of the resulting TF dataset.
"""
CM_NLP = "nlp-generated"
"""The value is added by an NLP pipeline w.r.t. the material in the TEI.
Code `nlp`, since this comes from third party software.
Examples:
* The feature `nsent`, which gives the sentence number in the corpus.
Sentences are not encoded in the TEI, but detected by an NLP program such as spaCy.
"""
CONVERSION_METHODS = {
CM_LIT: "tei",
CM_LITP: "tei",
CM_LITC: "tf",
CM_PROV: "tf",
CM_NLP: "nlp",
}
"""Information about the conversion.
When we produce TF features, we specify a bit of information in the feature
metadata as to how we arrived at the specific value.
That information ends up in two keys:
* `conversionMethod`: with values any of:
* `CM_LIT`
* `CM_LITP`
* `CM_LITC`
* `CM_PROV`
* `CM_NLP`
* `conversionCode`: the value is derived from `conversionMethod` by looking it
up in this table. These values can be used to qualify the name of the attribute
for further processing.
For example, if you have a feature `n` that originates literally from the TEI,
you could pass it on as `tei:n`.
But if you have a feature `chapter` that is provided by the conversion,
you could pass it on as `tf:chapter`.
This passing on is a matter for other software that takes the generated TF as
input and processes it further, e.g. as annotations.
!!! note "More methods and codes"
The TEI conversion is customizable by providing your own methods to several hooks
in the program. These hooks may generate extra features, which you can give metadata
in the `tei.yaml` file next to the `tei.py` file where you define the custom functions.
It is advised to state appropriate values for the `conversionMethod` and
`conversionCode` fields of these features.
Examples:
* A feature `country` is derived from specific elements in the TEI Header, and
defined for nodes of type `letter`.
This happens in order to support the software of Team Text that shows the
text on a webpage.
In such a case you could define
* `conversionMethod="derived"`
* `conversionCode="tt"`
"""
TOKEN_RE = re.compile(r"""\w+|\W""")
NUMBER_RE = re.compile(
r"""
^
[0-9]+
(?:
[.,]
[0-9]+
)*
$
""",
re.X,
)
W_BEFORE = re.compile(r"""^\s+""")
W_AFTER = re.compile(r"""\s+$""")
def getWhites(text):
match = W_BEFORE.match(text)
if match:
before = match.group(0)
rest = text[len(before) :]
else:
before = ""
rest = text
match = W_AFTER.search(rest)
if match:
after = match.group(0)
material = rest[0 : -len(after)]
else:
after = ""
material = rest
return (" " if before else "", material, " " if after else "")
def tokenize(line):
tokens = []
for word in line.split():
ts = (
[[word, ""]]
if NUMBER_RE.match(word)
else [[t, ""] for t in TOKEN_RE.findall(word)]
)
if len(ts):
ts[-1][-1] = " "
tokens.extend(ts)
if len(tokens):
tokens[-1][-1] = ""
return tuple(tokens)
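# Illustrative: numbers (NUMBER_RE) stay whole, other words are split into
# word/non-word tokens (TOKEN_RE); only the last token of each word keeps a
# trailing space, and the line-final token loses it:
#   tokenize("don't panic") == (["don", ""], ["'", ""], ["t", " "], ["panic", ""])
#   tokenize("12.5 km")     == (["12.5", " "], ["km", ""])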
def repTokens(tokens):
text = []
for t, space in tokens:
text.append(f"‹{t}›{space}")
return "".join(text)
def checkModel(kind, thisModel, verbose):
modelDefault = (
LINE_MODEL_DEFAULT
if kind == LINE
else PAGE_MODEL_DEFAULT if kind == PAGE else SECTION_MODEL_DEFAULT
)
modelSpecs = (
LINE_MODELS if kind == LINE else PAGE_MODELS if kind == PAGE else SECTION_MODELS
)
if thisModel is None:
model = modelDefault
if verbose == 1:
console(f"WARNING: No {kind} model specified. Assuming model {model}.")
properties = {k: v[1] for (k, v) in modelSpecs[model].items()}
return dict(model=model, properties=properties)
if type(thisModel) is str:
if thisModel in modelSpecs:
thisModel = dict(model=thisModel)
else:
console(f"ERROR: unknown {kind} model: {thisModel}")
return False
elif type(thisModel) is not dict:
console(f"ERROR: {kind} model must be a dict. You passed a {type(thisModel)}")
return False
model = thisModel.get("model", None)
if model is None:
model = modelDefault
if verbose == 1:
console(f"WARNING: No {kind} model specified. Assuming model {model}.")
thisModel["model"] = model
if model not in modelSpecs:
console(f"WARNING: unknown {kind} model: {thisModel}")
return False
if verbose >= 0:
console(f"{kind} model is {model}")
properties = {k: v for (k, v) in thisModel.items() if k != "model"}
modelProperties = modelSpecs[model]
good = True
delKeys = []
for k, v in properties.items():
if k not in modelProperties:
console(f"WARNING: ignoring unknown {kind} model property {k}={v}")
delKeys.append(k)
elif type(v) is not modelProperties[k][0]:
console(
f"ERROR: {kind} property {k} should have type {modelProperties[k][0]}"
f" but {v} has type {type(v)}"
)
good = False
if good:
for k in delKeys:
del properties[k]
for k, v in modelProperties.items():
if k not in properties:
if verbose == 1:
console(
f"WARNING: {kind} model property {k} not specified, "
f"taking default {v[1]}"
)
properties[k] = v[1]
if not good:
return False
return dict(model=model, properties=properties)
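# Illustrative (assumed call): normalize a user-supplied section model spec;
# properties left unspecified get their defaults from SECTION_MODELS:
#   checkModel("section", dict(model="II", element="div"), -1)
#   == dict(
#       model="II",
#       properties=dict(element="div", levels=["chapter", "chunk"], attributes={}),
#   )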
def matchModel(properties, tag, atts):
if tag == properties["element"]:
criticalAtts = properties["attributes"]
match = True
for k, cVal in criticalAtts.items():
aVal = atts.get(k, None)
thisNoMatch = (
all(aVal != cV for cV in cVal)
if type(cVal) in {list, tuple, set}
else aVal != cVal
)
if thisNoMatch:
match = False
break
return match
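# Illustrative: with properties = dict(element="div", attributes=dict(type="original")):
#   matchModel(properties, "div", dict(type="original"))  -> True
#   matchModel(properties, "div", dict(type="copy"))      -> False
#   matchModel(properties, "p", dict())                   -> None (falsy: element differs)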
def setUp(kind):
helpText = f"""
Convert {kind} to TF.
There are also commands to check the {kind} and to load the resulting TF."""
taskSpec = dict(
check="reports on the elements in the source",
convert=f"converts {kind} to TF",
load="loads the generated TF",
app="configures the TF app for the result",
apptoken="modifies the TF app to make it token- instead of character-based",
browse="starts the TF browser on the result",
)
paramSpec = {
"tf": (
(
"0 or latest: update latest version;\n\t\t"
"1 2 3: increase major, intermediate, minor TF version;\n\t\t"
"rest: explicit version."
),
"latest",
),
"sourceBase": (
(
"empty: refDir/{kind.lower()};\n\t\t"
"any directory of choice."
),
"",
),
"reportDir": (
(
"empty: refDir/report;\n\t\t"
"any directory of choice."
),
"",
),
kind.lower(): (
(
"0 or latest: latest version;\n\t\t"
"-1 -2 etc: previous version, before previous, ...;\n\t\t"
"1 2 etc: first version, second version, ...;\n\t\t"
"rest: explicit version."
),
"latest",
),
"validate": ("Whether to validate the XML input", True),
}
flagSpec = dict(
verbose=("Produce less or more progress and reporting messages", -1, 3),
doc=("Do only this document", None, 0),
)
return (helpText, taskSpec, paramSpec, flagSpec)
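# Illustrative (assumed usage): these four pieces configure a generic
# command-line wrapper around the conversion tasks:
#   (helpText, taskSpec, paramSpec, flagSpec) = setUp("TEI")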
def tweakTrans(
template,
procins,
wordAsSlot,
tokenAsSlot,
charAsSlot,
parentEdges,
siblingEdges,
tokenBased,
sectionModel,
sectionProperties,
rendDesc,
extra,
):
if wordAsSlot:
slot = WORD
slotc = "Word"
slotf = "words"
xslot = "`word`"
elif charAsSlot:
slotc = "Char"
slot = CHAR
slotf = "characters"
xslot = "`char` and `word`"
elif tokenAsSlot or True:
slotc = "Token"
slot = T
slotf = "tokens"
xslot = "`t` and `word`"
if parentEdges:
hasParent = "Yes"
else:
hasParent = "No"
if siblingEdges:
hasSibling = "Yes"
else:
hasSibling = "No"
if tokenBased:
slot = TOKEN
slotc = "Token"
slotf = "tokens"
xslot = "`token`"
tokenGen = dedent(
"""
Tokens and sentence boundaries have been generated by a Natural Language
Processing pipeline, such as spaCy.
"""
)
tokenWord = "token"
hasToken = "Yes"
else:
tokenGen = ""
tokenWord = "word"
hasToken = "No"
if extra:
hasExtra = "Yes"
else:
hasExtra = "No"
if procins:
doProcins = "Yes"
else:
doProcins = "No"
levelNames = sectionProperties["levels"]
if sectionModel == "II":
nLevels = "2"
chapterSection = levelNames[0]
chunkSection = levelNames[1]
head = sectionProperties["element"]
attributes = sectionProperties["attributes"]
propertiesRaw = repr(sectionProperties)
properties = (
"".join(
f"\t*\t`{att}` = `{val}`\n" for (att, val) in sorted(attributes.items())
)
if attributes
else "\t*\t*no attribute properties*\n"
)
else:
nLevels = "3"
folderSection = levelNames[0]
fileSection = levelNames[1]
chunkSection = levelNames[2]
rendDescStr = "\n".join(
f"`{val}` | {desc}" for (val, desc) in sorted(rendDesc.items())
)
modelKeepRe = re.compile(rf"«(?:begin|end)Model{sectionModel}»")
modelRemoveRe = re.compile(r"«beginModel([^»]+)».*?«endModel\1»", re.S)
slotKeepRe = re.compile(rf"«(?:begin|end)Slot{slot}»")
slotRemoveRe = re.compile(r"«beginSlot([^»]+)».*?«endSlot\1»", re.S)
tokenKeepRe = re.compile(rf"«(?:begin|end)Token{hasToken}»")
tokenRemoveRe = re.compile(r"«beginToken([^»]+)».*?«endToken\1»", re.S)
parentKeepRe = re.compile(rf"«(?:begin|end)Parent{hasParent}»")
parentRemoveRe = re.compile(r"«beginParent([^»]+)».*?«endParent\1»", re.S)
siblingKeepRe = re.compile(rf"«(?:begin|end)Sibling{hasSibling}»")
siblingRemoveRe = re.compile(r"«beginSibling([^»]+)».*?«endSibling\1»", re.S)
extraKeepRe = re.compile(rf"«(?:begin|end)Extra{hasExtra}»")
extraRemoveRe = re.compile(r"«beginExtra([^»]+)».*?«endExtra\1»", re.S)
procinsKeepRe = re.compile(rf"«(?:begin|end)Procins{doProcins}»")
procinsRemoveRe = re.compile(r"«beginProcins([^»]+)».*?«endProcins\1»", re.S)
skipVars = re.compile(r"«[^»]+»")
text = (
template.replace("«slot»", slot)
.replace("«Slot»", slotc)
.replace("«slotf»", slotf)
.replace("«char and word»", xslot)
.replace("«tokenWord»", tokenWord)
.replace("«token generation»", tokenGen)
.replace("«nLevels»", nLevels)
.replace("«sectionModel»", sectionModel)
.replace("«rendDesc»", rendDescStr)
.replace("«extraFeatures»", extra)
)
if sectionModel == "II":
text = (
text.replace("«head»", head)
.replace("«properties»", properties)
.replace("«propertiesRaw»", propertiesRaw)
.replace("«chapter»", chapterSection)
.replace("«chunk»", chunkSection)
)
else:
text = (
text.replace("«folder»", folderSection)
.replace("«file»", fileSection)
.replace("«chunk»", chunkSection)
)
text = parentKeepRe.sub("", text)
text = parentRemoveRe.sub("", text)
text = siblingKeepRe.sub("", text)
text = siblingRemoveRe.sub("", text)
text = tokenKeepRe.sub("", text)
text = tokenRemoveRe.sub("", text)
text = modelKeepRe.sub("", text)
text = modelRemoveRe.sub("", text)
text = slotKeepRe.sub("", text)
text = slotRemoveRe.sub("", text)
text = extraKeepRe.sub("", text)
text = extraRemoveRe.sub("", text)
text = procinsKeepRe.sub("", text)
text = procinsRemoveRe.sub("", text)
text = skipVars.sub("", text)
if extra:
text += dedent(
f"""
# Additional features
{extra}
"""
)
return text
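# Illustrative (hypothetical template fragment): blocks guarded by matching
# «begin…»/«end…» markers are kept, with the markers stripped, when they
# correspond to the active model/slot/flags, and are removed wholesale
# otherwise. Given
#   «beginModelI»Sections are folders and files.«endModelI»
#   «beginModelII»Sections are chapters within one file.«endModelII»
# and sectionModel="II", only the second sentence survives.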
def lookupSource(cv, cur, tokenAsSlot, specs):
"""Looks up information from the current XML stack.
The current XML stack contains the ancestry of the current node, including
the current node itself.
It is a list of components, corresponding to the path from the root node to the
current node.
Each component is a tuple, consisting of the tag name and the attributes of
an XML node.
Against this stack a sequence of instructions, given in `specs`, is executed.
These instructions collect information from the stack, under certain conditions,
and put that information into a feature, as value for a certain node.
Here is an example of a single instruction (hypothetical, for illustration):
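    (
        (("persName@ref", None), ("correspAction", {"type": "sent"})),
        "letter",
        "sender",
    )
This matches when the current element is a `persName` directly below a
`correspAction` element with attribute `type="sent"`; the value of the
`ref` attribute of the `persName` is then stored in the feature `sender`
of the current TF node of type `letter`.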
Parameters
----------
cv: object
The converter object, needed to issue actions.
cur: dict
Various pieces of data collected during walking
and relevant for some next steps in the walk.
tokenAsSlot: boolean
Whether the basic unit (slot) of the dataset is the token or the character;
this determines how the text content of a matched element is assembled.
specs: tuple
A sequence of instructions what to look for.
Each instruction has the following parts:
* `pathSpec`
* `nodeType`
* `featureName`
The effect is:
The `pathSpec` is compared to the current XML stack.
If it matches the current node, the text content of the current node or one of
its attributes will be collected and put in a feature with name
`featureName`, for the current TF node of type `nodeType`.
The `pathSpec` is a list of components.
The first component should match the top of the XML stack, the second
component the element that is below the top, etc.
Each component is a tuple of
* a tag name;
* a dictionary of attribute values;
The first component may have a tag name that has `@` plus an attribute name
appended to it. That means that the information will be extracted from
that attribute, not from the content of the element.
"""
nest = cur[XNEST]
nNest = len(nest)
for path, nodeType, feature in specs:
nPath = len(path)
if nPath > nNest:
continue
ok = True
extractAttr = None
for p, (lookForTag, lookForAtts) in enumerate(path):
(compareToTag, compareToAtts) = nest[-(p + 1)]
if p == 0:
pieces = lookForTag.split("@", 1)
if len(pieces) == 2:
(lookForTag, extractAttr) = pieces
else:
extractAttr = None
ok = compareToTag == lookForTag
if not ok:
break
if lookForAtts is not None:
for att, val in lookForAtts.items():
if att not in compareToAtts or compareToAtts[att] != val:
ok = False
break
if not ok:
break
if not ok:
continue
targetNode = cur[NODE][nodeType]
sourceNode = cur[TNEST][-1]
slots = cv.linked(sourceNode)
sourceText = (
(
"".join(
cv.get("str", (T, slot)) + cv.get("after", (T, slot))
for slot in slots
)
if tokenAsSlot
else "".join(cv.get("ch", (CHAR, slot)) for slot in slots)
)
if extractAttr is None
else cv.get(extractAttr, sourceNode)
)
sourceText = (sourceText or "").strip()
source = {feature: sourceText}
cv.feature(targetNode, **source)
def parseIIIF(settings, prod, selector):
"""Parse the iiif yaml file.
We fill in the parameters.
"""
def applySwitches(prod, constants, switches):
for k, v in switches["prod" if prod else "dev"].items():
constants[k] = v
return constants
def substituteConstants(data, macros, constants):
tpd = type(data)
if tpd is str:
for k, v in macros.items():
pattern = f"<{k}>"
data = data.replace(pattern, str(v))
for k, v in constants.items():
pattern = f"«{k}»"
if type(v) is int and data == pattern:
data = v
break
else:
data = data.replace(pattern, str(v))
return data
if tpd is list:
return [substituteConstants(item, macros, constants) for item in data]
if tpd is dict:
return {
k: substituteConstants(v, macros, constants) for (k, v) in data.items()
}
return data
constants = applySwitches(prod, settings["constants"], settings["switches"])
macros = applySwitches(prod, settings["macros"], settings["switches"])
return AttrDict(
{
x: substituteConstants(xText, macros, constants)
for (x, xText) in settings[selector].items()
}
)
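# Illustrative settings (hypothetical keys and values):
#   switches: dev:  {server: "http://localhost:8000"}
#             prod: {server: "https://example.org"}
#   constants: {server: ""}        # overridden by the dev/prod switch
#   macros: {base: "iiif"}
# With prod=False, a template value "«server»/<base>/manifest.json" in the
# selected section becomes "http://localhost:8000/iiif/manifest.json":
# macros (<...>) are substituted first, then constants («...»).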
def operationalize(data):
scanInfo = {}
for extraFeat, featInfo in data.items():
for nodeType, info in featInfo.items():
variables = info["vars"]
value = info["value"]
newVars = {}
for name, val in variables.items():
if val.endswith("-1"):
newVal = val[0:-2]
shift = -1
elif val.endswith("+1"):
newVal = val[0:-2]
shift = 1
else:
newVal = val
shift = 0
feat = tuple(newVal.split(".", 1))
if len(feat) == 1:
parent = None
child = None
feat = feat[0]
else:
parent, feat = feat
if parent.startswith("-"):
child = parent[1:]
parent = None
else:
child = None
newVars[name] = (parent, child, feat, shift)
scanInfo.setdefault(nodeType, []).append((extraFeat, value, newVars))
return scanInfo
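# Illustrative (hypothetical input): a computed feature `fullNum` on `line`
# nodes, combining the chapter number with the line number:
#   data = {"fullNum": {"line": {
#       "vars": {"chNum": "chapter.n", "lnNum": "n"},
#       "value": "{chNum}:{lnNum}",
#   }}}
#   operationalize(data) == {"line": [("fullNum", "{chNum}:{lnNum}", {
#       "chNum": ("chapter", None, "n", 0),   # look up `n` on the parent chapter
#       "lnNum": (None, None, "n", 0),        # look up `n` on the node itself
#   })]}
# A trailing "+1" or "-1" on a variable becomes a shift of 1 or -1, and a
# leading "-" on the node part marks it as a child rather than a parent.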
def fillinIIIF(data, **kwargs):
tpd = type(data)
if tpd is str:
for k, v in kwargs.items():
pattern = "{" + k + "}"
if type(v) is int and data == pattern:
data = v
break
else:
data = data.replace(pattern, str(v))
return data
if tpd is list:
return [fillinIIIF(item, **kwargs) for item in data]
if tpd is dict:
return {k: fillinIIIF(v, **kwargs) for (k, v) in data.items()}
return data
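# Illustrative:
#   fillinIIIF({"width": "{w}", "label": "page {n}"}, w=800, n=3)
#   == {"width": 800, "label": "page 3"}
# An exact match of an integer-valued placeholder (here "{w}") preserves the
# int type; everywhere else values are interpolated as strings.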