Module tf.browser.ner.match
Match functions.
To see how this fits among all the modules of this package, see
tf.browser.ner.annotate
.
Expand source code Browse git
"""Match functions.
To see how this fits among all the modules of this package, see
`tf.browser.ner.annotate` .
"""
from .settings import NONE
def occMatch(getTokens, b, qTokenSet, results):
    """Finds the occurrences of multiple sequences of tokens in a single bucket.

    Tokens of the bucket whose string is `None`, empty or white-space only
    are ignored while matching, but their slots are still part of the
    slot range that is reported for an occurrence that spans them.

    Parameters
    ----------
    getTokens: function
        See `tf.browser.ner.corpus.Corpus.getTokens`
    b: integer
        The node of the bucket in question
    qTokenSet: set
        A set of sequences of tokens. Each sequence in the set will be used as a
        search pattern, and its occurrences in the bucket are collected.
        Empty sequences never match and are skipped.
    results: dict
        A dictionary to collect the results in; it is modified in place.
        Keyed by each member of parameter `qTokenSet` the values are
        the occurrences of that member in the bucket.
        A single occurrence is represented as a tuple of slots.
        Members without occurrences do not get a key.
    """
    bTokens = [x for x in getTokens(b) if (x[1] or "").strip()]
    bStrings = {s for (t, s) in bTokens}
    nBTokens = len(bTokens)

    for qTokens in qTokenSet:
        nTokens = len(qTokens)

        if nTokens == 0:
            # an empty pattern has no first token to anchor on;
            # without this guard `qTokens[0]` would raise an IndexError
            continue

        if any(s not in bStrings for s in qTokens):
            # cheap rejection: some token of the pattern is absent from the bucket
            continue

        # only start positions that leave room for the whole pattern
        for i in range(nBTokens - nTokens + 1):
            if all(bTokens[i + j][1] == q for (j, q) in enumerate(qTokens)):
                firstT = bTokens[i][0]
                lastT = bTokens[i + nTokens - 1][0]
                # the range also covers slots of skipped (empty) tokens in between
                slots = tuple(range(firstT, lastT + 1))
                results.setdefault(qTokens, []).append(slots)
def entityMatch(
    entityIndex,
    eStarts,
    entitySlotVal,
    entitySlotAll,
    entitySlotIndex,
    getTextR,
    getTokens,
    b,
    bFindRe,
    anyEnt,
    eVals,
    qTokens,
    valSelect,
    freeState,
    fValStats,
):
    """Tests one bucket against the active filter criteria and collects matches.

    Empty tokens (string `None`, empty, or white-space only) are left out of
    all checks, but the complete token list is what gets returned.

    Parameters
    ----------
    entityIndex, eStarts, entitySlotVal, entitySlotAll, entitySlotIndex: object
        Various kinds of processed entity data, see `tf.browser.ner.data`
    getTextR: function
        See `tf.browser.ner.corpus.Corpus.getTextR`
    getTokens: function
        See `tf.browser.ner.corpus.Corpus.getTokens`
    b: integer
        The node of the bucket in question
    bFindRe, anyEnt, eVals, qTokens, valSelect, freeState: object
        As in `tf.browser.ner.annotate.Annotate.filterContent`
    fValStats: dict
        Keyed by feature name, valued by counters of feature values.
        The counters are incremented in place for every candidate occurrence
        that passes the `freeState` test.

    Returns
    -------
    tuple
        Members:

        *   `fits`: `None` if neither `bFindRe` nor `anyEnt` was applied,
            otherwise a boolean telling whether the bucket passes them;
        *   `(tokens, matches, positions)`:
            *   `tokens`: all tokens of the bucket, each token is a tuple
                consisting of its slot number (position) and string value;
            *   `matches`: a list of slot tuples, one for each accepted
                occurrence of `eVals` or else of `qTokens` in the bucket;
            *   `positions`: the set of character positions in the text of
                the bucket that are covered by matches of `bFindRe`.
    """
    positions = set()
    fits = None

    if bFindRe:
        fits = False
        sText = getTextR(b)

        for hit in bFindRe.finditer(sText):
            positions.update(range(hit.start(), hit.end()))
            fits = True

    bTokensAll = getTokens(b)
    bTokens = [tok for tok in bTokensAll if (tok[1] or "").strip()]

    if (fits is None or fits) and anyEnt is not None:
        containsEntities = any(t in entitySlotAll for (t, w) in bTokens)
        fits = containsEntities if anyEnt else not containsEntities

    matches = []
    # `matches` is filled in place below, so this tuple can serve every exit
    result = (fits, (bTokensAll, matches, positions))

    def slotsAreFreeOK(slots):
        # No constraint when freeState is None; otherwise the slots must be
        # unbound when freeState is truthy and bound when it is falsy.
        if freeState is None:
            return True
        bound = any(slot in entitySlotIndex for slot in slots)
        return (not bound) if freeState else bound

    if eVals is not None:
        for (t, w) in bTokens:
            lastT = eStarts.get(t)

            if lastT is None:
                continue

            slots = tuple(range(t, lastT + 1))

            if not slotsAreFreeOK(slots):
                continue

            for stats in fValStats.values():
                for val in eVals:
                    stats[val] += 1

            valOK = valSelect is None or all(
                val in valSelect[feat] for (feat, val) in zip(fValStats, eVals)
            )

            if valOK:
                matches.append(slots)

        return result

    if qTokens is None:
        return result

    nTokens = len(qTokens)

    if not nTokens:
        return result

    bStrings = {s for (t, s) in bTokens}

    if not all(q in bStrings for q in qTokens):
        return result

    nBTokens = len(bTokens)

    for (i, (t, w)) in enumerate(bTokens):
        if w != qTokens[0]:
            continue

        if i + nTokens - 1 >= nBTokens:
            # the pattern cannot fit here, nor at any later start position
            return result

        if not all(bTokens[i + j][1] == qTokens[j] for j in range(1, nTokens)):
            continue

        lastT = bTokens[i + nTokens - 1][0]
        slots = tuple(range(t, lastT + 1))

        if not slotsAreFreeOK(slots):
            continue

        for (feat, stats) in fValStats.items():
            vals = entityIndex[feat].get(slots, set())

            if len(vals):
                for val in vals:
                    stats[val] += 1
            else:
                stats[NONE] += 1

        valTuples = entitySlotVal.get(slots, set())

        if len(valTuples):
            # at least one value tuple must be fully covered by the selection
            valOK = valSelect is not None and any(
                all(val in valSelect[feat] for (feat, val) in zip(fValStats, valTuple))
                for valTuple in valTuples
            )
        else:
            valOK = valSelect is None or all(
                NONE in valSelect[feat] for feat in fValStats
            )

        if valOK:
            matches.append(slots)

    return result
Functions
def entityMatch(entityIndex, eStarts, entitySlotVal, entitySlotAll, entitySlotIndex, getTextR, getTokens, b, bFindRe, anyEnt, eVals, qTokens, valSelect, freeState, fValStats)
-
Checks whether a bucket satisfies a variety of criteria.
When we do the checking, we ignore empty tokens in the bucket.
Parameters
entityIndex
,eStarts
,entitySlotVal
,entitySlotAll
,entitySlotIndex
:object
- Various kinds of processed entity data, see
tf.browser.ner.data
getTextR
:function
- See
Corpus.getTextR
getTokens
:function
- See
Corpus.getTokens
b
:integer
- The node of the bucket in question
bFindRe
,anyEnt
,eVals
,qTokens
,valSelect
,freeState
:object
- As in
Annotate.filterContent()
Returns
tuple
-
Members:
fits
: boolean, whether the bucket passes the filter
(tokens, matches, positions)
: a tuple with the following members:
tokens
: all tokens of the bucket, each token is a tuple consisting of its slot number (position) and string value;
matches
: a list of the positions of the found occurrences for the qTokens and / or eVals in the bucket;
positions
: a set of character positions in the text of the bucket that are covered by matches of bFindRe;
Expand source code Browse git
def entityMatch(
    entityIndex,
    eStarts,
    entitySlotVal,
    entitySlotAll,
    entitySlotIndex,
    getTextR,
    getTokens,
    b,
    bFindRe,
    anyEnt,
    eVals,
    qTokens,
    valSelect,
    freeState,
    fValStats,
):
    """Tests one bucket against the active filter criteria and collects matches.

    Empty tokens (string `None`, empty, or white-space only) are left out of
    all checks, but the complete token list is what gets returned.

    Parameters
    ----------
    entityIndex, eStarts, entitySlotVal, entitySlotAll, entitySlotIndex: object
        Various kinds of processed entity data, see `tf.browser.ner.data`
    getTextR: function
        See `tf.browser.ner.corpus.Corpus.getTextR`
    getTokens: function
        See `tf.browser.ner.corpus.Corpus.getTokens`
    b: integer
        The node of the bucket in question
    bFindRe, anyEnt, eVals, qTokens, valSelect, freeState: object
        As in `tf.browser.ner.annotate.Annotate.filterContent`
    fValStats: dict
        Keyed by feature name, valued by counters of feature values.
        The counters are incremented in place for every candidate occurrence
        that passes the `freeState` test.

    Returns
    -------
    tuple
        Members:

        *   `fits`: `None` if neither `bFindRe` nor `anyEnt` was applied,
            otherwise a boolean telling whether the bucket passes them;
        *   `(tokens, matches, positions)`:
            *   `tokens`: all tokens of the bucket, each token is a tuple
                consisting of its slot number (position) and string value;
            *   `matches`: a list of slot tuples, one for each accepted
                occurrence of `eVals` or else of `qTokens` in the bucket;
            *   `positions`: the set of character positions in the text of
                the bucket that are covered by matches of `bFindRe`.
    """
    positions = set()
    fits = None

    if bFindRe:
        fits = False
        sText = getTextR(b)

        for hit in bFindRe.finditer(sText):
            positions.update(range(hit.start(), hit.end()))
            fits = True

    bTokensAll = getTokens(b)
    bTokens = [tok for tok in bTokensAll if (tok[1] or "").strip()]

    if (fits is None or fits) and anyEnt is not None:
        containsEntities = any(t in entitySlotAll for (t, w) in bTokens)
        fits = containsEntities if anyEnt else not containsEntities

    matches = []
    # `matches` is filled in place below, so this tuple can serve every exit
    result = (fits, (bTokensAll, matches, positions))

    def slotsAreFreeOK(slots):
        # No constraint when freeState is None; otherwise the slots must be
        # unbound when freeState is truthy and bound when it is falsy.
        if freeState is None:
            return True
        bound = any(slot in entitySlotIndex for slot in slots)
        return (not bound) if freeState else bound

    if eVals is not None:
        for (t, w) in bTokens:
            lastT = eStarts.get(t)

            if lastT is None:
                continue

            slots = tuple(range(t, lastT + 1))

            if not slotsAreFreeOK(slots):
                continue

            for stats in fValStats.values():
                for val in eVals:
                    stats[val] += 1

            valOK = valSelect is None or all(
                val in valSelect[feat] for (feat, val) in zip(fValStats, eVals)
            )

            if valOK:
                matches.append(slots)

        return result

    if qTokens is None:
        return result

    nTokens = len(qTokens)

    if not nTokens:
        return result

    bStrings = {s for (t, s) in bTokens}

    if not all(q in bStrings for q in qTokens):
        return result

    nBTokens = len(bTokens)

    for (i, (t, w)) in enumerate(bTokens):
        if w != qTokens[0]:
            continue

        if i + nTokens - 1 >= nBTokens:
            # the pattern cannot fit here, nor at any later start position
            return result

        if not all(bTokens[i + j][1] == qTokens[j] for j in range(1, nTokens)):
            continue

        lastT = bTokens[i + nTokens - 1][0]
        slots = tuple(range(t, lastT + 1))

        if not slotsAreFreeOK(slots):
            continue

        for (feat, stats) in fValStats.items():
            vals = entityIndex[feat].get(slots, set())

            if len(vals):
                for val in vals:
                    stats[val] += 1
            else:
                stats[NONE] += 1

        valTuples = entitySlotVal.get(slots, set())

        if len(valTuples):
            # at least one value tuple must be fully covered by the selection
            valOK = valSelect is not None and any(
                all(val in valSelect[feat] for (feat, val) in zip(fValStats, valTuple))
                for valTuple in valTuples
            )
        else:
            valOK = valSelect is None or all(
                NONE in valSelect[feat] for feat in fValStats
            )

        if valOK:
            matches.append(slots)

    return result
def occMatch(getTokens, b, qTokenSet, results)
-
Finds the occurrences of multiple sequences of tokens in a single bucket.
Parameters
getTokens
:function
- See
Corpus.getTokens
b
:integer
- The node of the bucket in question
qTokenSet
:set
- A set of sequences of tokens. Each sequence in the set will be used as a search pattern, and its occurrences in the bucket are collected.
results
:dict
- A dictionary to collect the results in.
Keyed by each member of parameter
qTokenSet
the values are the occurrences of that member in the corpus. A single occurrence is represented as a tuple of slots.
Expand source code Browse git
def occMatch(getTokens, b, qTokenSet, results):
    """Finds the occurrences of multiple sequences of tokens in a single bucket.

    Tokens of the bucket whose string is `None`, empty or white-space only
    are ignored while matching, but their slots are still part of the
    slot range that is reported for an occurrence that spans them.

    Parameters
    ----------
    getTokens: function
        See `tf.browser.ner.corpus.Corpus.getTokens`
    b: integer
        The node of the bucket in question
    qTokenSet: set
        A set of sequences of tokens. Each sequence in the set will be used as a
        search pattern, and its occurrences in the bucket are collected.
        Empty sequences never match and are skipped.
    results: dict
        A dictionary to collect the results in; it is modified in place.
        Keyed by each member of parameter `qTokenSet` the values are
        the occurrences of that member in the bucket.
        A single occurrence is represented as a tuple of slots.
        Members without occurrences do not get a key.
    """
    bTokens = [x for x in getTokens(b) if (x[1] or "").strip()]
    bStrings = {s for (t, s) in bTokens}
    nBTokens = len(bTokens)

    for qTokens in qTokenSet:
        nTokens = len(qTokens)

        if nTokens == 0:
            # an empty pattern has no first token to anchor on;
            # without this guard `qTokens[0]` would raise an IndexError
            continue

        if any(s not in bStrings for s in qTokens):
            # cheap rejection: some token of the pattern is absent from the bucket
            continue

        # only start positions that leave room for the whole pattern
        for i in range(nBTokens - nTokens + 1):
            if all(bTokens[i + j][1] == q for (j, q) in enumerate(qTokens)):
                firstT = bTokens[i][0]
                lastT = bTokens[i + nTokens - 1][0]
                # the range also covers slots of skipped (empty) tokens in between
                slots = tuple(range(firstT, lastT + 1))
                results.setdefault(qTokens, []).append(slots)