Module tf.ner.helpers
Auxiliary functions.
"""Auxiliary functions.
"""
import re
import unicodedata
# NB: besides `\s`, the character class is meant to contain invisible
# space-like characters that do not survive plain-text rendering; the
# zero-width characters given here explicitly are an assumption.
WHITE_STRIP_RE = re.compile(r"""(?:\s|[\u200b\u200c\u200d])+""")
def xstrip(x):
return WHITE_STRIP_RE.sub("", x)
WHITE_RE = re.compile(r"""\s+""", re.S)
NON_WORD = re.compile(r"""\W+""", re.S)
LOC_RE = re.compile(r"[.:@]")
PART_CUT_OFF = 8
"""Maximum length of parts of entity identifiers."""
PREFIX_PART = 5
SUFFIX_PART = PART_CUT_OFF - PREFIX_PART - 1
CUT_OFF = 40
"""Maximum length of entity identifiers."""
TOKEN_RE = re.compile(r"""\w+|\W""")
TO_ASCII_DEF = dict(
ñ="n",
ø="o",
ç="c",
)
"""Undecomposable UNICODE characters mapped to their related ASCII characters."""
TO_ASCII = {}
for u, a in TO_ASCII_DEF.items():
TO_ASCII[u] = a
TO_ASCII[u.upper()] = a.upper()
def normalize(text):
"""Normalize white-space in a text."""
return WHITE_RE.sub(" ", text).strip()
def toTokens(text, spaceEscaped=False, caseSensitive=False):
"""Split a text into tokens.
The text is split on white-space.
Tokens are further split into maximal segments of word characters
and individual non-word characters.
Parameters
----------
spaceEscaped: boolean, optional False
If True, a `_` in a token string is interpreted as an escaped space.
caseSensitive: boolean, optional False
If True, the case of the tokens is preserved; otherwise all tokens are lowercased.
Returns
-------
tuple of string
The sequence of tokens into which the text has been decomposed
"""
result = TOKEN_RE.findall(normalize(text))
result = tuple((t.replace("_", " ") for t in result) if spaceEscaped else result)
if not caseSensitive:
result = tuple(t.lower() for t in result)
return tuple(t for t in result if t != " ")
def fromTokens(tokens, spaceEscaped=False):
"""The inverse of `toTokens()`.
Applying `toTokens()` followed by `fromTokens()` is idempotent (provided you
do it case-sensitively).
So if you have to go back from tokens to text,
make sure that the text has first gone through a combination of `toTokens()`
and `fromTokens()`.
You can use `tnorm()` for that.
Parameters
----------
spaceEscaped: boolean, optional False
If True, all spaces in token strings will be escaped as `_`.
Returns
-------
string
The text reconstructed from the tokens.
"""
return " ".join(
tuple(t.replace(" ", "_") for t in tokens) if spaceEscaped else tokens
)
def tnorm(text, spaceEscaped=False, caseSensitive=False):
"""Normalize text.
Split a text into tokens and then recombine to text again.
This will result in a normalized version of the text with respect to whitespace.
Parameters
----------
spaceEscaped: boolean, optional False
If your corpus has tokens with spaces in them, pass True, otherwise False.
caseSensitive: boolean, optional False
If True, case is preserved; otherwise all uppercase will be
converted to lowercase.
Returns
-------
string
"""
return fromTokens(
toTokens(text, spaceEscaped=spaceEscaped, caseSensitive=caseSensitive),
spaceEscaped=spaceEscaped,
)
def toAscii(text):
"""Transforms a text with diacritical marks into a plain ASCII text.
Characters with diacritics are replaced by their base character.
Some characters with diacritics are considered by UNICODE to be undecomposable
characters, such as `ø` and `ñ`.
We use a table (`TO_ASCII_DEF`) to map these on their related ASCII characters.
Parameters
----------
text: string
The text to be translated
Returns
-------
string
The translated text.
"""
return "".join(
TO_ASCII.get(c, c)
for c in unicodedata.normalize("NFD", text)
if unicodedata.category(c) != "Mn"
)
def toId(text):
"""Transforms text to an identifier string.
Tokens are lower-cased, separated by `.`, reduced to ASCII.
Parameters
----------
text: string
The text to be transformed
Returns
-------
The identifier string based on `text`
"""
return NON_WORD.sub(".", toAscii(text.lower())).strip(".")
def toSmallId(text, transform={}):
"""Transforms text to a smaller identifier string.
As `toId()`, but now certain parts of the resulting identifier are
either left out or replaced by shorter strings.
This transformation is defined by the `transform` dictionary,
which ultimately is provided in the corpus-dependent
`ner/config.yaml`.
Parameters
----------
text: string
The text to be transformed
transform: dict, optional {}
Custom transformations to be applied; usually this is the omission of frequent
non-content words in the language.
Returns
-------
The identifier string based on `text`
"""
eid = toId(text)
if len(eid) <= CUT_OFF:
return eid
parts = [y for x in eid.split(".") if (y := transform.get(x, x))]
result = []
n = 0
for part in parts:
if len(part) > PART_CUT_OFF:
part = part[0:PREFIX_PART] + "~" + part[-SUFFIX_PART:]
nPart = len(part)
result.append(part)
n += nPart
if n > CUT_OFF:
break
return ".".join(result)
def findCompile(bFind, bFindC):
"""Compiles a regular expression out of a search pattern.
Parameters
----------
bFind: string
The search pattern as a plain string.
bFindC: boolean
Whether the search is case-sensitive.
Returns
-------
tuple
the white-space-stripped search pattern;
the regular expression object, if successful, otherwise None;
the error message if the re-compilation was not successful.
"""
bFind = (bFind or "").strip()
bFindFlag = [] if bFindC else [re.I]
bFindRe = None
errorMsg = ""
if bFind:
try:
bFindRe = re.compile(bFind, *bFindFlag)
except Exception as e:
errorMsg = str(e)
return (bFind, bFindRe, errorMsg)
Global variables

var CUT_OFF
    Maximum length of entity identifiers.

var PART_CUT_OFF
    Maximum length of parts of entity identifiers.

var TO_ASCII_DEF
    Undecomposable UNICODE characters mapped to their related ASCII characters.
Functions
def findCompile(bFind, bFindC)

Compiles a regular expression out of a search pattern.

Parameters
bFind: string
    The search pattern as a plain string.
bFindC: boolean
    Whether the search is case-sensitive.

Returns
tuple
    the white-space-stripped search pattern;
    the regular expression object, if successful, otherwise None;
    the error message if the re-compilation was not successful.
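
A minimal usage sketch (assuming the module is importable as `tf.ner.helpers`,
per the module path above); the search string is illustrative:

>>> from tf.ner.helpers import findCompile
>>> (pattern, patternRe, error) = findCompile("  jan  ", False)
>>> (pattern, error)
('jan', '')
>>> bool(patternRe.search("Jan Pietersz"))
True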
def fromTokens(tokens, spaceEscaped=False)

The inverse of `toTokens()`.

Applying `toTokens()` followed by `fromTokens()` is idempotent (provided you do
it case-sensitively). So if you have to go back from tokens to text, make sure
that the text has first gone through a combination of `toTokens()` and
`fromTokens()`. You can use `tnorm()` for that.

Parameters
spaceEscaped: boolean, optional False
    If True, all spaces in token strings will be escaped as `_`.

Returns
string
    The text reconstructed from the tokens.
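
An illustrative round trip with `toTokens()`:

>>> from tf.ner.helpers import toTokens, fromTokens
>>> tokens = toTokens("Jan  Pietersz,  schipper", caseSensitive=True)
>>> tokens
('Jan', 'Pietersz', ',', 'schipper')
>>> fromTokens(tokens)
'Jan Pietersz , schipper'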
def normalize(text)

Normalize white-space in a text.
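
For example:

>>> from tf.ner.helpers import normalize
>>> normalize("  Jan \t Pietersz  ")
'Jan Pietersz'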
def tnorm(text, spaceEscaped=False, caseSensitive=False)

Normalize text.

Split a text into tokens and then recombine to text again. This will result in
a normalized version of the text with respect to white-space.

Parameters
spaceEscaped: boolean, optional False
    If your corpus has tokens with spaces in them, pass True, otherwise False.
caseSensitive: boolean, optional False
    If True, case is preserved; otherwise all uppercase will be converted to
    lowercase.

Returns
string
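
An illustrative sketch of the effect of `caseSensitive`:

>>> from tf.ner.helpers import tnorm
>>> tnorm("Jan   Pietersz,  schipper")
'jan pietersz , schipper'
>>> tnorm("Jan   Pietersz,  schipper", caseSensitive=True)
'Jan Pietersz , schipper'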
def toAscii(text)

Transforms a text with diacritical marks into a plain ASCII text.

Characters with diacritics are replaced by their base character. Some
characters with diacritics are considered by UNICODE to be undecomposable
characters, such as `ø` and `ñ`. We use a table (`TO_ASCII_DEF`) to map these
on their related ASCII characters.

Parameters
text: string
    The text to be translated

Returns
string
    The translated text.
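
For example (illustrative names):

>>> from tf.ner.helpers import toAscii
>>> toAscii("café")
'cafe'
>>> toAscii("Søren Østergård")
'Soren Ostergard'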
def toId(text)

Transforms text to an identifier string.

Tokens are lower-cased, separated by `.`, reduced to ASCII.

Parameters
text: string
    The text to be transformed

Returns
    The identifier string based on `text`
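
For example (an illustrative name):

>>> from tf.ner.helpers import toId
>>> toId("Jan Pietersz. Coen")
'jan.pietersz.coen'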
def toSmallId(text, transform={})

Transforms text to a smaller identifier string.

As `toId()`, but now certain parts of the resulting identifier are either left
out or replaced by shorter strings.

This transformation is defined by the `transform` dictionary, which ultimately
is provided in the corpus-dependent `ner/config.yaml`.

Parameters
text: string
    The text to be transformed
transform: dict, optional {}
    Custom transformations to be applied; usually this is the omission of
    frequent non-content words in the language.

Returns
    The identifier string based on `text`
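
An illustrative sketch with a made-up long name and a hypothetical `transform`
dictionary (in practice the dictionary comes from the corpus-dependent
`ner/config.yaml`); long parts are shortened with a `~` and frequent
non-content words are dropped:

>>> from tf.ner.helpers import toSmallId
>>> text = "Gerrit de Heere van der Does, heer van Noordwijk"
>>> toSmallId(text, transform={"de": "", "van": "", "der": ""})
'gerrit.heere.does.heer.noord~jk'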
def toTokens(text, spaceEscaped=False, caseSensitive=False)

Split a text into tokens.

The text is split on white-space. Tokens are further split into maximal
segments of word characters and individual non-word characters.

Parameters
spaceEscaped: boolean, optional False
    If True, a `_` in a token string is interpreted as an escaped space.
caseSensitive: boolean, optional False
    If True, the case of the tokens is preserved; otherwise all tokens are
    lowercased.

Returns
tuple of string
    The sequence of tokens into which the text has been decomposed
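
For example (illustrative inputs; the second call shows the effect of
`spaceEscaped`):

>>> from tf.ner.helpers import toTokens
>>> toTokens("Jan  Pietersz. Coen")
('jan', 'pietersz', '.', 'coen')
>>> toTokens("New_York City", spaceEscaped=True, caseSensitive=True)
('New York', 'City')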
def xstrip(x)

Strips all white-space(-like) characters from a string.
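
For example:

>>> from tf.ner.helpers import xstrip
>>> xstrip("Jan  Pietersz\t")
'JanPietersz'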