Module tf.tools.myspacy

Get words and tokens from a plain text with the help of Spacy.

This module supposes that you have installed Spacy and the necessary language modules.

To get Spacy, do

pip install spacy

The Enlish language module can then be installed by

python -m spacy download en_core_web_sm

You can install many more language models.

Expand source code Browse git
"""Get words and tokens from a plain text with the help of Spacy.

This module supposes that you have installed Spacy and the necessary
language modules.

To get [Spacy](https://spacy.io), do

```
pip install spacy
```

The Enlish language module can then be installed by

```
python -m spacy download en_core_web_sm
```

You can install many more [language models](https://spacy.io/usage/models).
"""

import re
import spacy
from spacy.cli.download import download

from ..core.helpers import console


LANG_MODELS = """
ca core_news Catalan
da core_news Danish
de core_news German
el core_news Greek
en core_web English
es core_news Spanish
fi core_news Finnish
fr core_news French
hr core_news Croatian
it core_news Italian
ja core_news Japanese
ko core_news Korean
lt core_news Lithuanian
mk core_news Macedonian
nb core_news Norwegian (Bokmål)
nl core_news Dutch
pl core_news Polish
pt core_news Portuguese
ro core_news Romanian
ru core_news Russian
sv core_news Swedish
uk core_news Ukrainian
zh core_news Chinese
xx ent_wiki multi-language
""".strip().split("\n")
"""Languages and their associated Spacy models."""


class Spacy:
    def __init__(self, lang=None, parser=False):
        """Sets up an NLP (Natural Language Processing) pipeline.

        The pipeline is tied to a particular language model, which you can pass
        as a parameter, provided you have installed it.

        For now, we use Spacy in a fairly trivial way: only tokenization and sentence
        detection.
        We do not need the parser for this.

        Parameters
        ----------
        lang: string, optional xx
            The language to be used; Spacy may need to download it, if so, it will
            happen automatically.

            If the language is not supported by Spacy, we switch to the multilanguage
            called `xx`.

            See `tf.tools.myspacy.LANG_MODELS` about the language models that Spacy supports.
        """
        langModels = {}
        languages = {}

        for spec in LANG_MODELS:
            (lng, model, language) = spec.split(maxsplit=2)
            langModels[lng] = f"{lng}_{model}_sm"
            languages[lng] = language

        self.langModels = langModels
        self.languages = languages

        prevLang = None
        targetLang = lang
        loaded = False

        i = 0
        while True:
            i += 1
            targetModel = langModels.get(targetLang, None)
            targetLanguage = languages.get(targetLang, None)

            if targetModel is None:
                (prevLang, targetLang) = (targetLang, "xx")
                targetModel = langModels[targetLang]
                targetLanguage = languages[targetLang]

                if prevLang is None:
                    console("No language specified")
                else:
                    console(
                        f"No language model for {prevLang} supported by Spacy.\n"
                    )
                console(
                    f"Switching to the {targetLanguage} model"
                )
                if targetLang == prevLang:
                    break
                else:
                    continue

            try:
                nlp = spacy.load(targetModel)
                loaded = True
                break

            except Exception:
                console(f"Language model {targetModel} not installed. Downloading ...")

            try:
                console(f"Downloading {targetModel} ...")
                download(targetModel)
            except Exception:
                console(f"Could not download {targetModel} ...")
                (prevLang, targetLang) = (targetLang, "xx")
                if targetLang == prevLang:
                    break
                else:
                    continue

        console(f"NLP with language model {targetLang} {parser=}")

        if loaded:
            try:
                if not parser:
                    nlp.disable_pipe("parser")
                nlp.disable_pipe("sentencizer")
            except Exception:
                pass
            try:
                nlp.enable_pipe("senter")
                self.canSentence = True
            except Exception:
                self.canSentence = False
                console("This language does not support sentence boundary detection")

            if parser:
                try:
                    nlp.enable_pipe("tagger")
                    self.canTag = True
                    console("This language supports tagging")
                except Exception:
                    self.canTag = False
                    console("This language does not supports tagging")
                try:
                    nlp.enable_pipe("morphologizer")
                    self.canMorph = True
                    console("This language supports morphologizing")
                except Exception:
                    self.canMorph = False
                    console("This language does not supports morphologizing")
                try:
                    nlp.enable_pipe("lemmatizer")
                    self.canLemma = True
                    console("This language supports lemmatizing")
                except Exception:
                    self.canLemma = False
                    console("This language does not supports lemmatizing")
        else:
            console("Cannot load (language data) to get Spacy working")
            nlp = None

        self.nlp = nlp
        self.doc = None

    def read(self, text):
        """Process a plain text.

        A text is ingested and tokenized. Sentences are detected.
        This may require quite some processing time, think of 30 seconds for 200,000
        words on a decent laptop.

        Parameters
        ----------
        text: string
            The complete, raw text.
        """
        nText = len(text)
        nlp = self.nlp

        if nlp is None:
            console("The NLP pipeline is not functioning")
            return

        nlp.max_length = nText
        doc = nlp(text)
        self.doc = doc

    def getTokens(self):
        """Get the resulting tokens.

        A token is represented as a tuple consisting of

        *   *start*: first character position that the token occupies in the text.
            Character positions start at 0.
        *   *end*: last character position that the token occupies in the text
            *plus one*.
        *   *text*: text of the token, **excluding any trailing whitespace**.
        *   *space*: any white space behind the token, if present, otherwise
            the empty string.


        !!! note "End position and space"
            If there is a space behind the token, it will not add to the end position
            of the token. So the start and end positions of the tokens reflect
            where the tokens themselves are, and spaces do nto belong to the tokens.

        Returns
        -------
        list
            All tokens as tuples.
        """
        doc = self.doc
        if doc is None:
            console("No results available from the NLP pipeline")
            return []

        canTag = self.canTag
        canMorph = self.canMorph
        canLemma = self.canLemma

        result = []

        for token in doc:
            start = token.idx
            text = token.text
            space = token.whitespace_
            end = start + len(text)

            pos = token.pos_ if canMorph else token.tag_ if canTag else None
            morph = str(token.morph) if canMorph else None
            lemma = token.lemma_.strip().lower() if canLemma else None
            result.append((start, end, text, space, pos, morph, lemma))

        return result

    def getSentences(self):
        """Get the resulting sentences.

        A sentence is represented as a tuple consisting of

        *   *start*: first character position that the sentence occupies in the text.
            Character positions start at 0.
        *   *end*: last character position that the sentence occupies in the text
            *plus one*.
        *   *text*: text of the sentence.

        Returns
        -------
        list
            All sentences as tuples.
        """
        doc = self.doc
        if doc is None:
            console("No results available from the NLP pipeline")
            return []

        if not self.canSentence:
            console("No sentence results available from the NLP pipeline")
            return []

        result = []

        whiteRe = re.compile(r"^[.?!\s]*$", re.S)
        spuriousNlBefore = re.compile(r"\n+(\W)")
        spuriousNlAfter = re.compile(r"(\W)\n+")

        for s in doc.sents:
            text = s.text.strip("\n")
            if whiteRe.match(text):
                continue

            tokenStart = doc[s.start]
            tokenEnd = doc[s.end - 1]
            sentStart = tokenStart.idx
            sentEnd = tokenEnd.idx + len(tokenEnd.text)

            text = spuriousNlBefore.sub(r"\1", text)
            text = spuriousNlAfter.sub(r"\1", text)
            result.append((sentStart, sentEnd, text))

        return result

    def getEntities(self):
        """Get the resulting named entities.

        A named entity is represented as a tuple consisting of

        *   *start*: first character position that the entity occupies in the text.
            Character positions start at 0.
        *   *end*: last character position that the entity occupies in the text
            *plus one*.
        *   *text*: text of the entity.
        *   *kind*: kind of the entity.

        Returns
        -------
        list
            All entities as tuples.
        """
        doc = self.doc
        if doc is None:
            console("No results available from the NLP pipeline")
            return []

        if not hasattr(doc, "ents"):
            console("No entity results available from the NLP pipeline")
            return []

        result = []

        for ent in doc.ents:
            start = ent.start_char
            end = ent.end_char
            text = ent.text
            kind = ent.label_
            result.append((start, end, text, kind))

        return result


def nlpOutput(text, lang="en", ner=False, parser=False):
    """Runs the Spacy NLP pipeline and delivers the results.

    Parameters
    ----------
    text: string
        The complete, raw text.
    lang: string, optional en
        The language to be used; its model should be installed; see
        `tf.tools.myspacy` for how to get language models.
    ner: boolean, optional False
        Whether to include named entities in the output.
    parser: boolean, optional False
        Whether to run the NLP parser.

    Returns
    -------
    tuple
        `tokens`: the token list as tuples
        `sentences`: the sentence list as tuples
        `entities`: the entity list as tuples, only if `ner=True`

        Tokens are tuples (start, end, text, after).

        Sentences are tuples (start, end, text).

        Entities are tuples (start, end, text, kind).
    """
    S = Spacy(lang=lang, parser=parser)
    S.read(text)

    tokens = S.getTokens()
    sentences = S.getSentences()
    entities = S.getEntities() if ner else None

    return tuple(x for x in (tokens, sentences, entities) if x is not None)

Global variables

var LANG_MODELS

Languages and their associated Spacy models.

Functions

def nlpOutput(text, lang='en', ner=False, parser=False)

Runs the Spacy NLP pipeline and delivers the results.

Parameters

text : string
The complete, raw text.
lang : string, optional en
The language to be used; its model should be installed; see tf.tools.myspacy for how to get language models.
ner : boolean, optional False
Whether to include named entities in the output.
parser : boolean, optional False
Whether to run the NLP parser.

Returns

tuple

tokens: the token list as tuples sentences: the sentence list as tuples entities: the entity list as tuples, only if ner=True

Tokens are tuples (start, end, text, after).

Sentences are tuples (start, end, text).

Entities are tuples (start, end, text, kind).

Expand source code Browse git
def nlpOutput(text, lang="en", ner=False, parser=False):
    """Runs the Spacy NLP pipeline and delivers the results.

    Parameters
    ----------
    text: string
        The complete, raw text.
    lang: string, optional en
        The language to be used; its model should be installed; see
        `tf.tools.myspacy` for how to get language models.
    ner: boolean, optional False
        Whether to include named entities in the output.
    parser: boolean, optional False
        Whether to run the NLP parser.

    Returns
    -------
    tuple
        `tokens`: the token list as tuples
        `sentences`: the sentence list as tuples
        `entities`: the entity list as tuples, only if `ner=True`

        Tokens are tuples (start, end, text, after).

        Sentences are tuples (start, end, text).

        Entities are tuples (start, end, text, kind).
    """
    S = Spacy(lang=lang, parser=parser)
    S.read(text)

    tokens = S.getTokens()
    sentences = S.getSentences()
    entities = S.getEntities() if ner else None

    return tuple(x for x in (tokens, sentences, entities) if x is not None)

Classes

class Spacy (lang=None, parser=False)

Sets up an NLP (Natural Language Processing) pipeline.

The pipeline is tied to a particular language model, which you can pass as a parameter, provided you have installed it.

For now, we use Spacy in a fairly trivial way: only tokenization and sentence detection. We do not need the parser for this.

Parameters

lang : string, optional xx

The language to be used; Spacy may need to download it, if so, it will happen automatically.

If the language is not supported by Spacy, we switch to the multilanguage called xx.

See LANG_MODELS about the language models that Spacy supports.

Expand source code Browse git
class Spacy:
    def __init__(self, lang=None, parser=False):
        """Sets up an NLP (Natural Language Processing) pipeline.

        The pipeline is tied to a particular language model, which you can pass
        as a parameter, provided you have installed it.

        For now, we use Spacy in a fairly trivial way: only tokenization and sentence
        detection.
        We do not need the parser for this.

        Parameters
        ----------
        lang: string, optional xx
            The language to be used; Spacy may need to download it, if so, it will
            happen automatically.

            If the language is not supported by Spacy, we switch to the multilanguage
            called `xx`.

            See `tf.tools.myspacy.LANG_MODELS` about the language models that Spacy supports.
        """
        langModels = {}
        languages = {}

        for spec in LANG_MODELS:
            (lng, model, language) = spec.split(maxsplit=2)
            langModels[lng] = f"{lng}_{model}_sm"
            languages[lng] = language

        self.langModels = langModels
        self.languages = languages

        prevLang = None
        targetLang = lang
        loaded = False

        i = 0
        while True:
            i += 1
            targetModel = langModels.get(targetLang, None)
            targetLanguage = languages.get(targetLang, None)

            if targetModel is None:
                (prevLang, targetLang) = (targetLang, "xx")
                targetModel = langModels[targetLang]
                targetLanguage = languages[targetLang]

                if prevLang is None:
                    console("No language specified")
                else:
                    console(
                        f"No language model for {prevLang} supported by Spacy.\n"
                    )
                console(
                    f"Switching to the {targetLanguage} model"
                )
                if targetLang == prevLang:
                    break
                else:
                    continue

            try:
                nlp = spacy.load(targetModel)
                loaded = True
                break

            except Exception:
                console(f"Language model {targetModel} not installed. Downloading ...")

            try:
                console(f"Downloading {targetModel} ...")
                download(targetModel)
            except Exception:
                console(f"Could not download {targetModel} ...")
                (prevLang, targetLang) = (targetLang, "xx")
                if targetLang == prevLang:
                    break
                else:
                    continue

        console(f"NLP with language model {targetLang} {parser=}")

        if loaded:
            try:
                if not parser:
                    nlp.disable_pipe("parser")
                nlp.disable_pipe("sentencizer")
            except Exception:
                pass
            try:
                nlp.enable_pipe("senter")
                self.canSentence = True
            except Exception:
                self.canSentence = False
                console("This language does not support sentence boundary detection")

            if parser:
                try:
                    nlp.enable_pipe("tagger")
                    self.canTag = True
                    console("This language supports tagging")
                except Exception:
                    self.canTag = False
                    console("This language does not supports tagging")
                try:
                    nlp.enable_pipe("morphologizer")
                    self.canMorph = True
                    console("This language supports morphologizing")
                except Exception:
                    self.canMorph = False
                    console("This language does not supports morphologizing")
                try:
                    nlp.enable_pipe("lemmatizer")
                    self.canLemma = True
                    console("This language supports lemmatizing")
                except Exception:
                    self.canLemma = False
                    console("This language does not supports lemmatizing")
        else:
            console("Cannot load (language data) to get Spacy working")
            nlp = None

        self.nlp = nlp
        self.doc = None

    def read(self, text):
        """Process a plain text.

        A text is ingested and tokenized. Sentences are detected.
        This may require quite some processing time, think of 30 seconds for 200,000
        words on a decent laptop.

        Parameters
        ----------
        text: string
            The complete, raw text.
        """
        nText = len(text)
        nlp = self.nlp

        if nlp is None:
            console("The NLP pipeline is not functioning")
            return

        nlp.max_length = nText
        doc = nlp(text)
        self.doc = doc

    def getTokens(self):
        """Get the resulting tokens.

        A token is represented as a tuple consisting of

        *   *start*: first character position that the token occupies in the text.
            Character positions start at 0.
        *   *end*: last character position that the token occupies in the text
            *plus one*.
        *   *text*: text of the token, **excluding any trailing whitespace**.
        *   *space*: any white space behind the token, if present, otherwise
            the empty string.


        !!! note "End position and space"
            If there is a space behind the token, it will not add to the end position
            of the token. So the start and end positions of the tokens reflect
            where the tokens themselves are, and spaces do nto belong to the tokens.

        Returns
        -------
        list
            All tokens as tuples.
        """
        doc = self.doc
        if doc is None:
            console("No results available from the NLP pipeline")
            return []

        canTag = self.canTag
        canMorph = self.canMorph
        canLemma = self.canLemma

        result = []

        for token in doc:
            start = token.idx
            text = token.text
            space = token.whitespace_
            end = start + len(text)

            pos = token.pos_ if canMorph else token.tag_ if canTag else None
            morph = str(token.morph) if canMorph else None
            lemma = token.lemma_.strip().lower() if canLemma else None
            result.append((start, end, text, space, pos, morph, lemma))

        return result

    def getSentences(self):
        """Get the resulting sentences.

        A sentence is represented as a tuple consisting of

        *   *start*: first character position that the sentence occupies in the text.
            Character positions start at 0.
        *   *end*: last character position that the sentence occupies in the text
            *plus one*.
        *   *text*: text of the sentence.

        Returns
        -------
        list
            All sentences as tuples.
        """
        doc = self.doc
        if doc is None:
            console("No results available from the NLP pipeline")
            return []

        if not self.canSentence:
            console("No sentence results available from the NLP pipeline")
            return []

        result = []

        whiteRe = re.compile(r"^[.?!\s]*$", re.S)
        spuriousNlBefore = re.compile(r"\n+(\W)")
        spuriousNlAfter = re.compile(r"(\W)\n+")

        for s in doc.sents:
            text = s.text.strip("\n")
            if whiteRe.match(text):
                continue

            tokenStart = doc[s.start]
            tokenEnd = doc[s.end - 1]
            sentStart = tokenStart.idx
            sentEnd = tokenEnd.idx + len(tokenEnd.text)

            text = spuriousNlBefore.sub(r"\1", text)
            text = spuriousNlAfter.sub(r"\1", text)
            result.append((sentStart, sentEnd, text))

        return result

    def getEntities(self):
        """Get the resulting named entities.

        A named entity is represented as a tuple consisting of

        *   *start*: first character position that the entity occupies in the text.
            Character positions start at 0.
        *   *end*: last character position that the entity occupies in the text
            *plus one*.
        *   *text*: text of the entity.
        *   *kind*: kind of the entity.

        Returns
        -------
        list
            All entities as tuples.
        """
        doc = self.doc
        if doc is None:
            console("No results available from the NLP pipeline")
            return []

        if not hasattr(doc, "ents"):
            console("No entity results available from the NLP pipeline")
            return []

        result = []

        for ent in doc.ents:
            start = ent.start_char
            end = ent.end_char
            text = ent.text
            kind = ent.label_
            result.append((start, end, text, kind))

        return result

Methods

def getEntities(self)

Get the resulting named entities.

A named entity is represented as a tuple consisting of

  • start: first character position that the entity occupies in the text. Character positions start at 0.
  • end: last character position that the entity occupies in the text plus one.
  • text: text of the entity.
  • kind: kind of the entity.

Returns

list
All entities as tuples.
Expand source code Browse git
def getEntities(self):
    """Get the resulting named entities.

    A named entity is represented as a tuple consisting of

    *   *start*: first character position that the entity occupies in the text.
        Character positions start at 0.
    *   *end*: last character position that the entity occupies in the text
        *plus one*.
    *   *text*: text of the entity.
    *   *kind*: kind of the entity.

    Returns
    -------
    list
        All entities as tuples.
    """
    doc = self.doc
    if doc is None:
        console("No results available from the NLP pipeline")
        return []

    if not hasattr(doc, "ents"):
        console("No entity results available from the NLP pipeline")
        return []

    result = []

    for ent in doc.ents:
        start = ent.start_char
        end = ent.end_char
        text = ent.text
        kind = ent.label_
        result.append((start, end, text, kind))

    return result
def getSentences(self)

Get the resulting sentences.

A sentence is represented as a tuple consisting of

  • start: first character position that the sentence occupies in the text. Character positions start at 0.
  • end: last character position that the sentence occupies in the text plus one.
  • text: text of the sentence.

Returns

list
All sentences as tuples.
Expand source code Browse git
def getSentences(self):
    """Get the resulting sentences.

    A sentence is represented as a tuple consisting of

    *   *start*: first character position that the sentence occupies in the text.
        Character positions start at 0.
    *   *end*: last character position that the sentence occupies in the text
        *plus one*.
    *   *text*: text of the sentence.

    Returns
    -------
    list
        All sentences as tuples.
    """
    doc = self.doc
    if doc is None:
        console("No results available from the NLP pipeline")
        return []

    if not self.canSentence:
        console("No sentence results available from the NLP pipeline")
        return []

    result = []

    whiteRe = re.compile(r"^[.?!\s]*$", re.S)
    spuriousNlBefore = re.compile(r"\n+(\W)")
    spuriousNlAfter = re.compile(r"(\W)\n+")

    for s in doc.sents:
        text = s.text.strip("\n")
        if whiteRe.match(text):
            continue

        tokenStart = doc[s.start]
        tokenEnd = doc[s.end - 1]
        sentStart = tokenStart.idx
        sentEnd = tokenEnd.idx + len(tokenEnd.text)

        text = spuriousNlBefore.sub(r"\1", text)
        text = spuriousNlAfter.sub(r"\1", text)
        result.append((sentStart, sentEnd, text))

    return result
def getTokens(self)

Get the resulting tokens.

A token is represented as a tuple consisting of

  • start: first character position that the token occupies in the text. Character positions start at 0.
  • end: last character position that the token occupies in the text plus one.
  • text: text of the token, excluding any trailing whitespace.
  • space: any white space behind the token, if present, otherwise the empty string.

End position and space

If there is a space behind the token, it will not add to the end position of the token. So the start and end positions of the tokens reflect where the tokens themselves are, and spaces do nto belong to the tokens.

Returns

list
All tokens as tuples.
Expand source code Browse git
def getTokens(self):
    """Get the resulting tokens.

    A token is represented as a tuple consisting of

    *   *start*: first character position that the token occupies in the text.
        Character positions start at 0.
    *   *end*: last character position that the token occupies in the text
        *plus one*.
    *   *text*: text of the token, **excluding any trailing whitespace**.
    *   *space*: any white space behind the token, if present, otherwise
        the empty string.


    !!! note "End position and space"
        If there is a space behind the token, it will not add to the end position
        of the token. So the start and end positions of the tokens reflect
        where the tokens themselves are, and spaces do nto belong to the tokens.

    Returns
    -------
    list
        All tokens as tuples.
    """
    doc = self.doc
    if doc is None:
        console("No results available from the NLP pipeline")
        return []

    canTag = self.canTag
    canMorph = self.canMorph
    canLemma = self.canLemma

    result = []

    for token in doc:
        start = token.idx
        text = token.text
        space = token.whitespace_
        end = start + len(text)

        pos = token.pos_ if canMorph else token.tag_ if canTag else None
        morph = str(token.morph) if canMorph else None
        lemma = token.lemma_.strip().lower() if canLemma else None
        result.append((start, end, text, space, pos, morph, lemma))

    return result
def read(self, text)

Process a plain text.

A text is ingested and tokenized. Sentences are detected. This may require quite some processing time, think of 30 seconds for 200,000 words on a decent laptop.

Parameters

text : string
The complete, raw text.
Expand source code Browse git
def read(self, text):
    """Process a plain text.

    A text is ingested and tokenized. Sentences are detected.
    This may require quite some processing time, think of 30 seconds for 200,000
    words on a decent laptop.

    Parameters
    ----------
    text: string
        The complete, raw text.
    """
    nText = len(text)
    nlp = self.nlp

    if nlp is None:
        console("The NLP pipeline is not functioning")
        return

    nlp.max_length = nText
    doc = nlp(text)
    self.doc = doc