Module tf.writing.transcription
Transcription
Text-Fabric has support for several writing systems, by means of transcription tables and fonts that will be invoked when displaying the main text.
It also calls functions to use these tables for converting Hebrew and Syriac text material to transliterated representations and back.
There is also a phonetic transcription for Hebrew, designed in phono.ipynb
Character tables and fonts
hbo
Hebrew
tf.writing.hebrew
: full list of characters covered by the ETCBC and phonetic transcriptions
Font Ezra SIL
.
syc
Syriac
tf.writing.syriac
: full list of characters covered by the ETCBC transcriptions
Font Estrangelo Edessa
.
ara
Arabic
tf.writing.arabic
: full list of characters covered by the transcription used for the Quran
Font AmiriQuran
.
grc
Greek
Font Gentium
.
akk
Akkadian
Font Santakku
.
cld
Neo Aramaic
Font CharisSIL-R
.
Expand source code Browse git
"""
# Transcription
Text-Fabric has support for several writing systems, by means of
transcription tables and fonts that will be invoked when displaying the main text.
It also calls functions to use these tables for converting Hebrew and Syriac
text material to transliterated representations and back.
There is also a phonetic transcription for Hebrew, designed in
[phono.ipynb](https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb)
## Character tables and fonts
### `hbo` Hebrew
`tf.writing.hebrew`: full list of characters covered by the ETCBC and phonetic transcriptions
Font `Ezra SIL`.
### `syc` Syriac
`tf.writing.syriac`: full list of characters covered by the ETCBC transcriptions
Font `Estrangelo Edessa`.
### `ara` Arabic
`tf.writing.arabic`: full list of characters covered by the transcription used for the Quran
Font `AmiriQuran`.
### `grc` Greek
Font `Gentium`.
### `akk` Akkadian
Font `Santakku`.
### `cld` Neo Aramaic
Font `CharisSIL-R`.
"""
import re
class Transcription:
"""Conversion between unicode and various transcriptions.
Usage notes:
Invoke the transcription functionality as follows:
from tf.writing.transcription import Transcription
Some of the attributes and methods below are *class* attributes,
others are instance attributes.
A class attribute `aaa` can be retrieved by saying
Transcription.aaa
To retrieve an instance attribute, you need an instance first, like
tr = Transcription()
and then you can say `tr.aaa`.
"""
decomp = {
"\u05E9\u05C1": "\uFB2A",
"\u05E9\u05C2": "\uFB2B",
}
hebrew_mapping = {
"_": " ", # space inside word
"92": "\u0591", # etnahta = atnach
"01": "\u0592", # segolta
"65": "\u0593", # shalshelet
"80": "\u0594", # zaqef_qatan
"85": "\u0595", # zaqef_gadol
"73": "\u0596", # tipeha = tifcha
"81": "\u0597", # revia = rebia
"82": "\u0598", # zarqa = tsinorit = zinorit = sinnorit
"03": "\u0599", # pashta
"10": "\u059A", # yetiv = yetib
"91": "\u059B", # tevir = tebir
"61": "\u059C", # geresh
"11": "\u059D", # geresh muqdam = mugrash
"62": "\u059E", # gershayim = garshayim
"84": "\u059F", # qarney para = pazer_gadol
"14": "\u05A0", # telisha_gedola
"44": "\u05A0", # telisha_gedola = telisha_gedola_med
"83": "\u05A1", # pazer
"74": "\u05A3", # munah = munach
"70": "\u05A4", # mahapakh = mehuppach
"71": "\u05A5", # merkha = merecha
"72": "\u05A6", # merkha kefula = merecha_kepula
"94": "\u05A7", # darga
"63": "\u05A8", # qadma = azla
"33": "\u05A8", # pashta_med < qadma
"04": "\u05A9", # telisha_qetana
"24": "\u05A9", # telisha_qetana = telisha_qetana_med
"93": "\u05AA", # yera ben yomo = yerach
"60": "\u05AB", # ole = ole_weyored
"64": "\u05AC", # iluy = illuy
"13": "\u05AD", # dehi = dechi
"02": "\u05AE", # zinor = sinnor
"*": "\u05AF", # masora circle
":": "\u05B0", # sheva = shewa
":E": "\u05B1", # hataf segol = chataph_segol
":A": "\u05B2", # hataf patah = chataph_patach
":@": "\u05B3", # hataf qamats = chataph_qamats
"I": "\u05B4", # hiriq = chiriq
";": "\u05B5", # tsere
"E": "\u05B6", # segol
"A": "\u05B7", # patach
"@": "\u05B8", # qamats
"O": "\u05B9", # holam = cholam
"U": "\u05BB", # qubuts = qubbuts
".": "\u05BC", # dagesh
"25": "\u05BD", # silluq yamin
"45": "\u05BD", # meteg
"35": "\u05BD", # meteg (tikon)
"75": "\u05BD", # siluq = silluq
"95": "\u05BD", # meteg = meteg_yamin
"&": "\u05BE", # maqaf
",": "\u05BF", # rafe = raphe
"05": "\u05C0", # paseq
".c": "\u05C1", # shin dot
".f": "\u05C2", # sin dot
"00": "\u05C3", # sof_pasuq
"52": "\u05C4", # upper dot = puncta_above
"53": "\u05C5", # lower dot = puncta_below
"ñ": "\u05C6\u0307", # nun hafukha
"Ñ": "\u05C6\u0307", # nun hafukha
">": "\u05D0", # alef
"B": "\u05D1", # bet
"G": "\u05D2", # gimel
"D": "\u05D3", # dalet
"H": "\u05D4", # he
"W": "\u05D5", # vav
"Z": "\u05D6", # zayin
"X": "\u05D7", # het
"V": "\u05D8", # tet
"J": "\u05D9", # yod
"k": "\u05DA", # kaf final
"K": "\u05DB", # kaf
"L": "\u05DC", # lamed
"m": "\u05DD", # mem final
"M": "\u05DE", # mem
"n": "\u05DF", # nun final
"N": "\u05E0", # nun
"S": "\u05E1", # samekh
"<": "\u05E2", # ayin
"p": "\u05E3", # pe final
"P": "\u05E4", # pe
"y": "\u05E5", # tsadi final
"Y": "\u05E6", # tsadi
"Q": "\u05E7", # qof
"R": "\u05E8", # resh
"#": "\u05E9", # sin unpointed
"T": "\u05EA", # tav
"C": "\uFB2A", # shin pointed
"F": "\uFB2B", # sin pointed
"55": "<UNMAPPED 55=large letter>", # large_letter
"56": "<UNMAPPED 56=small letter>", # small_letter
"57": "<UNMAPPED 57=suspended letter>", # suspended_letter
"-": "", # suppress space afterward
"'": "\u05f3", # punctuation geresh
'"': "\u05f4", # punctuation gershayim
}
"""
Maps all ETCBC transliteration character combinations for Hebrew to Unicode.
Example: sof-pasuq:
Transcription.hebrew_mapping['00']
Output:
׃
"""
hebrew_cons = ">BGDHWZXVJKLMNS<PYQRFCT"
trans_final_pat = re.compile(
r"(["
+ hebrew_cons
+ r"][^_&]*)([KMNPY])([^"
+ hebrew_cons
+ r"_&]*(:?[_&]|\Z))"
)
trans_hebrew_pat = re.compile(r"(:[AE@]|.[cf]|:|[0-9][0-9]|.)")
swap_accent_pat = re.compile(
r"(\A|[_&])([0-9][0-9])([" + hebrew_cons + r"])([:;,.EAIOU@]*)"
)
remove_accent_pat = re.compile(r"((?:[0-9][0-9])|[,*])")
remove_point_pat = re.compile(
r"((?:[0-9][0-9])|(?:\.[cf])|(?::[@AE])|[,.:;@AEIOU*])"
)
remove_psn_pat = re.compile(r"00[ _SPNÑñ]*")
remove_psq_pat = re.compile(r"(?:[ _]+05[ _]*)|(?:05[ _]+)")
shin_pat = re.compile(r"[CF]")
ph_simple_pat = re.compile(r"([ˈˌᵊᵃᵒᵉāo*])")
noorigspace = re.compile(
r"""
(?: [&-]\Z) # space, maqef or nospace
| (?:
0[05] # sof pasuq or paseq
(?:_[SNP])* # nun hafukha, setumah, petuhah at end of verse
\Z
)
| (?:_[SPN])+ # nun hafukha, setumah, petuhah between words
""",
re.X,
)
syriac_mapping_simple = {
">": "\u0710", # alaph
"B": "\u0712", # beth
"G": "\u0713", # gamal
"D": "\u0715", # dalat
"H": "\u0717", # he
"W": "\u0718", # waw
"Z": "\u0719", # zain
"X": "\u071A", # heth
"V": "\u071B", # teth
"J": "\u071D", # yudh
"K": "\u071F", # kaph
"L": "\u0720", # lamadh
"M": "\u0721", # mim
"N": "\u0722", # nun
"S": "\u0723", # semkath
"<": "\u0725", # e
"P": "\u0726", # pe
"Y": "\u0728", # sadhe
"Q": "\u0729", # qaph
"R": "\u072A", # rish
"C": "\u072B", # shin
"T": "\u072C", # taw
"s": "\u0724", # semkath final
"p": "\u0727", # pe reversed
}
syriac_mapping_pil = {
# LETTERS
"'": "\u0710", # alaph
"b": "\u0712", # beth
"g": "\u0713", # gamal
"d": "\u0715", # dalat
"h": "\u0717", # he
"w": "\u0718", # waw
"z": "\u0719", # zain
"H": "\u071A", # heth
"T": "\u071B", # teth
"y": "\u071D", # yod
"k": "\u071F", # kaf
"l": "\u0720", # lamad
"m": "\u0721", # mim
"n": "\u0722", # nun
"s": "\u0723", # semkath
"`": "\u0725", # 'e
"p": "\u0726", # pe
"S": "\u0728", # tsade
"q": "\u0729", # qof
"r": "\u072A", # resh
"$": "\u072B", # shin
"t": "\u072C", # taw
# WORD-BOUND DIACRITICS
'"': "\u0308", # seyame
"#": "\u0323", # diacritical dot below
"^": "\u0307", # diacritical dot above
"~": "\u0307", # abbreviation mark
# NON-VOCALIC LETTER-BOUND DIACRITICS
"#,": "\u0742", # rukkakha
'#"': "\u0342", # unclear (COMBINING DIAERESIS BELOW)
"#!": "\u0744", # unclear (SYRIAC TWO VERTICAL DOTS BELOW)
"#_": "\u0331", # linea occultans infera
"^,": "\u0741", # qushshaya
"^!": "\u0743", # unclear (SYRIAC TWO VERTICAL DOTS ABOVE)
"^_": "\u0304", # linea occultans supera
# VOCALIC LETTER-BOUND DIACRITICS
":": "", # shewa
"A": "\u0733", # qamets
"A1": "\u0734", # zeqapa
"A2": "\u0735", # zeqofo
"E": "\u0739", # tsere, revasa karya
"O": "\u073F", # holem, rewaha
"a": "\u0730", # patah
"a1": "\u0731", # petaha
"a2": "\u0732", # petoho
"e": "\u0736", # segol
"e1": "\u0737", # revasa arrika
"e2": "\u0738", # revoso
"i": "\u073A", # hireq
"i1": "\u073B", # hevoso
"y#": "\u071D\u073C", # hevasa
"u": "\u073D", # qubbuts
"u1": "\u073E", # esoso
"w#": "\u0718\u073C", # esasa allisa
"w^": "\u0718\u073F", # esasa rewiha
# INTERPUNCTION
"#.": "\u0702", # menachta, meshalyana (ES), metdamrana, samka
"#:": "\u0704", # metkashpana (ES)
"#\\": "\u0709", # tahtaya, metkashpana (WS), meshalyana (WS)
"=.": "\u002E", # pasuqa
"=/": "\u0707", # elaya
"=:": "\u003A", # shewaya (WS), zauga (ES)
"=\\": "\u0706", # unclear (SYRIAC COLON SKEWED LEFT)
"^.": "\u0701", # paquda, metkashpana (ES), meshalyana (ES), etsyana, meshalana?
'^"': "\u0705", # rahta
"^:": "\u0703", # taksa (WS), zauga elaya (ES)
"^\\": "\u0708", # unclear (SYRIAC SUPRALINEAR COLON SKEWED LEFT)
# PERICOPE MARKERS
"*": "\u0700", # rosette
".": "\u00B7", # common dot in caesuras
"@": "\u2722", # vignette
"_": "\u2014", # dash in caesuras
"o": "\u2022", # large dot in caesuras
}
syriac_mapping = { # this is WIT
# LETTERS
">": "\u0710", # alaph
"B": "\u0712", # beth
"G": "\u0713", # gamal
"D": "\u0715", # dalat
"H": "\u0717", # he
"W": "\u0718", # waw
"Z": "\u0719", # zain
"X": "\u071A", # heth
"V": "\u071B", # teth
"J": "\u071D", # yod
"K": "\u071F", # kaf
"L": "\u0720", # lamad
"M": "\u0721", # mim
"N": "\u0722", # nun
"S": "\u0723", # semkath
"<": "\u0725", # 'e
"P": "\u0726", # pe
"Y": "\u0728", # tsade
"Q": "\u0729", # qof
"R": "\u072A", # resh
"C": "\u072B", # shin
"T": "\u072C", # taw
# WORD-BOUND DIACRITICS
'"': "\u0308", # seyame
"#": "\u0323", # diacritical dot below
"^": "\u0307", # diacritical dot above
# NON-VOCALIC LETTER-BOUND DIACRITICS
"^!": "\u0743", # unclear (SYRIAC TWO VERTICAL DOTS ABOVE)
# VOCALIC LETTER-BOUND DIACRITICS
":": "", # shewa
"A": "\u0733", # qamets
"A1": "\u0734", # zeqapa
"A2": "\u0735", # zeqofo
"O": "\u073F", # holem, rewaha
"@": "\u0730", # patah
"@1": "\u0731", # petaha
"@2": "\u0732", # petoho
"E": "\u0736", # segol
"E1": "\u0737", # revasa arrika
"E2": "\u0738", # revoso
"I": "\u073A", # hireq
"I1": "\u073B", # hevoso
"U": "\u073D", # qubbuts
"U1": "\u073E", # esoso
# INTERPUNCTION
"#\\": "\u0709", # tahtaya, metkashpana (WS), meshalyana (WS)
"=.": "\u002E", # pasuqa
"=#": "\u0707", # elaya
"=:": "\u003A", # shewaya (WS), zauga (ES)
"=^": "\u0706", # unclear (SYRIAC COLON SKEWED LEFT)
"=/": "\u0707", # elaya
"=\\": "\u0706", # unclear (SYRIAC COLON SKEWED LEFT)
"^:": "\u0703", # taksa (WS), zauga elaya (ES)
"^\\": "\u0708", # unclear (SYRIAC SUPRALINEAR COLON SKEWED LEFT)
# PERICOPE MARKERS
"*": "\u0700", # rosette
".": "\u00B7", # common dot in caesuras
"_": "\u2014", # dash in caesuras
"o": "\u2022", # large dot in caesuras
}
"""
Maps all ETCBC transliteration character combinations for Syriac to Unicode.
Example: semkath-final:
Transcription.syriac_mapping['s']
Output:
ܤ
"""
trans_syriac_pat = re.compile(r"([AE@IU][12]?|=[.#:\^/\\]|[\^#][!:\\]|.)")
arabic_mapping = {
" ": "\u0020", # SPACE
"'": "\u0621", # ARABIC LETTER HAMZA
">": "\u0623", # ARABIC LETTER ALEF WITH HAMZA ABOVE
"&": "\u0624", # ARABIC LETTER WAW WITH HAMZA ABOVE
"<": "\u0625", # ARABIC LETTER ALEF WITH HAMZA BELOW
"}": "\u0626", # ARABIC LETTER YEH WITH HAMZA ABOVE
"A": "\u0627", # ARABIC LETTER ALEF
"b": "\u0628", # ARABIC LETTER BEH
"p": "\u0629", # ARABIC LETTER TEH MARBUTA
"t": "\u062a", # ARABIC LETTER TEH
"v": "\u062b", # ARABIC LETTER THEH
"j": "\u062c", # ARABIC LETTER JEEM
"H": "\u062d", # ARABIC LETTER HAH
"x": "\u062e", # ARABIC LETTER KHAH
"d": "\u062f", # ARABIC LETTER DAL
"*": "\u0630", # ARABIC LETTER THAL
"r": "\u0631", # ARABIC LETTER REH
"z": "\u0632", # ARABIC LETTER ZAIN
"s": "\u0633", # ARABIC LETTER SEEN
"$": "\u0634", # ARABIC LETTER SHEEN
"S": "\u0635", # ARABIC LETTER SAD
"D": "\u0636", # ARABIC LETTER DAD
"T": "\u0637", # ARABIC LETTER TAH
"Z": "\u0638", # ARABIC LETTER ZAH
"E": "\u0639", # ARABIC LETTER AIN
"g": "\u063a", # ARABIC LETTER GHAIN
"_": "\u0640", # ARABIC TATWEEL
"f": "\u0641", # ARABIC LETTER FEH
"q": "\u0642", # ARABIC LETTER QAF
"k": "\u0643", # ARABIC LETTER KAF
"l": "\u0644", # ARABIC LETTER LAM
"m": "\u0645", # ARABIC LETTER MEEM
"n": "\u0646", # ARABIC LETTER NOON
"h": "\u0647", # ARABIC LETTER HEH
"w": "\u0648", # ARABIC LETTER WAW
"Y": "\u0649", # ARABIC LETTER ALEF MAKSURA
"y": "\u064a", # ARABIC LETTER YEH
"F": "\u064b", # ARABIC FATHATAN
"N": "\u064c", # ARABIC DAMMATAN
"K": "\u064d", # ARABIC KASRATAN
"a": "\u064e", # ARABIC FATHA
"u": "\u064f", # ARABIC DAMMA
"i": "\u0650", # ARABIC KASRA
"~": "\u0651", # ARABIC SHADDA
"o": "\u0652", # ARABIC SUKUN
"^": "\u0653", # ARABIC MADDAH ABOVE
"#": "\u0654", # ARABIC HAMZA ABOVE
"`": "\u0670", # ARABIC LETTER SUPERSCRIPT ALEF
"{": "\u0671", # ARABIC LETTER ALEF WASLA
":": "\u06dc", # ARABIC SMALL HIGH SEEN
"@": "\u06df", # ARABIC SMALL HIGH ROUNDED ZERO
'"': "\u06e0", # ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO
"[": "\u06e2", # ARABIC SMALL HIGH MEEM ISOLATED FORM
";": "\u06e3", # ARABIC SMALL LOW SEEN
",": "\u06e5", # ARABIC SMALL WAW
".": "\u06e6", # ARABIC SMALL YEH
"!": "\u06e8", # ARABIC SMALL HIGH NOON
"-": "\u06ea", # ARABIC EMPTY CENTRE LOW STOP
"+": "\u06eb", # ARABIC EMPTY CENTRE HIGH STOP
"%": "\u06ec", # ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE
"]": "\u06ed", # ARABIC SMALL LOW MEEM
}
"""
Maps an Arabic transliteration character to Unicode.
This is the mapping used in the Quran representation on tanzil.net.
Example: beh
Transcription.syriac_mapping['b']
Output:
ب
Maps an Arabic letter in unicode to its transliteration
Example: beh transliteration
Transcription.syriac_mapping['ب']
Output:
b
"""
arabic_mappingi = {v: k for (k, v) in arabic_mapping.items()}
arabicTrans = {
"\u0020": (" ", " ", " ", " "), # SPACE
"\u060c": ("‚", ",", ",", ","), # ARABIC COMMA
"\u061b": ("„", ";", ";", ";"), # ARABIC SEMICOLON
"\u061f": ("?", "?", "?", "?"), # ARABIC QUESTION MARK
"\u0621": ("'", "'", "ʾ", "'"), # ARABIC LETTER HAMZA
"\u0622": ("»", "_a", "ā", "ā"), # ARABIC LETTER ALEF WITH MADDA ABOVE
"\u0623": (">", "_a", "ā", "ā"), # ARABIC LETTER ALEF WITH HAMZA ABOVE
"\u0624": ("&", "'", "ʾ", "'"), # ARABIC LETTER WAW WITH HAMZA ABOVE
"\u0625": ("<", "_a", "ā", "ā"), # ARABIC LETTER ALEF WITH HAMZA BELOW
"\u0626": ("}", "'", "ʾ", "y"), # ARABIC LETTER YEH WITH HAMZA ABOVE
"\u0627": ("A", "_a", "ā", "ā"), # ARABIC LETTER ALEF
"\u0628": ("b", "b", "b", "b"), # ARABIC LETTER BEH
"\u0629": ("p", "=t", "ŧ", "t"), # ARABIC LETTER TEH MARBUTA
"\u062a": ("t", "t", "t", "t"), # ARABIC LETTER TEH
"\u062b": ("v", "_t", "ṯ", "th"), # ARABIC LETTER THEH
"\u062c": ("j", "j", "ǧ", "j"), # ARABIC LETTER JEEM
"\u062d": ("H", "*h", "ḥ", "ḥ"), # ARABIC LETTER HAH
"\u062e": ("x", "_h", "ḫ", "kh"), # ARABIC LETTER KHAH
"\u062f": ("d", "d", "d", "d"), # ARABIC LETTER DAL
"\u0630": ("*", "_d", "ḏ", "dh"), # ARABIC LETTER THAL
"\u0631": ("r", "r", "r", "r"), # ARABIC LETTER REH
"\u0632": ("z", "z", "z", "z"), # ARABIC LETTER ZAIN
"\u0633": ("s", "s", "s", "s"), # ARABIC LETTER SEEN
"\u0634": ("$", "^s", "š", "sh"), # ARABIC LETTER SHEEN
"\u0635": ("S", "*s", "ṣ", "ṣ"), # ARABIC LETTER SAD
"\u0636": ("D", "*d", "ḍ", "ḍ"), # ARABIC LETTER DAD
"\u0637": ("T", "*t", "ṭ", "ṭ"), # ARABIC LETTER TAH
"\u0638": ("Z", "*z", "ẓ", "ẓ"), # ARABIC LETTER ZAH
"\u0639": ("E", "`", "ʿ", "`"), # ARABIC LETTER AIN
"\u063a": ("g", "*g", "ġ", "gh"), # ARABIC LETTER GHAIN
"\u0640": ("_", "", "", ""), # ARABIC TATWEEL
"\u0641": ("f", "f", "f", "f"), # ARABIC LETTER FEH
"\u0642": ("q", "*k", "ḳ", "q"), # ARABIC LETTER QAF
"\u0643": ("k", "k", "k", "k"), # ARABIC LETTER KAF
"\u0644": ("l", "l", "l", "l"), # ARABIC LETTER LAM
"\u0645": ("m", "m", "m", "m"), # ARABIC LETTER MEEM
"\u0646": ("n", "n", "n", "n"), # ARABIC LETTER NOON
"\u0647": ("h", "h", "h", "h"), # ARABIC LETTER HEH
"\u0648": ("w", "w", "w", "w"), # ARABIC LETTER WAW
"\u0649": ("Y", "/a", "á", "ā"), # ARABIC LETTER ALEF MAKSURA
"\u064a": ("y", "y", "y", "y"), # ARABIC LETTER YEH
"\u064b": ("F", "a*n", "aⁿ", "an"), # ARABIC FATHATAN
"\u064c": ("N", "u*n", "uⁿ", "un"), # ARABIC DAMMATAN
"\u064d": ("K", "i*n", "iⁿ", "in"), # ARABIC KASRATAN
"\u064e": ("a", "a", "a", "a"), # ARABIC FATHA
"\u064f": ("u", "u", "u", "u"), # ARABIC DAMMA
"\u0650": ("i", "i", "i", "i"), # ARABIC KASRA
"\u0651": ("~", "u", "u", "ūw"), # ARABIC SHADDA
"\u0652": ("o", "a", "a", "a"), # ARABIC SUKUN
"\u0653": ("^", "_a", "ā", "ā"), # ARABIC MADDAH ABOVE
"\u0654": ("#", "'", "ʾ", "ā"), # ARABIC HAMZA ABOVE
"\u0655": ("=", "'", "ʾ", "ā"), # ARABIC HAMZA BELOW
"\u0660": ("0", "0", "0", "0"), # ARABIC INDIC DIGIT ZERO
"\u0661": ("1", "1", "1", "1"), # ARABIC INDIC DIGIT ONE
"\u0662": ("2", "2", "2", "2"), # ARABIC INDIC DIGIT TWO
"\u0663": ("3", "3", "3", "3"), # ARABIC INDIC DIGIT THREE
"\u0664": ("4", "4", "4", "4"), # ARABIC INDIC DIGIT FOUR
"\u0665": ("5", "5", "5", "5"), # ARABIC INDIC DIGIT FIVE
"\u0666": ("6", "6", "6", "6"), # ARABIC INDIC DIGIT SIX
"\u0667": ("7", "7", "7", "7"), # ARABIC INDIC DIGIT SEVEN
"\u0668": ("8", "8", "8", "8"), # ARABIC INDIC DIGIT EIGHT
"\u0669": ("9", "9", "9", "9"), # ARABIC INDIC DIGIT NINE
"\u0670": ("`", "~a", "ã", ""), # ARABIC LETTER SUPERSCRIPT ALEF
"\u0671": ("{", "a", "a", "a"), # ARABIC LETTER ALEF WASLA
"\u06af": ("G", "g", "g", "g"), # ARABIC LETTER GAF
"\u06cc": ("J", "#y", "Y", "y"), # ARABIC LETTER FARSI YEH
"\u06d6": ("SlY", "*sl/a", "ṣlá", "ṣla"), # ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
"\u06da": ("M", "#j", "J", "j"), # ARABIC SMALL HIGH JEEM
"\u06dc": (":", "#s", "S", "s"), # ARABIC SMALL HIGH SEEN
"\u06df": ("@", "0", "0", "0"), # ARABIC SMALL HIGH ROUNDED ZERO
"\u06e0": ('"', "0", "0", "0"), # ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO
"\u06e2": ("[", "#m", "M", "M"), # ARABIC SMALL HIGH MEEM ISOLATED FORM
"\u06e3": (";", "#s", "S", "S"), # ARABIC SMALL LOW SEEN
"\u06e5": (",", "#w", "W", "W"), # ARABIC SMALL WAW
"\u06e6": (".", "#y", "Y", "Y"), # ARABIC SMALL YEH
"\u06e8": ("!", "#n", "N", "N"), # ARABIC SMALL HIGH NOON
"\u06ea": ("-", ".", ".", "."), # ARABIC EMPTY CENTRE LOW STOP
"\u06eb": ("+", ".", ".", "."), # ARABIC EMPTY CENTRE HIGH STOP
"\u06ec": ("%", ".", ".", "."), # ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE
"\u06ed": ("]", "#m", "M", "M"), # ARABIC SMALL LOW MEEM
}
"""
More Arabic transcriptions:
* column 1: `custom` [Quran-tanzil](http://tanzil.net/#1:1), slightly extended
* column 2/: `ascii` resp. `latin` plus diacritics also known as betacode.
We use a list compiled by
[Peter Verkinderen](https://pverkind.github.io/betacodeTranscriber/js/betacode.js)
* column 4: `standard` (Library of Congress) (to-be filled).
We use the
[arabic romanization list of 2012](https://www.loc.gov/catdir/cpso/romanization/arabic.pdf)
We refrain of from applying rules that cannot be computed without
lexical/grammatical/dialectical knowledge of the arabic language.
"""
arabicTransQuran = {}
arabicTransAscii = {}
arabicTransLatin = {}
arabicTransStandard = {}
for (ara, (qur, asc, lat, std)) in arabicTrans.items():
arabicTransQuran[ara] = qur
arabicTransAscii[ara] = asc
arabicTransLatin[ara] = lat
arabicTransStandard[ara] = std
def quranFromArabic(word):
return "".join(Transcription.arabicTransQuran.get(x, x) for x in word)
def asciiFromArabic(word):
return "".join(Transcription.arabicTransAscii.get(x, x) for x in word)
def latinFromArabic(word):
return "".join(Transcription.arabicTransLatin.get(x, x) for x in word)
def standardFromArabic(word):
return "".join(Transcription.arabicTransStandard.get(x, x) for x in word)
def __init__(self):
self.hebrew_consonants = {
Transcription.hebrew_mapping[x] for x in Transcription.hebrew_cons
}
self.hebrew_consonants.add("\u05E9")
self.hebrew_mappingi = dict(
(v, k) for (k, v) in Transcription.hebrew_mapping.items() if k != ""
)
self.syriac_mappingi = dict(
(v, k) for (k, v) in Transcription.syriac_mapping.items() if k != ""
)
# special treatment needed for nun hafukha,
# since it is consists of two characters
self.hebrew_mappingi["\u05C6"] = "ñ"
self.hebrew_mappingi["\u0307"] = ""
self.syriac_punctuation_trans = (
"#\\",
"=.",
"=#",
"=:",
"=^",
"=/",
"=\\",
"^:",
"^\\",
)
self.syriac_punctuation_syc = tuple(
Transcription.syriac_mapping[c] for c in self.syriac_punctuation_trans
)
arabicMap = dict(qur={}, asc={}, lat={}, std={})
arabicMapI = dict(qur={}, asc={}, lat={}, std={})
for u in sorted(Transcription.arabicTrans):
(qur, asc, lat, std) = Transcription.arabicTrans[u]
for (k, v) in zip(arabicMap.keys(), Transcription.arabicTrans[u]):
arabicMap[k].setdefault(v, u)
arabicMapI[k][u] = v
self.arabicMap = arabicMap
self.arabicMapI = arabicMapI
def sycSplitPunc(self):
pass
def _comp(s):
for (d, c) in Transcription.decomp.items():
s = s.replace(d, c)
return s
def _decomp(s):
for (d, c) in Transcription.decomp.items():
s = s.replace(c, d)
return s
def suffix_and_finales(word):
"""
Given an ETCBC transliteration, split it into the word material
and the interword material that follows it (space, punctuation).
Replace the last consonant of the word material by its final form, if applicable.
Output a tuple with the modified word material and the interword material.
Example:
Transcription.suffix_and_finales('71T_H@>@95REY00')
Output:
('71T_H@>@95REy', '00\n')
Note that the `Y` has been replaced by `y`.
"""
# first split the word proper from the suffix,
# and add a space if there is no other suffix
add_space = ""
suffix = ""
new_word = word
if not word:
return (new_word, suffix + add_space)
lastch = new_word[-1]
if lastch == "-" or lastch == "&":
new_word = new_word[0:-1]
suffix = lastch
else:
if len(new_word) >= 2:
lastch = new_word[-1]
llastch = new_word[-2]
if llastch == "_" and (lastch == "P" or lastch == "S"):
new_word = new_word[0:-2]
suffix = " " + lastch + suffix + " "
if len(new_word) >= 2:
lastch = new_word[-1]
llastch = new_word[-2]
if llastch == "_" and (lastch == "N"):
new_word = new_word[0:-2]
suffix = " ñ" + suffix + " "
if len(new_word) >= 2:
lastch = new_word[-1]
llastch = new_word[-2]
if llastch == "0" and (lastch == "0" or lastch == "5"):
new_word = new_word[0:-2]
suffix = (" " if lastch == "5" else "") + llastch + lastch + suffix
add_space = "\n" if lastch == "0" else " "
if suffix == "":
add_space = " "
elif suffix == "-":
add_space = ""
suffix = ""
# second: replace consonants by their final forms when needed
new_word = Transcription.trans_final_pat.sub(Transcription._map_final, new_word)
return (new_word, suffix + add_space)
def _map_final(m):
return m.group(1) + m.group(2).lower() + m.group(3)
def _map_hebrew(m):
return Transcription.hebrew_mapping.get(m.group(1), m.group(1))
def _map_syriac(m):
return Transcription.syriac_mapping.get(m.group(1), m.group(1))
def _swap_accent(m):
return m.group(1) + m.group(3) + m.group(4) + m.group(2)
def _remove_accent(m):
return "00" if m.group(1) == "00" else "05" if m.group(1) == "05" else ""
def _remove_point(m):
return "00" if m.group(1) == "00" else "05" if m.group(1) == "05" else ""
def _ph_simple(m):
return "å" if m.group(1) in "āo" else ""
# unicode normalization is harmful
# if there is a combination of dagesh, vowel and accent.
def suppress_space(word):
"""
Given an ETCBC transliteration of a word,
match the end of the word for interpunction and spacing characters
(sof pasuq, paseq, nun hafukha, setumah, petuhah, space, no-space)
Example:
Transcription.suppress_space('B.:&')
Transcription.suppress_space('B.@R@74>')
Transcription.suppress_space('71T_H@>@95REY00')
Output:
<re.Match object; span=(3, 4), match='&'>
None
<re.Match object; span=(13, 15), match='00'>
"""
return Transcription.noorigspace.search(word)
def to_etcbc_v(word):
"""
Given an ETCBC transliteration of a fully pointed word,
strip all the non-vowel pointing (i.e. the accents).
Example:
Transcription.to_etcbc_v('HAC.@MA73JIm')
Output:
HAC.@MAJIm
"""
return Transcription.remove_accent_pat.sub(Transcription._remove_accent, word)
def to_etcbc_c(word):
"""
Given an ETCBC transliteration of a fully pointed word,
strip everything except the consonants.
Punctuation will also be stripped.
Example:
Transcription.to_etcbc_c('HAC.@MA73JIm')
Output:
H#MJM
Note that the pointed shin (`C`) is replaced by an unpointed one (`#`).
"""
word = Transcription.remove_point_pat.sub(Transcription._remove_point, word)
word = Transcription.remove_psn_pat.sub(
"00", word
) # remove nun hafukha, setumah, petuhah at the end of a verse
word = Transcription.remove_psq_pat.sub(
" ", word
) # replace paseq with attached spaces by single space
word = word.upper() # no final forms of consonants
return Transcription.shin_pat.sub("#", word)
def to_hebrew(word):
"""
Given a transliteration of a fully pointed word,
produce the word in Unicode Hebrew.
Care will be taken that vowel pointing will be added to consonants
before accent pointing.
Example:
Transcription.to_hebrew('HAC.@MA73JIm')
Output:
הַשָּׁמַ֖יִם
"""
word = Transcription.swap_accent_pat.sub(Transcription._swap_accent, word)
return Transcription.trans_hebrew_pat.sub(Transcription._map_hebrew, word)
def to_hebrew_v(word):
"""
Given a transliteration of a fully pointed word,
produce the word in Unicode Hebrew, but without the accents.
Example:
Transcription.to_hebrew_v('HAC.@MA73JIm')
Output:
הַשָּׁמַיִם
"""
return Transcription.trans_hebrew_pat.sub(
Transcription._map_hebrew, Transcription.to_etcbc_v(word)
)
def to_hebrew_c(word):
"""
Given a transliteration of a fully pointed word,
produce the word in Unicode Hebrew, but without the pointing.
Example:
Transcription.to_hebrew_c('HAC.@MA73JIm')
Output:
השמימ
Note that final consonant forms are not being used.
"""
return Transcription.trans_hebrew_pat.sub(
Transcription._map_hebrew, Transcription.to_etcbc_c(word)
)
def to_hebrew_x(word):
"""
Given a transliteration of a fully pointed word,
produce the word in Unicode Hebrew, but without the pointing.
Vowel pointing and accent pointing will be applied in the order given
by the input word.
Example:
Transcription.to_hebrew_x('HAC.@MA73JIm')
Output:
הַשָּׁמַ֖יִם
"""
return Transcription.trans_hebrew_pat.sub(Transcription._map_hebrew, word)
def ph_simplify(pword):
"""
Given a phonological transliteration of a fully pointed word,
produce a more coarse phonological transliteration.
Example:
Transcription.ph_simplify('ʔᵉlōhˈîm')
Transcription.ph_simplify('māqˈôm')
Transcription.ph_simplify('kol')
Output:
ʔlōhîm
måqôm
kål
Note that the simplified version transliterates the qamets gadol and qatan
to the same
character.
"""
return Transcription.ph_simple_pat.sub(Transcription._ph_simple, pword)
def from_hebrew(self, word):
"""
Given a fully pointed word in Unicode Hebrew,
produce the word in ETCBC transliteration.
Example:
tr.from_hebrew('הָאָֽרֶץ׃')
Output:
H@>@95REy00
"""
return "".join(
self.hebrew_mappingi.get(x, x) for x in Transcription._comp(word)
)
def to_syriac(self, word):
"""
Given a word in ETCBC transliteration,
produce the word in Unicode Syriac.
Example:
tr.to_syriac('MKSJN')
Output:
ܡܟܣܝܢ
"""
return Transcription.trans_syriac_pat.sub(Transcription._map_syriac, word)
def from_syriac(self, word):
"""
Given a word in Unicode Syriac,
produce the word in ETCBC transliteration.
Example:
tr.from_syriac('ܡܟܣܝܢ')
Output:
MKSJN
"""
return "".join(self.syriac_mappingi.get(x, x) for x in word)
def can_to_syriac(self, word):
return all(
candidate in Transcription.syriac_mapping
for candidate in Transcription.trans_syriac_pat.findall(word)
if candidate != " "
)
def can_from_syriac(self, word):
return all(c in self.syriac_mappingi for c in word if c != " ")
def to_arabic(word):
"""
Given a word in transliteration,
produce the word in Unicode Arabic.
Example:
tr.to_arabic('bisomi')
Output:
بِسْمِ
"""
return "".join(Transcription.arabic_mapping.get(x, x) for x in word)
def from_arabic(word):
"""
Given a word in Unicode Arabic,
produce the word in transliteration.
Example:
tr.from_arabic('بِسْمِ')
Output:
bisomi
"""
return "".join(Transcription.arabic_mappingi.get(x, x) for x in word)
Classes
class Transcription
-
Conversion between unicode and various transcriptions.
Usage notes:
Invoke the transcription functionality as follows:
from tf.writing.transcription import Transcription
Some of the attributes and methods below are class attributes, others are instance attributes.
A class attribute
aaa
can be retrieved by sayingTranscription.aaa
To retrieve an instance attribute, you need an instance first, like
tr = Transcription()
and then you can say
tr.aaa
.Expand source code Browse git
class Transcription: """Conversion between unicode and various transcriptions. Usage notes: Invoke the transcription functionality as follows: from tf.writing.transcription import Transcription Some of the attributes and methods below are *class* attributes, others are instance attributes. A class attribute `aaa` can be retrieved by saying Transcription.aaa To retrieve an instance attribute, you need an instance first, like tr = Transcription() and then you can say `tr.aaa`. """ decomp = { "\u05E9\u05C1": "\uFB2A", "\u05E9\u05C2": "\uFB2B", } hebrew_mapping = { "_": " ", # space inside word "92": "\u0591", # etnahta = atnach "01": "\u0592", # segolta "65": "\u0593", # shalshelet "80": "\u0594", # zaqef_qatan "85": "\u0595", # zaqef_gadol "73": "\u0596", # tipeha = tifcha "81": "\u0597", # revia = rebia "82": "\u0598", # zarqa = tsinorit = zinorit = sinnorit "03": "\u0599", # pashta "10": "\u059A", # yetiv = yetib "91": "\u059B", # tevir = tebir "61": "\u059C", # geresh "11": "\u059D", # geresh muqdam = mugrash "62": "\u059E", # gershayim = garshayim "84": "\u059F", # qarney para = pazer_gadol "14": "\u05A0", # telisha_gedola "44": "\u05A0", # telisha_gedola = telisha_gedola_med "83": "\u05A1", # pazer "74": "\u05A3", # munah = munach "70": "\u05A4", # mahapakh = mehuppach "71": "\u05A5", # merkha = merecha "72": "\u05A6", # merkha kefula = merecha_kepula "94": "\u05A7", # darga "63": "\u05A8", # qadma = azla "33": "\u05A8", # pashta_med < qadma "04": "\u05A9", # telisha_qetana "24": "\u05A9", # telisha_qetana = telisha_qetana_med "93": "\u05AA", # yera ben yomo = yerach "60": "\u05AB", # ole = ole_weyored "64": "\u05AC", # iluy = illuy "13": "\u05AD", # dehi = dechi "02": "\u05AE", # zinor = sinnor "*": "\u05AF", # masora circle ":": "\u05B0", # sheva = shewa ":E": "\u05B1", # hataf segol = chataph_segol ":A": "\u05B2", # hataf patah = chataph_patach ":@": "\u05B3", # hataf qamats = chataph_qamats "I": "\u05B4", # hiriq = chiriq ";": "\u05B5", # tsere "E": "\u05B6", # segol "A": "\u05B7", # patach "@": "\u05B8", # qamats "O": "\u05B9", # holam = cholam "U": "\u05BB", # qubuts = qubbuts ".": "\u05BC", # dagesh "25": "\u05BD", # silluq yamin "45": "\u05BD", # meteg "35": "\u05BD", # meteg (tikon) "75": "\u05BD", # siluq = silluq "95": "\u05BD", # meteg = meteg_yamin "&": "\u05BE", # maqaf ",": "\u05BF", # rafe = raphe "05": "\u05C0", # paseq ".c": "\u05C1", # shin dot ".f": "\u05C2", # sin dot "00": "\u05C3", # sof_pasuq "52": "\u05C4", # upper dot = puncta_above "53": "\u05C5", # lower dot = puncta_below "ñ": "\u05C6\u0307", # nun hafukha "Ñ": "\u05C6\u0307", # nun hafukha ">": "\u05D0", # alef "B": "\u05D1", # bet "G": "\u05D2", # gimel "D": "\u05D3", # dalet "H": "\u05D4", # he "W": "\u05D5", # vav "Z": "\u05D6", # zayin "X": "\u05D7", # het "V": "\u05D8", # tet "J": "\u05D9", # yod "k": "\u05DA", # kaf final "K": "\u05DB", # kaf "L": "\u05DC", # lamed "m": "\u05DD", # mem final "M": "\u05DE", # mem "n": "\u05DF", # nun final "N": "\u05E0", # nun "S": "\u05E1", # samekh "<": "\u05E2", # ayin "p": "\u05E3", # pe final "P": "\u05E4", # pe "y": "\u05E5", # tsadi final "Y": "\u05E6", # tsadi "Q": "\u05E7", # qof "R": "\u05E8", # resh "#": "\u05E9", # sin unpointed "T": "\u05EA", # tav "C": "\uFB2A", # shin pointed "F": "\uFB2B", # sin pointed "55": "<UNMAPPED 55=large letter>", # large_letter "56": "<UNMAPPED 56=small letter>", # small_letter "57": "<UNMAPPED 57=suspended letter>", # suspended_letter "-": "", # suppress space afterward "'": "\u05f3", # punctuation geresh '"': "\u05f4", # punctuation gershayim } """ Maps all ETCBC transliteration character combinations for Hebrew to Unicode. Example: sof-pasuq: Transcription.hebrew_mapping['00'] Output: ׃ """ hebrew_cons = ">BGDHWZXVJKLMNS<PYQRFCT" trans_final_pat = re.compile( r"([" + hebrew_cons + r"][^_&]*)([KMNPY])([^" + hebrew_cons + r"_&]*(:?[_&]|\Z))" ) trans_hebrew_pat = re.compile(r"(:[AE@]|.[cf]|:|[0-9][0-9]|.)") swap_accent_pat = re.compile( r"(\A|[_&])([0-9][0-9])([" + hebrew_cons + r"])([:;,.EAIOU@]*)" ) remove_accent_pat = re.compile(r"((?:[0-9][0-9])|[,*])") remove_point_pat = re.compile( r"((?:[0-9][0-9])|(?:\.[cf])|(?::[@AE])|[,.:;@AEIOU*])" ) remove_psn_pat = re.compile(r"00[ _SPNÑñ]*") remove_psq_pat = re.compile(r"(?:[ _]+05[ _]*)|(?:05[ _]+)") shin_pat = re.compile(r"[CF]") ph_simple_pat = re.compile(r"([ˈˌᵊᵃᵒᵉāo*])") noorigspace = re.compile( r""" (?: [&-]\Z) # space, maqef or nospace | (?: 0[05] # sof pasuq or paseq (?:_[SNP])* # nun hafukha, setumah, petuhah at end of verse \Z ) | (?:_[SPN])+ # nun hafukha, setumah, petuhah between words """, re.X, ) syriac_mapping_simple = { ">": "\u0710", # alaph "B": "\u0712", # beth "G": "\u0713", # gamal "D": "\u0715", # dalat "H": "\u0717", # he "W": "\u0718", # waw "Z": "\u0719", # zain "X": "\u071A", # heth "V": "\u071B", # teth "J": "\u071D", # yudh "K": "\u071F", # kaph "L": "\u0720", # lamadh "M": "\u0721", # mim "N": "\u0722", # nun "S": "\u0723", # semkath "<": "\u0725", # e "P": "\u0726", # pe "Y": "\u0728", # sadhe "Q": "\u0729", # qaph "R": "\u072A", # rish "C": "\u072B", # shin "T": "\u072C", # taw "s": "\u0724", # semkath final "p": "\u0727", # pe reversed } syriac_mapping_pil = { # LETTERS "'": "\u0710", # alaph "b": "\u0712", # beth "g": "\u0713", # gamal "d": "\u0715", # dalat "h": "\u0717", # he "w": "\u0718", # waw "z": "\u0719", # zain "H": "\u071A", # heth "T": "\u071B", # teth "y": "\u071D", # yod "k": "\u071F", # kaf "l": "\u0720", # lamad "m": "\u0721", # mim "n": "\u0722", # nun "s": "\u0723", # semkath "`": "\u0725", # 'e "p": "\u0726", # pe "S": "\u0728", # tsade "q": "\u0729", # qof "r": "\u072A", # resh "$": "\u072B", # shin "t": "\u072C", # taw # WORD-BOUND DIACRITICS '"': "\u0308", # seyame "#": "\u0323", # diacritical dot below "^": "\u0307", # diacritical dot above "~": "\u0307", # abbreviation mark # NON-VOCALIC LETTER-BOUND DIACRITICS "#,": "\u0742", # rukkakha '#"': "\u0342", # unclear (COMBINING DIAERESIS BELOW) "#!": "\u0744", # unclear (SYRIAC TWO VERTICAL DOTS BELOW) "#_": "\u0331", # linea occultans infera "^,": "\u0741", # qushshaya "^!": "\u0743", # unclear (SYRIAC TWO VERTICAL DOTS ABOVE) "^_": "\u0304", # linea occultans supera # VOCALIC LETTER-BOUND DIACRITICS ":": "", # shewa "A": "\u0733", # qamets "A1": "\u0734", # zeqapa "A2": "\u0735", # zeqofo "E": "\u0739", # tsere, revasa karya "O": "\u073F", # holem, rewaha "a": "\u0730", # patah "a1": "\u0731", # petaha "a2": "\u0732", # petoho "e": "\u0736", # segol "e1": "\u0737", # revasa arrika "e2": "\u0738", # revoso "i": "\u073A", # hireq "i1": "\u073B", # hevoso "y#": "\u071D\u073C", # hevasa "u": "\u073D", # qubbuts "u1": "\u073E", # esoso "w#": "\u0718\u073C", # esasa allisa "w^": "\u0718\u073F", # esasa rewiha # INTERPUNCTION "#.": "\u0702", # menachta, meshalyana (ES), metdamrana, samka "#:": "\u0704", # metkashpana (ES) "#\\": "\u0709", # tahtaya, metkashpana (WS), meshalyana (WS) "=.": "\u002E", # pasuqa "=/": "\u0707", # elaya "=:": "\u003A", # shewaya (WS), zauga (ES) "=\\": "\u0706", # unclear (SYRIAC COLON SKEWED LEFT) "^.": "\u0701", # paquda, metkashpana (ES), meshalyana (ES), etsyana, meshalana? '^"': "\u0705", # rahta "^:": "\u0703", # taksa (WS), zauga elaya (ES) "^\\": "\u0708", # unclear (SYRIAC SUPRALINEAR COLON SKEWED LEFT) # PERICOPE MARKERS "*": "\u0700", # rosette ".": "\u00B7", # common dot in caesuras "@": "\u2722", # vignette "_": "\u2014", # dash in caesuras "o": "\u2022", # large dot in caesuras } syriac_mapping = { # this is WIT # LETTERS ">": "\u0710", # alaph "B": "\u0712", # beth "G": "\u0713", # gamal "D": "\u0715", # dalat "H": "\u0717", # he "W": "\u0718", # waw "Z": "\u0719", # zain "X": "\u071A", # heth "V": "\u071B", # teth "J": "\u071D", # yod "K": "\u071F", # kaf "L": "\u0720", # lamad "M": "\u0721", # mim "N": "\u0722", # nun "S": "\u0723", # semkath "<": "\u0725", # 'e "P": "\u0726", # pe "Y": "\u0728", # tsade "Q": "\u0729", # qof "R": "\u072A", # resh "C": "\u072B", # shin "T": "\u072C", # taw # WORD-BOUND DIACRITICS '"': "\u0308", # seyame "#": "\u0323", # diacritical dot below "^": "\u0307", # diacritical dot above # NON-VOCALIC LETTER-BOUND DIACRITICS "^!": "\u0743", # unclear (SYRIAC TWO VERTICAL DOTS ABOVE) # VOCALIC LETTER-BOUND DIACRITICS ":": "", # shewa "A": "\u0733", # qamets "A1": "\u0734", # zeqapa "A2": "\u0735", # zeqofo "O": "\u073F", # holem, rewaha "@": "\u0730", # patah "@1": "\u0731", # petaha "@2": "\u0732", # petoho "E": "\u0736", # segol "E1": "\u0737", # revasa arrika "E2": "\u0738", # revoso "I": "\u073A", # hireq "I1": "\u073B", # hevoso "U": "\u073D", # qubbuts "U1": "\u073E", # esoso # INTERPUNCTION "#\\": "\u0709", # tahtaya, metkashpana (WS), meshalyana (WS) "=.": "\u002E", # pasuqa "=#": "\u0707", # elaya "=:": "\u003A", # shewaya (WS), zauga (ES) "=^": "\u0706", # unclear (SYRIAC COLON SKEWED LEFT) "=/": "\u0707", # elaya "=\\": "\u0706", # unclear (SYRIAC COLON SKEWED LEFT) "^:": "\u0703", # taksa (WS), zauga elaya (ES) "^\\": "\u0708", # unclear (SYRIAC SUPRALINEAR COLON SKEWED LEFT) # PERICOPE MARKERS "*": "\u0700", # rosette ".": "\u00B7", # common dot in caesuras "_": "\u2014", # dash in caesuras "o": "\u2022", # large dot in caesuras } """ Maps all ETCBC transliteration character combinations for Syriac to Unicode. Example: semkath-final: Transcription.syriac_mapping['s'] Output: ܤ """ trans_syriac_pat = re.compile(r"([AE@IU][12]?|=[.#:\^/\\]|[\^#][!:\\]|.)") arabic_mapping = { " ": "\u0020", # SPACE "'": "\u0621", # ARABIC LETTER HAMZA ">": "\u0623", # ARABIC LETTER ALEF WITH HAMZA ABOVE "&": "\u0624", # ARABIC LETTER WAW WITH HAMZA ABOVE "<": "\u0625", # ARABIC LETTER ALEF WITH HAMZA BELOW "}": "\u0626", # ARABIC LETTER YEH WITH HAMZA ABOVE "A": "\u0627", # ARABIC LETTER ALEF "b": "\u0628", # ARABIC LETTER BEH "p": "\u0629", # ARABIC LETTER TEH MARBUTA "t": "\u062a", # ARABIC LETTER TEH "v": "\u062b", # ARABIC LETTER THEH "j": "\u062c", # ARABIC LETTER JEEM "H": "\u062d", # ARABIC LETTER HAH "x": "\u062e", # ARABIC LETTER KHAH "d": "\u062f", # ARABIC LETTER DAL "*": "\u0630", # ARABIC LETTER THAL "r": "\u0631", # ARABIC LETTER REH "z": "\u0632", # ARABIC LETTER ZAIN "s": "\u0633", # ARABIC LETTER SEEN "$": "\u0634", # ARABIC LETTER SHEEN "S": "\u0635", # ARABIC LETTER SAD "D": "\u0636", # ARABIC LETTER DAD "T": "\u0637", # ARABIC LETTER TAH "Z": "\u0638", # ARABIC LETTER ZAH "E": "\u0639", # ARABIC LETTER AIN "g": "\u063a", # ARABIC LETTER GHAIN "_": "\u0640", # ARABIC TATWEEL "f": "\u0641", # ARABIC LETTER FEH "q": "\u0642", # ARABIC LETTER QAF "k": "\u0643", # ARABIC LETTER KAF "l": "\u0644", # ARABIC LETTER LAM "m": "\u0645", # ARABIC LETTER MEEM "n": "\u0646", # ARABIC LETTER NOON "h": "\u0647", # ARABIC LETTER HEH "w": "\u0648", # ARABIC LETTER WAW "Y": "\u0649", # ARABIC LETTER ALEF MAKSURA "y": "\u064a", # ARABIC LETTER YEH "F": "\u064b", # ARABIC FATHATAN "N": "\u064c", # ARABIC DAMMATAN "K": "\u064d", # ARABIC KASRATAN "a": "\u064e", # ARABIC FATHA "u": "\u064f", # ARABIC DAMMA "i": "\u0650", # ARABIC KASRA "~": "\u0651", # ARABIC SHADDA "o": "\u0652", # ARABIC SUKUN "^": "\u0653", # ARABIC MADDAH ABOVE "#": "\u0654", # ARABIC HAMZA ABOVE "`": "\u0670", # ARABIC LETTER SUPERSCRIPT ALEF "{": "\u0671", # ARABIC LETTER ALEF WASLA ":": "\u06dc", # ARABIC SMALL HIGH SEEN "@": "\u06df", # ARABIC SMALL HIGH ROUNDED ZERO '"': "\u06e0", # ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO "[": "\u06e2", # ARABIC SMALL HIGH MEEM ISOLATED FORM ";": "\u06e3", # ARABIC SMALL LOW SEEN ",": "\u06e5", # ARABIC SMALL WAW ".": "\u06e6", # ARABIC SMALL YEH "!": "\u06e8", # ARABIC SMALL HIGH NOON "-": "\u06ea", # ARABIC EMPTY CENTRE LOW STOP "+": "\u06eb", # ARABIC EMPTY CENTRE HIGH STOP "%": "\u06ec", # ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE "]": "\u06ed", # ARABIC SMALL LOW MEEM } """ Maps an Arabic transliteration character to Unicode. This is the mapping used in the Quran representation on tanzil.net. Example: beh Transcription.syriac_mapping['b'] Output: ب Maps an Arabic letter in unicode to its transliteration Example: beh transliteration Transcription.syriac_mapping['ب'] Output: b """ arabic_mappingi = {v: k for (k, v) in arabic_mapping.items()} arabicTrans = { "\u0020": (" ", " ", " ", " "), # SPACE "\u060c": ("‚", ",", ",", ","), # ARABIC COMMA "\u061b": ("„", ";", ";", ";"), # ARABIC SEMICOLON "\u061f": ("?", "?", "?", "?"), # ARABIC QUESTION MARK "\u0621": ("'", "'", "ʾ", "'"), # ARABIC LETTER HAMZA "\u0622": ("»", "_a", "ā", "ā"), # ARABIC LETTER ALEF WITH MADDA ABOVE "\u0623": (">", "_a", "ā", "ā"), # ARABIC LETTER ALEF WITH HAMZA ABOVE "\u0624": ("&", "'", "ʾ", "'"), # ARABIC LETTER WAW WITH HAMZA ABOVE "\u0625": ("<", "_a", "ā", "ā"), # ARABIC LETTER ALEF WITH HAMZA BELOW "\u0626": ("}", "'", "ʾ", "y"), # ARABIC LETTER YEH WITH HAMZA ABOVE "\u0627": ("A", "_a", "ā", "ā"), # ARABIC LETTER ALEF "\u0628": ("b", "b", "b", "b"), # ARABIC LETTER BEH "\u0629": ("p", "=t", "ŧ", "t"), # ARABIC LETTER TEH MARBUTA "\u062a": ("t", "t", "t", "t"), # ARABIC LETTER TEH "\u062b": ("v", "_t", "ṯ", "th"), # ARABIC LETTER THEH "\u062c": ("j", "j", "ǧ", "j"), # ARABIC LETTER JEEM "\u062d": ("H", "*h", "ḥ", "ḥ"), # ARABIC LETTER HAH "\u062e": ("x", "_h", "ḫ", "kh"), # ARABIC LETTER KHAH "\u062f": ("d", "d", "d", "d"), # ARABIC LETTER DAL "\u0630": ("*", "_d", "ḏ", "dh"), # ARABIC LETTER THAL "\u0631": ("r", "r", "r", "r"), # ARABIC LETTER REH "\u0632": ("z", "z", "z", "z"), # ARABIC LETTER ZAIN "\u0633": ("s", "s", "s", "s"), # ARABIC LETTER SEEN "\u0634": ("$", "^s", "š", "sh"), # ARABIC LETTER SHEEN "\u0635": ("S", "*s", "ṣ", "ṣ"), # ARABIC LETTER SAD "\u0636": ("D", "*d", "ḍ", "ḍ"), # ARABIC LETTER DAD "\u0637": ("T", "*t", "ṭ", "ṭ"), # ARABIC LETTER TAH "\u0638": ("Z", "*z", "ẓ", "ẓ"), # ARABIC LETTER ZAH "\u0639": ("E", "`", "ʿ", "`"), # ARABIC LETTER AIN "\u063a": ("g", "*g", "ġ", "gh"), # ARABIC LETTER GHAIN "\u0640": ("_", "", "", ""), # ARABIC TATWEEL "\u0641": ("f", "f", "f", "f"), # ARABIC LETTER FEH "\u0642": ("q", "*k", "ḳ", "q"), # ARABIC LETTER QAF "\u0643": ("k", "k", "k", "k"), # ARABIC LETTER KAF "\u0644": ("l", "l", "l", "l"), # ARABIC LETTER LAM "\u0645": ("m", "m", "m", "m"), # ARABIC LETTER MEEM "\u0646": ("n", "n", "n", "n"), # ARABIC LETTER NOON "\u0647": ("h", "h", "h", "h"), # ARABIC LETTER HEH "\u0648": ("w", "w", "w", "w"), # ARABIC LETTER WAW "\u0649": ("Y", "/a", "á", "ā"), # ARABIC LETTER ALEF MAKSURA "\u064a": ("y", "y", "y", "y"), # ARABIC LETTER YEH "\u064b": ("F", "a*n", "aⁿ", "an"), # ARABIC FATHATAN "\u064c": ("N", "u*n", "uⁿ", "un"), # ARABIC DAMMATAN "\u064d": ("K", "i*n", "iⁿ", "in"), # ARABIC KASRATAN "\u064e": ("a", "a", "a", "a"), # ARABIC FATHA "\u064f": ("u", "u", "u", "u"), # ARABIC DAMMA "\u0650": ("i", "i", "i", "i"), # ARABIC KASRA "\u0651": ("~", "u", "u", "ūw"), # ARABIC SHADDA "\u0652": ("o", "a", "a", "a"), # ARABIC SUKUN "\u0653": ("^", "_a", "ā", "ā"), # ARABIC MADDAH ABOVE "\u0654": ("#", "'", "ʾ", "ā"), # ARABIC HAMZA ABOVE "\u0655": ("=", "'", "ʾ", "ā"), # ARABIC HAMZA BELOW "\u0660": ("0", "0", "0", "0"), # ARABIC INDIC DIGIT ZERO "\u0661": ("1", "1", "1", "1"), # ARABIC INDIC DIGIT ONE "\u0662": ("2", "2", "2", "2"), # ARABIC INDIC DIGIT TWO "\u0663": ("3", "3", "3", "3"), # ARABIC INDIC DIGIT THREE "\u0664": ("4", "4", "4", "4"), # ARABIC INDIC DIGIT FOUR "\u0665": ("5", "5", "5", "5"), # ARABIC INDIC DIGIT FIVE "\u0666": ("6", "6", "6", "6"), # ARABIC INDIC DIGIT SIX "\u0667": ("7", "7", "7", "7"), # ARABIC INDIC DIGIT SEVEN "\u0668": ("8", "8", "8", "8"), # ARABIC INDIC DIGIT EIGHT "\u0669": ("9", "9", "9", "9"), # ARABIC INDIC DIGIT NINE "\u0670": ("`", "~a", "ã", ""), # ARABIC LETTER SUPERSCRIPT ALEF "\u0671": ("{", "a", "a", "a"), # ARABIC LETTER ALEF WASLA "\u06af": ("G", "g", "g", "g"), # ARABIC LETTER GAF "\u06cc": ("J", "#y", "Y", "y"), # ARABIC LETTER FARSI YEH "\u06d6": ("SlY", "*sl/a", "ṣlá", "ṣla"), # ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA "\u06da": ("M", "#j", "J", "j"), # ARABIC SMALL HIGH JEEM "\u06dc": (":", "#s", "S", "s"), # ARABIC SMALL HIGH SEEN "\u06df": ("@", "0", "0", "0"), # ARABIC SMALL HIGH ROUNDED ZERO "\u06e0": ('"', "0", "0", "0"), # ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO "\u06e2": ("[", "#m", "M", "M"), # ARABIC SMALL HIGH MEEM ISOLATED FORM "\u06e3": (";", "#s", "S", "S"), # ARABIC SMALL LOW SEEN "\u06e5": (",", "#w", "W", "W"), # ARABIC SMALL WAW "\u06e6": (".", "#y", "Y", "Y"), # ARABIC SMALL YEH "\u06e8": ("!", "#n", "N", "N"), # ARABIC SMALL HIGH NOON "\u06ea": ("-", ".", ".", "."), # ARABIC EMPTY CENTRE LOW STOP "\u06eb": ("+", ".", ".", "."), # ARABIC EMPTY CENTRE HIGH STOP "\u06ec": ("%", ".", ".", "."), # ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE "\u06ed": ("]", "#m", "M", "M"), # ARABIC SMALL LOW MEEM } """ More Arabic transcriptions: * column 1: `custom` [Quran-tanzil](http://tanzil.net/#1:1), slightly extended * column 2/: `ascii` resp. `latin` plus diacritics also known as betacode. We use a list compiled by [Peter Verkinderen](https://pverkind.github.io/betacodeTranscriber/js/betacode.js) * column 4: `standard` (Library of Congress) (to-be filled). We use the [arabic romanization list of 2012](https://www.loc.gov/catdir/cpso/romanization/arabic.pdf) We refrain of from applying rules that cannot be computed without lexical/grammatical/dialectical knowledge of the arabic language. """ arabicTransQuran = {} arabicTransAscii = {} arabicTransLatin = {} arabicTransStandard = {} for (ara, (qur, asc, lat, std)) in arabicTrans.items(): arabicTransQuran[ara] = qur arabicTransAscii[ara] = asc arabicTransLatin[ara] = lat arabicTransStandard[ara] = std def quranFromArabic(word): return "".join(Transcription.arabicTransQuran.get(x, x) for x in word) def asciiFromArabic(word): return "".join(Transcription.arabicTransAscii.get(x, x) for x in word) def latinFromArabic(word): return "".join(Transcription.arabicTransLatin.get(x, x) for x in word) def standardFromArabic(word): return "".join(Transcription.arabicTransStandard.get(x, x) for x in word) def __init__(self): self.hebrew_consonants = { Transcription.hebrew_mapping[x] for x in Transcription.hebrew_cons } self.hebrew_consonants.add("\u05E9") self.hebrew_mappingi = dict( (v, k) for (k, v) in Transcription.hebrew_mapping.items() if k != "" ) self.syriac_mappingi = dict( (v, k) for (k, v) in Transcription.syriac_mapping.items() if k != "" ) # special treatment needed for nun hafukha, # since it is consists of two characters self.hebrew_mappingi["\u05C6"] = "ñ" self.hebrew_mappingi["\u0307"] = "" self.syriac_punctuation_trans = ( "#\\", "=.", "=#", "=:", "=^", "=/", "=\\", "^:", "^\\", ) self.syriac_punctuation_syc = tuple( Transcription.syriac_mapping[c] for c in self.syriac_punctuation_trans ) arabicMap = dict(qur={}, asc={}, lat={}, std={}) arabicMapI = dict(qur={}, asc={}, lat={}, std={}) for u in sorted(Transcription.arabicTrans): (qur, asc, lat, std) = Transcription.arabicTrans[u] for (k, v) in zip(arabicMap.keys(), Transcription.arabicTrans[u]): arabicMap[k].setdefault(v, u) arabicMapI[k][u] = v self.arabicMap = arabicMap self.arabicMapI = arabicMapI def sycSplitPunc(self): pass def _comp(s): for (d, c) in Transcription.decomp.items(): s = s.replace(d, c) return s def _decomp(s): for (d, c) in Transcription.decomp.items(): s = s.replace(c, d) return s def suffix_and_finales(word): """ Given an ETCBC transliteration, split it into the word material and the interword material that follows it (space, punctuation). Replace the last consonant of the word material by its final form, if applicable. Output a tuple with the modified word material and the interword material. Example: Transcription.suffix_and_finales('71T_H@>@95REY00') Output: ('71T_H@>@95REy', '00\n') Note that the `Y` has been replaced by `y`. """ # first split the word proper from the suffix, # and add a space if there is no other suffix add_space = "" suffix = "" new_word = word if not word: return (new_word, suffix + add_space) lastch = new_word[-1] if lastch == "-" or lastch == "&": new_word = new_word[0:-1] suffix = lastch else: if len(new_word) >= 2: lastch = new_word[-1] llastch = new_word[-2] if llastch == "_" and (lastch == "P" or lastch == "S"): new_word = new_word[0:-2] suffix = " " + lastch + suffix + " " if len(new_word) >= 2: lastch = new_word[-1] llastch = new_word[-2] if llastch == "_" and (lastch == "N"): new_word = new_word[0:-2] suffix = " ñ" + suffix + " " if len(new_word) >= 2: lastch = new_word[-1] llastch = new_word[-2] if llastch == "0" and (lastch == "0" or lastch == "5"): new_word = new_word[0:-2] suffix = (" " if lastch == "5" else "") + llastch + lastch + suffix add_space = "\n" if lastch == "0" else " " if suffix == "": add_space = " " elif suffix == "-": add_space = "" suffix = "" # second: replace consonants by their final forms when needed new_word = Transcription.trans_final_pat.sub(Transcription._map_final, new_word) return (new_word, suffix + add_space) def _map_final(m): return m.group(1) + m.group(2).lower() + m.group(3) def _map_hebrew(m): return Transcription.hebrew_mapping.get(m.group(1), m.group(1)) def _map_syriac(m): return Transcription.syriac_mapping.get(m.group(1), m.group(1)) def _swap_accent(m): return m.group(1) + m.group(3) + m.group(4) + m.group(2) def _remove_accent(m): return "00" if m.group(1) == "00" else "05" if m.group(1) == "05" else "" def _remove_point(m): return "00" if m.group(1) == "00" else "05" if m.group(1) == "05" else "" def _ph_simple(m): return "å" if m.group(1) in "āo" else "" # unicode normalization is harmful # if there is a combination of dagesh, vowel and accent. def suppress_space(word): """ Given an ETCBC transliteration of a word, match the end of the word for interpunction and spacing characters (sof pasuq, paseq, nun hafukha, setumah, petuhah, space, no-space) Example: Transcription.suppress_space('B.:&') Transcription.suppress_space('B.@R@74>') Transcription.suppress_space('71T_H@>@95REY00') Output: <re.Match object; span=(3, 4), match='&'> None <re.Match object; span=(13, 15), match='00'> """ return Transcription.noorigspace.search(word) def to_etcbc_v(word): """ Given an ETCBC transliteration of a fully pointed word, strip all the non-vowel pointing (i.e. the accents). Example: Transcription.to_etcbc_v('HAC.@MA73JIm') Output: HAC.@MAJIm """ return Transcription.remove_accent_pat.sub(Transcription._remove_accent, word) def to_etcbc_c(word): """ Given an ETCBC transliteration of a fully pointed word, strip everything except the consonants. Punctuation will also be stripped. Example: Transcription.to_etcbc_c('HAC.@MA73JIm') Output: H#MJM Note that the pointed shin (`C`) is replaced by an unpointed one (`#`). """ word = Transcription.remove_point_pat.sub(Transcription._remove_point, word) word = Transcription.remove_psn_pat.sub( "00", word ) # remove nun hafukha, setumah, petuhah at the end of a verse word = Transcription.remove_psq_pat.sub( " ", word ) # replace paseq with attached spaces by single space word = word.upper() # no final forms of consonants return Transcription.shin_pat.sub("#", word) def to_hebrew(word): """ Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew. Care will be taken that vowel pointing will be added to consonants before accent pointing. Example: Transcription.to_hebrew('HAC.@MA73JIm') Output: הַשָּׁמַ֖יִם """ word = Transcription.swap_accent_pat.sub(Transcription._swap_accent, word) return Transcription.trans_hebrew_pat.sub(Transcription._map_hebrew, word) def to_hebrew_v(word): """ Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew, but without the accents. Example: Transcription.to_hebrew_v('HAC.@MA73JIm') Output: הַשָּׁמַיִם """ return Transcription.trans_hebrew_pat.sub( Transcription._map_hebrew, Transcription.to_etcbc_v(word) ) def to_hebrew_c(word): """ Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew, but without the pointing. Example: Transcription.to_hebrew_c('HAC.@MA73JIm') Output: השמימ Note that final consonant forms are not being used. """ return Transcription.trans_hebrew_pat.sub( Transcription._map_hebrew, Transcription.to_etcbc_c(word) ) def to_hebrew_x(word): """ Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew, but without the pointing. Vowel pointing and accent pointing will be applied in the order given by the input word. Example: Transcription.to_hebrew_x('HAC.@MA73JIm') Output: הַשָּׁמַ֖יִם """ return Transcription.trans_hebrew_pat.sub(Transcription._map_hebrew, word) def ph_simplify(pword): """ Given a phonological transliteration of a fully pointed word, produce a more coarse phonological transliteration. Example: Transcription.ph_simplify('ʔᵉlōhˈîm') Transcription.ph_simplify('māqˈôm') Transcription.ph_simplify('kol') Output: ʔlōhîm måqôm kål Note that the simplified version transliterates the qamets gadol and qatan to the same character. """ return Transcription.ph_simple_pat.sub(Transcription._ph_simple, pword) def from_hebrew(self, word): """ Given a fully pointed word in Unicode Hebrew, produce the word in ETCBC transliteration. Example: tr.from_hebrew('הָאָֽרֶץ׃') Output: H@>@95REy00 """ return "".join( self.hebrew_mappingi.get(x, x) for x in Transcription._comp(word) ) def to_syriac(self, word): """ Given a word in ETCBC transliteration, produce the word in Unicode Syriac. Example: tr.to_syriac('MKSJN') Output: ܡܟܣܝܢ """ return Transcription.trans_syriac_pat.sub(Transcription._map_syriac, word) def from_syriac(self, word): """ Given a word in Unicode Syriac, produce the word in ETCBC transliteration. Example: tr.from_syriac('ܡܟܣܝܢ') Output: MKSJN """ return "".join(self.syriac_mappingi.get(x, x) for x in word) def can_to_syriac(self, word): return all( candidate in Transcription.syriac_mapping for candidate in Transcription.trans_syriac_pat.findall(word) if candidate != " " ) def can_from_syriac(self, word): return all(c in self.syriac_mappingi for c in word if c != " ") def to_arabic(word): """ Given a word in transliteration, produce the word in Unicode Arabic. Example: tr.to_arabic('bisomi') Output: بِسْمِ """ return "".join(Transcription.arabic_mapping.get(x, x) for x in word) def from_arabic(word): """ Given a word in Unicode Arabic, produce the word in transliteration. Example: tr.from_arabic('بِسْمِ') Output: bisomi """ return "".join(Transcription.arabic_mappingi.get(x, x) for x in word)
Class variables
var ara
var arabicTrans
-
More Arabic transcriptions:
- column 1:
custom
Quran-tanzil, slightly extended - column 2/:
ascii
resp.latin
plus diacritics also known as betacode. We use a list compiled by Peter Verkinderen - column 4:
standard
(Library of Congress) (to-be filled). We use the arabic romanization list of 2012 We refrain of from applying rules that cannot be computed without lexical/grammatical/dialectical knowledge of the arabic language.
- column 1:
var arabicTransAscii
var arabicTransLatin
var arabicTransQuran
var arabicTransStandard
var arabic_mapping
-
Maps an Arabic transliteration character to Unicode. This is the mapping used in the Quran representation on tanzil.net.
Example: beh
Transcription.syriac_mapping['b']
Output
ب
Maps an Arabic letter in unicode to its transliteration
Example: beh transliteration
Transcription.syriac_mapping['ب']
Output
b
var arabic_mappingi
var asc
var decomp
var hebrew_cons
var hebrew_mapping
-
Maps all ETCBC transliteration character combinations for Hebrew to Unicode.
Example: sof-pasuq:
Transcription.hebrew_mapping['00']
Output
׃
var lat
var noorigspace
var ph_simple_pat
var qur
var remove_accent_pat
var remove_point_pat
var remove_psn_pat
var remove_psq_pat
var shin_pat
var std
var swap_accent_pat
var syriac_mapping
-
Maps all ETCBC transliteration character combinations for Syriac to Unicode.
Example: semkath-final:
Transcription.syriac_mapping['s']
Output
ܤ
var syriac_mapping_pil
var syriac_mapping_simple
var trans_final_pat
var trans_hebrew_pat
var trans_syriac_pat
Methods
def asciiFromArabic(word)
-
Expand source code Browse git
def asciiFromArabic(word): return "".join(Transcription.arabicTransAscii.get(x, x) for x in word)
def can_from_syriac(self, word)
-
Expand source code Browse git
def can_from_syriac(self, word): return all(c in self.syriac_mappingi for c in word if c != " ")
def can_to_syriac(self, word)
-
Expand source code Browse git
def can_to_syriac(self, word): return all( candidate in Transcription.syriac_mapping for candidate in Transcription.trans_syriac_pat.findall(word) if candidate != " " )
def from_arabic(word)
-
Given a word in Unicode Arabic, produce the word in transliteration.
Example
tr.from_arabic('بِسْمِ')
Output
bisomi
Expand source code Browse git
def from_arabic(word): """ Given a word in Unicode Arabic, produce the word in transliteration. Example: tr.from_arabic('بِسْمِ') Output: bisomi """ return "".join(Transcription.arabic_mappingi.get(x, x) for x in word)
def from_hebrew(self, word)
-
Given a fully pointed word in Unicode Hebrew, produce the word in ETCBC transliteration.
Example
tr.from_hebrew('הָאָֽרֶץ׃')
Output
H@>@95REy00
Expand source code Browse git
def from_hebrew(self, word): """ Given a fully pointed word in Unicode Hebrew, produce the word in ETCBC transliteration. Example: tr.from_hebrew('הָאָֽרֶץ׃') Output: H@>@95REy00 """ return "".join( self.hebrew_mappingi.get(x, x) for x in Transcription._comp(word) )
def from_syriac(self, word)
-
Given a word in Unicode Syriac, produce the word in ETCBC transliteration.
Example
tr.from_syriac('ܡܟܣܝܢ')
Output
MKSJN
Expand source code Browse git
def from_syriac(self, word): """ Given a word in Unicode Syriac, produce the word in ETCBC transliteration. Example: tr.from_syriac('ܡܟܣܝܢ') Output: MKSJN """ return "".join(self.syriac_mappingi.get(x, x) for x in word)
def latinFromArabic(word)
-
Expand source code Browse git
def latinFromArabic(word): return "".join(Transcription.arabicTransLatin.get(x, x) for x in word)
def ph_simplify(pword)
-
Given a phonological transliteration of a fully pointed word, produce a more coarse phonological transliteration.
Example
Transcription.ph_simplify('ʔᵉlōhˈîm') Transcription.ph_simplify('māqˈôm') Transcription.ph_simplify('kol')
Output
ʔlōhîm måqôm kål
Note that the simplified version transliterates the qamets gadol and qatan to the same character.
Expand source code Browse git
def ph_simplify(pword): """ Given a phonological transliteration of a fully pointed word, produce a more coarse phonological transliteration. Example: Transcription.ph_simplify('ʔᵉlōhˈîm') Transcription.ph_simplify('māqˈôm') Transcription.ph_simplify('kol') Output: ʔlōhîm måqôm kål Note that the simplified version transliterates the qamets gadol and qatan to the same character. """ return Transcription.ph_simple_pat.sub(Transcription._ph_simple, pword)
def quranFromArabic(word)
-
Expand source code Browse git
def quranFromArabic(word): return "".join(Transcription.arabicTransQuran.get(x, x) for x in word)
def standardFromArabic(word)
-
Expand source code Browse git
def standardFromArabic(word): return "".join(Transcription.arabicTransStandard.get(x, x) for x in word)
def suffix_and_finales(word)
-
Given an ETCBC transliteration, split it into the word material and the interword material that follows it (space, punctuation). Replace the last consonant of the word material by its final form, if applicable.
Output a tuple with the modified word material and the interword material. Example: Transcription.suffix_and_finales('71T_H@>@95REY00') Output: ('71T_H@>@95REy', '00
')
Note that the <code>Y</code> has been replaced by <code>y</code>.
Expand source code Browse git
def suffix_and_finales(word): """ Given an ETCBC transliteration, split it into the word material and the interword material that follows it (space, punctuation). Replace the last consonant of the word material by its final form, if applicable. Output a tuple with the modified word material and the interword material. Example: Transcription.suffix_and_finales('71T_H@>@95REY00') Output: ('71T_H@>@95REy', '00\n') Note that the `Y` has been replaced by `y`. """ # first split the word proper from the suffix, # and add a space if there is no other suffix add_space = "" suffix = "" new_word = word if not word: return (new_word, suffix + add_space) lastch = new_word[-1] if lastch == "-" or lastch == "&": new_word = new_word[0:-1] suffix = lastch else: if len(new_word) >= 2: lastch = new_word[-1] llastch = new_word[-2] if llastch == "_" and (lastch == "P" or lastch == "S"): new_word = new_word[0:-2] suffix = " " + lastch + suffix + " " if len(new_word) >= 2: lastch = new_word[-1] llastch = new_word[-2] if llastch == "_" and (lastch == "N"): new_word = new_word[0:-2] suffix = " ñ" + suffix + " " if len(new_word) >= 2: lastch = new_word[-1] llastch = new_word[-2] if llastch == "0" and (lastch == "0" or lastch == "5"): new_word = new_word[0:-2] suffix = (" " if lastch == "5" else "") + llastch + lastch + suffix add_space = "\n" if lastch == "0" else " " if suffix == "": add_space = " " elif suffix == "-": add_space = "" suffix = "" # second: replace consonants by their final forms when needed new_word = Transcription.trans_final_pat.sub(Transcription._map_final, new_word) return (new_word, suffix + add_space)
def suppress_space(word)
-
Given an ETCBC transliteration of a word, match the end of the word for interpunction and spacing characters (sof pasuq, paseq, nun hafukha, setumah, petuhah, space, no-space)
Example
Transcription.suppress_space('B.:&') Transcription.suppress_space('B.@R@74>') Transcription.suppress_space('71T_H@>@95REY00')
Output
None Expand source code Browse git
def suppress_space(word): """ Given an ETCBC transliteration of a word, match the end of the word for interpunction and spacing characters (sof pasuq, paseq, nun hafukha, setumah, petuhah, space, no-space) Example: Transcription.suppress_space('B.:&') Transcription.suppress_space('B.@R@74>') Transcription.suppress_space('71T_H@>@95REY00') Output: <re.Match object; span=(3, 4), match='&'> None <re.Match object; span=(13, 15), match='00'> """ return Transcription.noorigspace.search(word)
def sycSplitPunc(self)
-
Expand source code Browse git
def sycSplitPunc(self): pass
def to_arabic(word)
-
Given a word in transliteration, produce the word in Unicode Arabic.
Example
tr.to_arabic('bisomi')
Output
بِسْمِ
Expand source code Browse git
def to_arabic(word): """ Given a word in transliteration, produce the word in Unicode Arabic. Example: tr.to_arabic('bisomi') Output: بِسْمِ """ return "".join(Transcription.arabic_mapping.get(x, x) for x in word)
def to_etcbc_c(word)
-
Given an ETCBC transliteration of a fully pointed word, strip everything except the consonants. Punctuation will also be stripped.
Example
Transcription.to_etcbc_c('HAC.@MA73JIm')
Output
H#MJM
Note that the pointed shin (
C
) is replaced by an unpointed one (#
).Expand source code Browse git
def to_etcbc_c(word): """ Given an ETCBC transliteration of a fully pointed word, strip everything except the consonants. Punctuation will also be stripped. Example: Transcription.to_etcbc_c('HAC.@MA73JIm') Output: H#MJM Note that the pointed shin (`C`) is replaced by an unpointed one (`#`). """ word = Transcription.remove_point_pat.sub(Transcription._remove_point, word) word = Transcription.remove_psn_pat.sub( "00", word ) # remove nun hafukha, setumah, petuhah at the end of a verse word = Transcription.remove_psq_pat.sub( " ", word ) # replace paseq with attached spaces by single space word = word.upper() # no final forms of consonants return Transcription.shin_pat.sub("#", word)
def to_etcbc_v(word)
-
Given an ETCBC transliteration of a fully pointed word, strip all the non-vowel pointing (i.e. the accents).
Example
Transcription.to_etcbc_v('HAC.@MA73JIm')
Output
HAC.@MAJIm
Expand source code Browse git
def to_etcbc_v(word): """ Given an ETCBC transliteration of a fully pointed word, strip all the non-vowel pointing (i.e. the accents). Example: Transcription.to_etcbc_v('HAC.@MA73JIm') Output: HAC.@MAJIm """ return Transcription.remove_accent_pat.sub(Transcription._remove_accent, word)
def to_hebrew(word)
-
Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew. Care will be taken that vowel pointing will be added to consonants before accent pointing.
Example
Transcription.to_hebrew('HAC.@MA73JIm')
Output
הַשָּׁמַ֖יִם
Expand source code Browse git
def to_hebrew(word): """ Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew. Care will be taken that vowel pointing will be added to consonants before accent pointing. Example: Transcription.to_hebrew('HAC.@MA73JIm') Output: הַשָּׁמַ֖יִם """ word = Transcription.swap_accent_pat.sub(Transcription._swap_accent, word) return Transcription.trans_hebrew_pat.sub(Transcription._map_hebrew, word)
def to_hebrew_c(word)
-
Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew, but without the pointing.
Example
Transcription.to_hebrew_c('HAC.@MA73JIm')
Output
השמימ
Note that final consonant forms are not being used.
Expand source code Browse git
def to_hebrew_c(word): """ Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew, but without the pointing. Example: Transcription.to_hebrew_c('HAC.@MA73JIm') Output: השמימ Note that final consonant forms are not being used. """ return Transcription.trans_hebrew_pat.sub( Transcription._map_hebrew, Transcription.to_etcbc_c(word) )
def to_hebrew_v(word)
-
Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew, but without the accents.
Example
Transcription.to_hebrew_v('HAC.@MA73JIm')
Output
הַשָּׁמַיִם
Expand source code Browse git
def to_hebrew_v(word): """ Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew, but without the accents. Example: Transcription.to_hebrew_v('HAC.@MA73JIm') Output: הַשָּׁמַיִם """ return Transcription.trans_hebrew_pat.sub( Transcription._map_hebrew, Transcription.to_etcbc_v(word) )
def to_hebrew_x(word)
-
Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew, but without the pointing. Vowel pointing and accent pointing will be applied in the order given by the input word.
Example
Transcription.to_hebrew_x('HAC.@MA73JIm')
Output
הַשָּׁמַ֖יִם
Expand source code Browse git
def to_hebrew_x(word): """ Given a transliteration of a fully pointed word, produce the word in Unicode Hebrew, but without the pointing. Vowel pointing and accent pointing will be applied in the order given by the input word. Example: Transcription.to_hebrew_x('HAC.@MA73JIm') Output: הַשָּׁמַ֖יִם """ return Transcription.trans_hebrew_pat.sub(Transcription._map_hebrew, word)
def to_syriac(self, word)
-
Given a word in ETCBC transliteration, produce the word in Unicode Syriac.
Example
tr.to_syriac('MKSJN')
Output
ܡܟܣܝܢ
Expand source code Browse git
def to_syriac(self, word): """ Given a word in ETCBC transliteration, produce the word in Unicode Syriac. Example: tr.to_syriac('MKSJN') Output: ܡܟܣܝܢ """ return Transcription.trans_syriac_pat.sub(Transcription._map_syriac, word)