31 from __future__
import print_function
32 from __future__
import unicode_literals
39 import xml.etree.ElementTree
as ET
42 if sys.version_info[0] <= 2:
44 sys.stdout = codecs.getwriter(
'utf8')(sys.stdout)
50 def bytes(source, encoding='ascii', errors='strict'):
51 return source.encode(encoding=encoding, errors=errors)
54 sys.stdout = codecs.getwriter(
'utf8')(sys.stdout.buffer)
60 PLAIN_LETTER_RANGES = ((ord(
'a'), ord(
'z')),
73 COMBINING_MARK_RANGES = ((0x0300, 0x0362),
79 output = chr(codepoint) +
"\t" + letter
81 output = chr(codepoint)
86 def __init__(self, id, general_category, combining_ids):
92 """Return true if this is a combining mark to remove.""" 96 for begin, end
in COMBINING_MARK_RANGES:
97 if codepoint.id >= begin
and codepoint.id <= end:
102 """Return true if codepoint represents a "plain letter".""" 103 for begin, end
in PLAIN_LETTER_RANGES:
104 if codepoint.id >= begin
and codepoint.id <= end:
109 """Returns true for diacritical marks (combining codepoints).""" 110 return codepoint.general_category
in (
"Mn",
"Me",
"Mc")
113 """Returns true for letters combined with one or more marks.""" 118 if len(codepoint.combining_ids) == 1:
122 if any(
is_mark(table[i])
for i
in codepoint.combining_ids[1:])
is False:
126 codepoint_base = codepoint.combining_ids[0]
134 """Return true for letter with or without diacritical marks.""" 138 """Return the base codepoint without marks. If this codepoint has more 139 than one combining character, do a recursive lookup on the table to 140 find out its plain base letter.""" 142 if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
145 return table[codepoint.combining_ids[0]]
156 """Return true for letters combined with letters.""" 157 return all(
is_letter(table[i], table)
for i
in codepoint.combining_ids)
160 """Return a list of plain letters from a ligature.""" 162 return [
get_plain_letter(table[id], table)
for id
in codepoint.combining_ids]
165 """Parse the XML file and return a set of tuples (src, trg), where "src" 166 is the original character and "trg" the substitute.""" 167 charactersSet = set()
170 rulePattern = re.compile(
r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
173 transliterationTree = ET.parse(latinAsciiFilePath)
174 transliterationTreeRoot = transliterationTree.getroot()
179 blockRules = transliterationTreeRoot.findall(
"./transforms/transform/tRule")
180 assert(len(blockRules) == 1)
183 rules = blockRules[0].text.splitlines()
187 matches = rulePattern.search(rule)
197 if matches
is not None:
198 src = matches.group(1)
if matches.group(1)
is not None else bytes(matches.group(2),
'UTF-8').decode(
'unicode-escape')
199 trg = matches.group(3)
if matches.group(3)
is not None else matches.group(4)
202 trg = trg.replace(
"\\'",
"'").replace(
'\\"',
'"')
206 if not src.isspace()
and not trg.isspace():
207 charactersSet.add((ord(src), trg))
212 """Returns the special cases which are not handled by other methods""" 213 charactersSet = set()
216 charactersSet.add((0x0401,
u"\u0415"))
217 charactersSet.add((0x0451,
u"\u0435"))
220 charactersSet.add((0x2103,
u"\xb0C"))
221 charactersSet.add((0x2109,
u"\xb0F"))
222 charactersSet.add((0x2117,
"(P)"))
228 decomposition_type_pattern = re.compile(
" *<[^>]*> *")
234 charactersSet = set()
238 args.unicodeDataFilePath, mode=
'r', encoding='UTF-8',
239 )
as unicodeDataFile:
241 for line
in unicodeDataFile:
242 fields = line.split(
";")
245 general_category = fields[2]
246 decomposition = fields[5]
247 decomposition = re.sub(decomposition_type_pattern,
' ', decomposition)
248 id = int(fields[0], 16)
249 combining_ids = [int(s, 16)
for s
in decomposition.split(
" ")
if s !=
""]
250 codepoint =
Codepoint(id, general_category, combining_ids)
251 table[id] = codepoint
252 all.append(codepoint)
255 for codepoint
in all:
256 if codepoint.general_category.startswith(
'L')
and \
257 len(codepoint.combining_ids) > 1:
259 charactersSet.add((codepoint.id,
261 elif args.noLigaturesExpansion
is False and is_ligature(codepoint, table):
262 charactersSet.add((codepoint.id,
263 "".join(chr(combining_codepoint.id)
264 for combining_codepoint \
267 charactersSet.add((codepoint.id,
None))
270 if not args.noLigaturesExpansion:
275 charactersList = sorted(charactersSet, key=
lambda characterPair: characterPair[0])
277 for characterPair
in charactersList:
280 if __name__ ==
"__main__":
281 parser = argparse.ArgumentParser(description=
'This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
282 parser.add_argument(
"--unicode-data-file", help=
"Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=
True, dest=
'unicodeDataFilePath')
283 parser.add_argument(
"--latin-ascii-file", help=
"Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest=
'latinAsciiFilePath')
284 parser.add_argument(
"--no-ligatures-expansion", help=
"Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action=
"store_true", dest=
'noLigaturesExpansion')
285 args = parser.parse_args()
287 if args.noLigaturesExpansion
is False and args.latinAsciiFilePath
is None:
288 sys.stderr.write(
'You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
void print(const void *obj)
def bytes(source, encoding='ascii', errors='strict')
def __init__(self, id, general_category, combining_ids)
def is_ligature(codepoint, table)
def get_plain_letters(codepoint, table)
def is_plain_letter(codepoint)
def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath)
def print_record(codepoint, letter)
def get_plain_letter(codepoint, table)
def is_letter_with_marks(codepoint, table)
def is_letter(codepoint, table)
def is_mark_to_remove(codepoint)