33import xml.etree.ElementTree
as ET
35sys.stdout = codecs.getwriter(
'utf8')(sys.stdout.buffer)
41PLAIN_LETTER_RANGES = ((ord(
'a'), ord(
'z')),
54COMBINING_MARK_RANGES = ((0x0300, 0x0362),
63 if (
' ' in letter)
or (
'"' in letter):
64 letter =
'"' + letter.replace(
'"',
'""') +
'"'
65 output = chr(codepoint) +
"\t" + letter
67 output = chr(codepoint)
73 def __init__(self, id, general_category, combining_ids):
80 """Return true if this is a combining mark to remove."""
84 for begin, end
in COMBINING_MARK_RANGES:
85 if codepoint.id >= begin
and codepoint.id <= end:
91 """Return true if codepoint represents a "plain letter"."""
92 for begin, end
in PLAIN_LETTER_RANGES:
93 if codepoint.id >= begin
and codepoint.id <= end:
99 """Returns true for diacritical marks (combining codepoints)."""
100 return codepoint.general_category
in (
"Mn",
"Me",
"Mc")
104 """Returns true for letters combined with one or more marks."""
110 if len(codepoint.combining_ids) == 1
and codepoint.combining_ids[0]
in table:
114 if any(
is_mark(table[i])
for i
in codepoint.combining_ids[1:])
is False:
118 codepoint_base = codepoint.combining_ids[0]
127 """Return true for letter with or without diacritical marks."""
132 """Return the base codepoint without marks. If this codepoint has more
133 than one combining character, do a recursive lookup on the table to
134 find out its plain base letter."""
136 if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
139 return table[codepoint.combining_ids[0]]
142 assert False,
'Codepoint U+%0.2X' % codepoint.id
147 assert False,
'Codepoint U+%0.2X' % codepoint.id
151 """Return true for letters combined with letters."""
152 return all(i
in table
and is_letter(table[i], table)
for i
in codepoint.combining_ids)
155 """Return a list of plain letters from a ligature."""
157 return [
get_plain_letter(table[id], table)
for id
in codepoint.combining_ids]
161 """Parse the XML file and return a set of tuples (src, trg), where "src"
162 is the original character
and "trg" the substitute.
"""
163 charactersSet = set()
166 rulePattern = re.compile(
r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
169 transliterationTree = ET.parse(latinAsciiFilePath)
170 transliterationTreeRoot = transliterationTree.getroot()
175 blockRules = transliterationTreeRoot.findall(
"./transforms/transform/tRule")
179 rules = blockRules[0].text.splitlines()
183 matches = rulePattern.search(rule)
193 if matches
is not None:
194 src = matches.group(1)
if matches.group(1)
is not None else bytes(matches.group(2),
'UTF-8').decode(
'unicode-escape')
195 trg = matches.group(3)
if matches.group(3)
is not None else matches.group(4)
198 trg = trg.replace(
"\\'",
"'").replace(
'\\"',
'"')
202 if not src.isspace()
and not trg.isspace():
208 charactersSet.add((ord(src), trg))
214 """Returns the special cases which are not handled by other methods"""
215 charactersSet = set()
218 charactersSet.add((0x0401,
"\u0415"))
219 charactersSet.add((0x0451,
"\u0435"))
222 charactersSet.add((0x2103,
"\xb0C"))
223 charactersSet.add((0x2109,
"\xb0F"))
230 decomposition_type_pattern = re.compile(
" *<[^>]*> *")
236 charactersSet = set()
240 args.unicodeDataFilePath, mode=
'r', encoding=
'UTF-8',
241 )
as unicodeDataFile:
243 for line
in unicodeDataFile:
244 fields = line.split(
";")
247 general_category = fields[2]
248 decomposition = fields[5]
249 decomposition = re.sub(decomposition_type_pattern,
' ', decomposition)
250 id = int(fields[0], 16)
251 combining_ids = [int(s, 16)
for s
in decomposition.split(
" ")
if s !=
""]
252 codepoint =
Codepoint(id, general_category, combining_ids)
253 table[id] = codepoint
254 all.append(codepoint)
257 for codepoint
in all:
258 if codepoint.general_category.startswith(
'L')
and \
259 len(codepoint.combining_ids) > 0:
261 charactersSet.add((codepoint.id,
263 elif args.noLigaturesExpansion
is False and is_ligature(codepoint, table):
264 charactersSet.add((codepoint.id,
265 "".join(chr(combining_codepoint.id)
266 for combining_codepoint
269 charactersSet.add((codepoint.id,
None))
272 if not args.noLigaturesExpansion:
277 charactersList = sorted(charactersSet, key=
lambda characterPair: characterPair[0])
279 for characterPair
in charactersList:
283if __name__ ==
"__main__":
284 parser = argparse.ArgumentParser(description=
'This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
285 parser.add_argument(
"--unicode-data-file", help=
"Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=
True, dest=
'unicodeDataFilePath')
286 parser.add_argument(
"--latin-ascii-file", help=
"Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest=
'latinAsciiFilePath')
287 parser.add_argument(
"--no-ligatures-expansion", help=
"Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action=
"store_true", dest=
'noLigaturesExpansion')
288 args = parser.parse_args()
290 if args.noLigaturesExpansion
is False and args.latinAsciiFilePath
is None:
291 sys.stderr.write(
'You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
void print(const void *obj)
def __init__(self, id, general_category, combining_ids)
def get_plain_letters(codepoint, table)
def print_record(codepoint, letter)
def is_plain_letter(codepoint)
def get_plain_letter(codepoint, table)
def is_letter_with_marks(codepoint, table)
def is_ligature(codepoint, table)
def is_letter(codepoint, table)
def is_mark_to_remove(codepoint)
def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath)